![ds4a_colombia.svg](attachment:ds4a_colombia.svg)

# Impacto de la deforestación en el régimen de caudales de los ríos en Colombia (TEAM 28)

## Multivariate time series forecasting

Sources :

https://towardsdatascience.com/vector-autoregressions-vector-error-correction-multivariate-model-a69daf6ab618

https://towardsdatascience.com/pairs-trading-with-cryptocurrencies-e79b4a00b015

### Libraries

In [None]:
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import pandas as pd
import sklearn.metrics as skm

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Open a figure with an explicit size (15x12 in), 80 dpi, white face and
# black edge colour.
figure(num = None, figsize = (15, 12), dpi = 80, facecolor = 'w', edgecolor = 'k')
# Notebook-wide matplotlib defaults: font size, default figure size, and the
# number of simultaneously open figures tolerated before matplotlib warns.
plt.rcParams.update({'font.size': 16, 'figure.figsize': (15, 10), 
                     'figure.max_open_warning': 200})

In [None]:
def table2lags(table, max_lag, min_lag=0, separator='_'):
    """Return a frame holding lagged copies of every column of ``table``.

    For each lag ``k`` in ``min_lag..max_lag`` (inclusive) the whole table is
    shifted by ``k`` rows and its columns are renamed to
    ``<column><separator><k>``. The shifted tables are then concatenated side
    by side, lag by lag. Positions that have no source row for a given lag
    hold NaN.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data; column labels may be of any type (they are formatted as
        text when the suffix is appended).
    max_lag : int
        Largest lag to generate (inclusive).
    min_lag : int, default 0
        Smallest lag to generate (inclusive).
    separator : str, default '_'
        Text placed between the original column label and the lag number.

    Returns
    -------
    pandas.DataFrame
        Same index as ``table`` with
        ``len(table.columns) * (max_lag - min_lag + 1)`` columns.

    Raises
    ------
    ValueError
        If ``max_lag`` is smaller than ``min_lag`` (no lag to generate).
    """
    if max_lag < min_lag:
        raise ValueError(
            'max_lag (%s) must be >= min_lag (%s)' % (max_lag, min_lag))

    # ``shift`` already returns a new object, so no extra copy is needed, and
    # ``add_suffix`` formats each label as text, so non-string column labels
    # (e.g. integers) work as well.
    lagged = [
        table.shift(lag).add_suffix('{}{}'.format(separator, lag))
        for lag in range(min_lag, max_lag + 1)
    ]
    return pd.concat(lagged, axis=1)

In [None]:
# Debug aid: list every matplotlib rcParams key available for configuration.
print(plt.rcParams.keys())

### Read Data

In [None]:
# Load the consolidated matrix (zipped CSV). Later cells read the columns
# 'mc', 'v_flow_mean', 'v_loss_cover' and 'v_rainfall_total' from it.
macrodata = pd.read_csv('../data/matrix/matrix_consol_v2.zip')

# Preview the first rows.
macrodata.head(10)

In [None]:
# Load the stationarity table, replace missing entries with 0 and cast every
# column to integer.
# NOTE(review): presumably one 0/1 flag per basin and variable - confirm
# against the file that produces mc_stationary.csv.
is_stationary = (
    pd.read_csv('../model/mc_stationary.csv')
    .fillna(value=0)
    .astype('int')
)

is_stationary.head(10)

In [None]:
# Sorted array of the distinct basin identifiers ('mc') present in the data.
mcs = np.sort(macrodata['mc'].unique())

print(mcs)

In [None]:
# Month-start calendar covering 2018-01 .. 2019-12 (24 rows).
# The 'MS' (month start) frequency yields these dates directly. The previous
# form built month-end dates with the 'M' alias (deprecated in recent pandas)
# and shifted them back with MonthBegin, which produced the same values.
dates = pd.DataFrame(
    {'date': pd.date_range('2018-01-01', '2019-12-31', freq='MS')})

# Calendar components of each date.
dates['year'] = dates['date'].dt.year
dates['month'] = dates['date'].dt.month

dates.head()

### Process for all Macro basins (Version 1 : Without hyperparameter tuning)

In [None]:
nobs = 24  # the last 24 rows (months) of each basin form the test set

# Per-basin results are collected in lists and concatenated once after the
# loop instead of re-concatenating a growing frame on every iteration.
metrics_frames = []
forecast_frames = []

for i in mcs:

    print('\n==================================================================')
    print('MC = %s' % i)
    print('====================================================================\n')

    # Rows of the current basin, with forest-cover loss scaled by 10,000.
    temp_df = macrodata[macrodata['mc'] == i].copy().reset_index(drop=True)
    temp_df['v_loss_cover_10k'] = temp_df['v_loss_cover'] * 10000

    # Features: lags 0, 1 and 2 of loss cover and rainfall. Missing values
    # (including the rows that have no history for a lag) are set to 0.
    X = table2lags(temp_df[['v_loss_cover_10k', 'v_rainfall_total']], 2)
    X = X.fillna(0.00)

    # Positional split: all rows but the last `nobs` train the model.
    # NOTE(review): assumes the rows of each basin are in chronological
    # order in the input file - confirm.
    X_train = X.iloc[:-nobs].reset_index(drop=True)
    y_train = temp_df['v_flow_mean'].iloc[:-nobs].reset_index(drop=True)

    X_test = X.iloc[-nobs:].reset_index(drop=True)
    X_test_org = temp_df.iloc[-nobs:].reset_index(drop=True)
    y_test = X_test_org['v_flow_mean']

    # `criterion` is left at its default (squared error). The explicit
    # 'mse' spelling was removed from scikit-learn, while the default is
    # the same criterion on old and new versions.
    rfR = RandomForestRegressor(n_estimators=200, max_depth=100,
                                random_state=42, verbose=0,
                                n_jobs=5).fit(X_train, y_train)

    y_pred = rfR.predict(X_test)

    # Error metrics on the held-out rows (bias = mean of actual - predicted).
    forecast_errors = y_test.to_numpy() - y_pred
    bias = forecast_errors.mean()
    mae = skm.mean_absolute_error(y_test, y_pred)
    mse = skm.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Output table: identifiers, prediction ('v_flow_mean_mean'), observed
    # flow and the raw explanatory variables. The explicit column selection
    # already discards every helper column.
    df_forecast = X_test_org.assign(v_flow_mean_mean=y_pred)
    df_forecast = df_forecast[['date', 'year', 'month', 'mc', 'v_flow_mean_mean',
                               'v_flow_mean', 'v_loss_cover', 'v_rainfall_total']]

    print('\n== y predict =======================================================')
    print(df_forecast.head())
    print('=====================================================================\n')

    metrics = pd.DataFrame([[i, bias, mae, mse, rmse]],
                           columns=['mc', 'Bias', 'MAE', 'MSE', 'RMSE'])

    print('\n== Metrics =======================================================')
    print(metrics.head())
    print('==================================================================\n')

    metrics_frames.append(metrics)
    forecast_frames.append(df_forecast)

# Same stacked layout as before; empty frames when there is no basin at all.
RFR_metrics = pd.concat(metrics_frames, axis=0) if metrics_frames else pd.DataFrame()
RFR_prediction = pd.concat(forecast_frames, axis=0) if forecast_frames else pd.DataFrame()

In [None]:
# Persist the per-basin error metrics of Version 1 (row index omitted) and
# preview them.
RFR_metrics.to_csv('../model/RFR_results_v1.csv', index = False)
RFR_metrics.head()

In [None]:
# Replace non-positive predicted flows with 0.01 (NaN values are left as
# they are), then persist the Version 1 predictions without the row index.
predicted_flow = RFR_prediction['v_flow_mean_mean']
RFR_prediction['v_flow_mean_mean'] = predicted_flow.mask(predicted_flow <= 0, 0.01)
RFR_prediction.to_csv('../model/RFR_predictions_v1.csv', index=False)

RFR_prediction.head()

---------------------------

### Process for all Macro basins (Version 1 : Without hyperparameter tuning - Prediction 2020-2021)

#### Read scenarios for independent variables

In [None]:
# Scenario table for the explanatory variables. The flow column is created
# as a zero-filled placeholder, and only the columns used later are kept,
# in a fixed order.
scenario_columns = ['date', 'mc', 'v_flow_mean', 'v_loss_cover',
                    'v_rainfall_total', 'scenario']
raw_scenarios = pd.read_excel('../data/matrix/Esc_Predicciones_longitudinal.xlsx')
scenarios = raw_scenarios.assign(v_flow_mean=0)[scenario_columns]

scenarios.head(10)

In [None]:
# Month-start calendar covering 2020-01 .. 2021-12 (24 rows).
# The 'MS' (month start) frequency yields these dates directly. The previous
# form built month-end dates with the 'M' alias (deprecated in recent pandas)
# and shifted them back with MonthBegin, which produced the same values.
dates = pd.DataFrame(
    {'date': pd.date_range('2020-01-01', '2021-12-31', freq='MS')})

# Calendar components of each date.
dates['year'] = dates['date'].dt.year
dates['month'] = dates['date'].dt.month

dates.head()

In [None]:
escen = scenarios['scenario'].unique()

nobs = 24  # 24 months; only used by the optional random-rainfall test below

feature_cols = ['v_loss_cover_10k', 'v_rainfall_total']

# The training data depends only on the basin, not on the scenario, so each
# basin's model is fitted once here (on the full history) and reused for
# every scenario. With the fixed random_state the fitted model is the same
# one the per-scenario refit used to produce.
models = {}
for i in mcs:
    temp_df = macrodata[macrodata['mc'] == i].copy().reset_index(drop=True)
    temp_df['v_loss_cover_10k'] = temp_df['v_loss_cover'] * 10000

    # Features: lags 0, 1 and 2 of loss cover and rainfall; gaps set to 0.
    X_train = table2lags(temp_df[feature_cols], 2).fillna(0.00)
    X_train = X_train.reset_index(drop=True)
    y_train = temp_df['v_flow_mean'].reset_index(drop=True)

    # `criterion` is left at its default (squared error). The explicit
    # 'mse' spelling was removed from scikit-learn, while the default is
    # the same criterion on old and new versions.
    models[i] = RandomForestRegressor(n_estimators=200, max_depth=100,
                                      random_state=42, verbose=0,
                                      n_jobs=5).fit(X_train, y_train)

# Forecast frames are collected and concatenated once after the loops.
forecast_frames = []

for j in escen:

    print('Escenario :', j, '\n')

    data_test_predict = scenarios[scenarios['scenario'] == j]

    for i in mcs:

        print('\n==================================================================')
        print('MC = %s' % i)
        print('====================================================================\n')

        # Scenario rows of the current basin, with the same scaling as the
        # training data.
        X_test_org = data_test_predict[data_test_predict['mc'] == i].reset_index(drop=True)
        X_test_org['v_loss_cover_10k'] = X_test_org['v_loss_cover'] * 10000

        # =======================================================================
        # Optional test: generate v_rainfall_total at random
        # =======================================================================
#         X_test_org['v_rainfall_total'] = np.around(
#             np.random.uniform(X_test_org['v_rainfall_total'].min(),
#                               X_test_org['v_rainfall_total'].max(), size = nobs),4)
#         print(X_test_org['v_rainfall_total'].min(), X_test_org['v_rainfall_total'].max())
        # =======================================================================

        # NOTE(review): the lagged features of the first two scenario rows
        # are filled with 0 rather than with the last observed values of the
        # history - confirm this is intended.
        X_test = table2lags(X_test_org[feature_cols], 2).fillna(0.00)

        # 'v_flow_mean' is only a zero placeholder in the scenario table, so
        # no score is computed against it. It is dropped together with the
        # scaled helper column, and the forecast is appended as last column.
        df_forecast = X_test_org.drop(columns=['v_flow_mean', 'v_loss_cover_10k'])
        df_forecast['v_flow_mean_forecast'] = models[i].predict(X_test)

        forecast_frames.append(df_forecast)

# Same stacked layout as before; an empty frame when nothing was forecast.
RFR_prediction = pd.concat(forecast_frames, axis=0) if forecast_frames else pd.DataFrame()

RFR_prediction.head()

In [None]:
# Temporary export using random values in v_rainfall_total (disabled)

# RFR_prediction['v_flow_mean_forecast'] = RFR_prediction['v_flow_mean_forecast'].apply(lambda x: 
#                                                                               0.01 if x <= 0 
#                                                                               else x)
# RFR_prediction.to_csv('../model/RFR_forecast_2020_2021_temp.csv', index = False)

In [None]:
# The clamping of non-positive forecasts to 0.01 is currently disabled, so
# the forecasts are written exactly as predicted.
# RFR_prediction['v_flow_mean_forecast'] = RFR_prediction['v_flow_mean_forecast'].apply(lambda x: 
#                                                                               0.01 if x <= 0 
#                                                                               else x)
# Persist the scenario forecasts without the row index.
RFR_prediction.to_csv('../model/RFR_forecast_2020_2021.csv', index = False)

---------------------------

### Process for all Macro basins (Version 2 : Without hyperparameter tuning - with stationary treatment)

In [None]:
nobs = 24  # the last 24 rows (months) of each basin form the test set

# Offset added before taking the logarithm of the flow so that zero flows
# stay finite; it is subtracted again when predictions are mapped back.
LOG_OFFSET = 0.001

# Per-basin results are collected in lists and concatenated once after the
# loop instead of re-concatenating a growing frame on every iteration.
metrics_frames = []
forecast_frames = []

for i in mcs:

    print('\n==================================================================')
    print('MC = %s' % i)
    print('====================================================================\n')

    # Rows of the current basin, with forest-cover loss scaled by 10,000.
    temp_df = macrodata[macrodata['mc'] == i].copy().reset_index(drop=True)
    temp_df['v_loss_cover_10k'] = temp_df['v_loss_cover'] * 10000

    # Features: lags 0, 1 and 2 of loss cover and rainfall. Missing values
    # (including the rows that have no history for a lag) are set to 0.
    X = table2lags(temp_df[['v_loss_cover_10k', 'v_rainfall_total']], 2)
    X = X.fillna(0.00)

    # Positional split: all rows but the last `nobs` train the model, and
    # the model is trained on the log-transformed flow.
    # NOTE(review): assumes the rows of each basin are in chronological
    # order in the input file - confirm.
    X_train = X.iloc[:-nobs].reset_index(drop=True)
    y_train = temp_df['v_flow_mean'].iloc[:-nobs].reset_index(drop=True)
    y_train = np.log(y_train + LOG_OFFSET)

    X_test = X.iloc[-nobs:].reset_index(drop=True)
    X_test_org = temp_df.iloc[-nobs:].reset_index(drop=True)
    # Observed flow is kept on its original scale; sending it through
    # log/exp would only shift it by the offset.
    y_test = X_test_org['v_flow_mean']

    # `criterion` is left at its default (squared error). The explicit
    # 'mse' spelling was removed from scikit-learn, while the default is
    # the same criterion on old and new versions.
    rfR = RandomForestRegressor(n_estimators=200, max_depth=100,
                                random_state=42, verbose=0,
                                n_jobs=5).fit(X_train, y_train)

    # Exact inverse of the training transform: exp, then remove the offset.
    y_pred = np.exp(rfR.predict(X_test)) - LOG_OFFSET

    # Error metrics on the original scale (bias = mean of actual - predicted).
    forecast_errors = y_test.to_numpy() - y_pred
    bias = forecast_errors.mean()
    mae = skm.mean_absolute_error(y_test, y_pred)
    mse = skm.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Output table: identifiers, prediction ('v_flow_mean_mean'), observed
    # flow and the raw explanatory variables. The explicit column selection
    # already discards every helper column.
    df_forecast = X_test_org.assign(v_flow_mean_mean=y_pred)
    df_forecast = df_forecast[['date', 'year', 'month', 'mc', 'v_flow_mean_mean',
                               'v_flow_mean', 'v_loss_cover', 'v_rainfall_total']]

    print('\n== y predict =======================================================')
    print(df_forecast.head())
    print('=====================================================================\n')

    metrics = pd.DataFrame([[i, bias, mae, mse, rmse]],
                           columns=['mc', 'Bias', 'MAE', 'MSE', 'RMSE'])

    print('\n== Metrics =======================================================')
    print(metrics.head())
    print('==================================================================\n')

    metrics_frames.append(metrics)
    forecast_frames.append(df_forecast)

# Same stacked layout as before; empty frames when there is no basin at all.
RFR_metrics = pd.concat(metrics_frames, axis=0) if metrics_frames else pd.DataFrame()
RFR_prediction = pd.concat(forecast_frames, axis=0) if forecast_frames else pd.DataFrame()

In [None]:
# Persist the per-basin error metrics of Version 2 (row index omitted) and
# preview them.
RFR_metrics.to_csv('../model/RFR_results_v2.csv', index = False)
RFR_metrics.head()

In [None]:
# Replace non-positive predicted flows with 0.01 (NaN values are left as
# they are), then persist the Version 2 predictions without the row index.
predicted_flow = RFR_prediction['v_flow_mean_mean']
RFR_prediction['v_flow_mean_mean'] = predicted_flow.mask(predicted_flow <= 0, 0.01)
RFR_prediction.to_csv('../model/RFR_predictions_v2.csv', index=False)

RFR_prediction.head()

---------------------------