![ds4a_colombia.svg](attachment:ds4a_colombia.svg)

# Impacto de la deforestación en el régimen de caudales de los ríos en Colombia (TEAM 28)

## Multivariate time series forecasting

Sources:

https://towardsdatascience.com/vector-autoregressions-vector-error-correction-multivariate-model-a69daf6ab618

https://towardsdatascience.com/pairs-trading-with-cryptocurrencies-e79b4a00b015

### Libraries

In [1]:
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import pandas as pd
import sklearn.metrics as skm

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Open an (initially empty) 15x12 inch figure at 80 dpi with a white face and black edge.
figure(num = None, figsize = (15, 12), dpi = 80, facecolor = 'w', edgecolor = 'k')

# Notebook-wide matplotlib defaults: larger font, default figure size, and a higher
# threshold before matplotlib warns about too many open figures.
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = (15, 10)
plt.rcParams['figure.max_open_warning'] = 200

<Figure size 1200x960 with 0 Axes>

In [2]:
def table2lags(table, max_lag, min_lag=0, separator='_'):
    """Return a dataframe holding lagged copies of every column of ``table``.

    For each lag ``k`` in ``min_lag..max_lag`` (inclusive) the whole table is
    shifted down by ``k`` rows and its columns are renamed to
    ``<column><separator><k>``. The shifted tables are concatenated side by side,
    ordered by lag. Rows without enough history contain NaN.

    Parameters
    ----------
    table : pandas.DataFrame
        Source data; its index is preserved.
    max_lag : int
        Largest lag to generate (inclusive).
    min_lag : int, default 0
        Smallest lag to generate (inclusive).
    separator : str, default '_'
        Text placed between the column name and the lag number.

    Returns
    -------
    pandas.DataFrame
        One column per (lag, original column) pair.

    Raises
    ------
    ValueError
        If ``min_lag`` is greater than ``max_lag`` (no lag to generate).
    """
    if min_lag > max_lag:
        raise ValueError(
            'min_lag (%s) must not be greater than max_lag (%s)' % (min_lag, max_lag))

    lagged = []
    for lag in range(min_lag, max_lag + 1):
        # shift() already returns a new object, so no extra copy is needed.
        shifted = table.shift(lag)
        # str(c) keeps non-string column labels (e.g. integers) from raising TypeError.
        shifted.columns = [str(c) + separator + str(lag) for c in table.columns]
        lagged.append(shifted)
    return pd.concat(lagged, axis=1)

In [3]:
# Diagnostic only: print the keys view of matplotlib's rcParams to see which
# settings can be configured. The output is very long.
print(plt.rcParams.keys())

KeysView(RcParams({'_internal.classic_mode': False,
          'agg.path.chunksize': 0,
          'animation.avconv_args': [],
          'animation.avconv_path': 'avconv',
          'animation.bitrate': -1,
          'animation.codec': 'h264',
          'animation.convert_args': [],
          'animation.convert_path': 'convert',
          'animation.embed_limit': 20.0,
          'animation.ffmpeg_args': [],
          'animation.ffmpeg_path': 'ffmpeg',
          'animation.frame_format': 'png',
          'animation.html': 'none',
          'animation.html_args': [],
          'animation.writer': 'ffmpeg',
          'axes.autolimit_mode': 'data',
          'axes.axisbelow': 'line',
          'axes.edgecolor': 'black',
          'axes.facecolor': 'white',
          'axes.formatter.limits': [-5, 6],
          'axes.formatter.min_exponent': 0,
          'axes.formatter.offset_threshold': 4,
          'axes.formatter.use_locale': False,
          'axes.formatter.use_mathtext': False,
        

### Read Data

In [4]:
# Load the consolidated matrix from the zipped CSV. The preview below shows columns
# such as date, year, month, mc, v_flow_mean, v_loss_cover and v_rainfall_total.
macrodata = pd.read_csv('../data/matrix/matrix_consol_v2.zip')

# Preview the first 10 rows.
macrodata.head(10)

Unnamed: 0,date,year,month,mc,v_flow_mean,v_loss_cover,v_rainfall_total,v_temperature_mean
0,2000-01,2000,1,7,230.4,0.0,334.0,
1,2000-02,2000,2,7,272.4,0.000133,400.0,
2,2000-03,2000,3,7,321.6,0.000265,319.0,
3,2000-04,2000,4,7,310.8,0.000398,248.0,
4,2000-05,2000,5,7,410.0,0.000531,302.0,
5,2000-06,2000,6,7,295.9,0.000663,81.0,
6,2000-07,2000,7,7,244.2,0.000796,96.0,
7,2000-08,2000,8,7,255.0,0.000928,64.0,
8,2000-09,2000,9,7,233.8,0.001061,262.0,
9,2000-10,2000,10,7,216.0,0.001194,141.0,


In [5]:
# Read the per-basin flag table (presumably stationarity test results - confirm),
# replace missing entries with 0 and cast every column to integer.
is_stationary = (
    pd.read_csv('../model/mc_stationary.csv')
    .fillna(value = 0)
    .astype('int')
)

is_stationary.head(10)

Unnamed: 0,mbasin,v_flow_mean,v_loss_cover,v_rainfall_total,v_temperature_mean
0,1,0,0,1,1
1,2,1,0,1,1
2,3,1,0,1,0
3,4,0,0,1,0
4,5,1,0,1,0
5,6,1,0,1,0
6,7,1,0,0,0
7,8,1,0,1,0
8,9,1,0,1,0
9,10,1,0,1,1


In [6]:
# Distinct values of the `mc` column, in ascending order.
mcs = np.sort(macrodata['mc'].unique())

print(mcs)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]


In [7]:
# Calendar of the 24 month-start dates from 2018-01-01 to 2019-12-01.
# 'MS' (month start) produces these dates directly. The previous approach built
# month-end dates with the 'M' alias, which recent pandas versions deprecate, and
# then subtracted a MonthBegin offset to get back to the first day of the month.
dates = pd.DataFrame({'date': pd.date_range('2018-01-01', '2019-12-31', freq = 'MS')})

# Calendar components, handy for joining against the year/month columns of the data.
dates['year'] = dates['date'].dt.year
dates['month'] = dates['date'].dt.month

dates.head()

Unnamed: 0,date,year,month
0,2018-01-01,2018,1
1,2018-02-01,2018,2
2,2018-03-01,2018,3
3,2018-04-01,2018,4
4,2018-05-01,2018,5


### Process for all Macro basins (Version 1 : Without hyperparameter tuning)

In [8]:
nobs = 24  # number of trailing months held out as the test set

# Reset the result holders up front so stale results from a previous run can never
# be saved by the following cells if this loop is interrupted.
RFR_metrics = pd.DataFrame()
RFR_prediction = pd.DataFrame()

# Per-basin results are collected in lists and concatenated once after the loop
# (calling pd.concat inside the loop re-copies everything on every iteration).
metrics_frames = []
forecast_frames = []

for i in mcs:

    print('\n==================================================================')
    print('MC = %s' % i)
    print('====================================================================\n')

    temp_df = macrodata[macrodata['mc'] == i].copy().reset_index(drop = True)
    temp_df['v_loss_cover_10k'] = temp_df['v_loss_cover'] * 10000

    # Features: lags 0, 1 and 2 of cover loss and rainfall. Missing values, including
    # the first rows that have no lagged history, are filled with 0.
    X = table2lags(temp_df[['v_loss_cover_10k', 'v_rainfall_total']], 2).fillna(0.00)

    # Split by row position: the last `nobs` rows form the test set.
    X_train = X.iloc[:-nobs].reset_index(drop = True)
    y_train = temp_df['v_flow_mean'].iloc[:-nobs].reset_index(drop = True)

    X_test = X.iloc[-nobs:].reset_index(drop = True)
    y_test = temp_df['v_flow_mean'].iloc[-nobs:].reset_index(drop = True)

    # `criterion` is left at its default (squared error). The explicit 'mse' spelling
    # was renamed to 'squared_error' and is rejected by recent scikit-learn releases,
    # so omitting it keeps the same behaviour on old and new versions.
    rfR = RandomForestRegressor(n_estimators = 200, max_depth = 100,
                                random_state = 42, verbose = 0,
                                n_jobs = 5).fit(X_train, y_train)

    # Predicted flow; the column keeps its historical name 'v_flow_mean_mean'.
    y_pred = pd.Series(rfR.predict(X_test), name = 'v_flow_mean_mean')

    # Bias is the mean signed error (actual - predicted) over the test window.
    bias = (y_test - y_pred).sum() * 1.0 / nobs
    mae = skm.mean_absolute_error(y_test, y_pred)
    mse = skm.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Test-window rows with the prediction next to the observed flow.
    df_forecast = (
        temp_df.iloc[-nobs:]
        .reset_index(drop = True)
        .assign(v_flow_mean_mean = y_pred)
    )
    df_forecast = df_forecast[['date', 'year', 'month', 'mc', 'v_flow_mean_mean',
                               'v_flow_mean', 'v_loss_cover', 'v_rainfall_total']]

    print('\n== y predict =======================================================')
    print(df_forecast.head())
    print('=====================================================================\n')

    metrics = pd.DataFrame([[i, bias, mae, mse, rmse]],
                           columns = ['mc', 'Bias', 'MAE', 'MSE', 'RMSE'])

    print('\n== Metrics =======================================================')
    print(metrics.head())
    print('==================================================================\n')

    metrics_frames.append(metrics)
    forecast_frames.append(df_forecast)

# Single concatenation; pd.concat raises on an empty list, so keep the empty frames then.
if metrics_frames:
    RFR_metrics = pd.concat(metrics_frames, axis = 0)
    RFR_prediction = pd.concat(forecast_frames, axis = 0)


MC = 1


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1   1           3.32735         1.60      0.072060   
1  2018-02  2018      2   1           3.47285         1.57      0.072256   
2  2018-03  2018      3   1           3.32255         2.10      0.072453   
3  2018-04  2018      4   1           4.24710         2.99      0.072649   
4  2018-05  2018      5   1           3.74035         3.40      0.072846   

   v_rainfall_total  
0            222.87  
1            143.49  
2            274.72  
3             65.35  
4            214.84  


   mc      Bias       MAE       MSE      RMSE
0   1 -0.486402  0.852827  1.105587  1.051469


MC = 2


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1   2           27.8416        33.51      0.050399   
1  2018-02  2018      2   2           22.5616        20.82      0.050824   
2  2018-03  2018      3   2           27.2751        12.38      0.05


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1   9          1.748621     2.672188      0.006040   
1  2018-02  2018      2   9          1.337259     2.135938      0.006055   
2  2018-03  2018      3   9          1.936437     3.051875      0.006070   
3  2018-04  2018      4   9         18.719178    24.348594      0.006084   
4  2018-05  2018      5   9         28.764202    29.940625      0.006099   

   v_rainfall_total  
0              14.0  
1              12.0  
2              63.0  
3             171.0  
4             188.0  


   mc      Bias       MAE        MSE      RMSE
0   9  0.421919  2.962753  16.350804  4.043613


MC = 10


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  10         13.956664    15.517500      0.060708   
1  2018-02  2018      2  10         25.164960    22.378229      0.061050   
2  2018-03  2018      3  10         18.329113    16.103229      0.061392  


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  17         10.163143     9.336937      0.035487   
1  2018-02  2018      2  17          8.733635     9.910594      0.035892   
2  2018-03  2018      3  17          9.280934     9.797875      0.036298   
3  2018-04  2018      4  17         12.740397    14.993953      0.036703   
4  2018-05  2018      5  17         17.091076    16.743000      0.037108   

   v_rainfall_total  
0        126.453448  
1         80.530917  
2         92.041223  
3        185.881308  
4        250.499646  


   mc      Bias       MAE       MSE      RMSE
0  17  0.301072  0.872286  1.480048  1.216572


MC = 18


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  18         51.592779    46.659375      0.163456   
1  2018-02  2018      2  18         41.113992    39.600000      0.164367   
2  2018-03  2018      3  18         42.389197    41.425000      0.165277   



      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  25           8.37370         7.50      0.055072   
1  2018-02  2018      2  25           7.87320         5.18      0.055410   
2  2018-03  2018      3  25           8.84345         5.56      0.055748   
3  2018-04  2018      4  25          10.43535        12.68      0.056086   
4  2018-05  2018      5  25          11.10830        20.90      0.056423   

   v_rainfall_total  
0              19.0  
1               0.0  
2              13.0  
3              74.5  
4             269.5  


   mc      Bias     MAE        MSE      RMSE
0  25 -0.900071  2.5857  10.238961  3.199838


MC = 26


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  26           0.05585        32.83      0.062901   
1  2018-02  2018      2  26           2.82675        16.94      0.063136   
2  2018-03  2018      3  26           2.40215        35.38      0.063370   
3 


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  33           4.50370         6.41      0.025342   
1  2018-02  2018      2  33           5.54875         5.42      0.025392   
2  2018-03  2018      3  33           5.31655         4.92      0.025442   
3  2018-04  2018      4  33           4.44270         4.81      0.025493   
4  2018-05  2018      5  33           4.66000         3.47      0.025543   

   v_rainfall_total  
0            355.54  
1            296.24  
2            395.62  
3            195.47  
4            207.81  


   mc      Bias       MAE       MSE      RMSE
0  33 -0.154777  0.708565  0.793752  0.890928


MC = 34


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  34        174.209448   196.682292      0.029006   
1  2018-02  2018      2  34        239.737904   147.127604      0.029322   
2  2018-03  2018      3  34        314.492193   185.255208      0.029638   



      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  41        797.766823   1072.65625      0.021326   
1  2018-02  2018      2  41        869.082313    991.54375      0.021425   
2  2018-03  2018      3  41        632.477917    997.84375      0.021523   
3  2018-04  2018      4  41       2283.882913   1677.84375      0.021622   
4  2018-05  2018      5  41       6766.262968   6221.93750      0.021721   

   v_rainfall_total  
0          90.99919  
1           3.66993  
2         132.61149  
3         502.75084  
4         429.41482  


   mc       Bias         MAE            MSE        RMSE
0  41 -28.774981  502.902574  543643.691362  737.321973


MC = 42


      date  year  month  mc  v_flow_mean_mean  v_flow_mean  v_loss_cover  \
0  2018-01  2018      1  42        590.126089   527.618750      0.036243   
1  2018-02  2018      2  42        631.227865   179.143750      0.036491   
2  2018-03  2018      3  42        509.139073   103.5000

In [9]:
# Persist the Version 1 per-basin error metrics (the dataframe index is not written).
RFR_metrics.to_csv('../model/RFR_results_v1.csv', index = False)
RFR_metrics.head()

Unnamed: 0,mc,Bias,MAE,MSE,RMSE
0,1,-0.486402,0.852827,1.105587,1.051469
0,2,-0.688358,8.16725,99.990858,9.999543
0,3,-17.709754,57.093754,5147.01176,71.742677
0,4,-3.749044,4.24774,22.702059,4.764668
0,5,1.98776,12.502152,230.812583,15.192517


In [10]:
# Replace non-positive predicted flows with 0.01; all other values (NaN included)
# are left untouched.
RFR_prediction['v_flow_mean_mean'] = RFR_prediction['v_flow_mean_mean'].mask(
    RFR_prediction['v_flow_mean_mean'] <= 0, 0.01)

# Persist the Version 1 predictions (the dataframe index is not written).
RFR_prediction.to_csv('../model/RFR_predictions_v1.csv', index = False)

RFR_prediction.head()

Unnamed: 0,date,year,month,mc,v_flow_mean_mean,v_flow_mean,v_loss_cover,v_rainfall_total
0,2018-01,2018,1,1,3.32735,1.6,0.07206,222.87
1,2018-02,2018,2,1,3.47285,1.57,0.072256,143.49
2,2018-03,2018,3,1,3.32255,2.1,0.072453,274.72
3,2018-04,2018,4,1,4.2471,2.99,0.072649,65.35
4,2018-05,2018,5,1,3.74035,3.4,0.072846,214.84


---------------------------

### Process for all Macro basins (Version 2 : Without hyperparameter tuning - with stationary treatment)

In [None]:
nobs = 24  # number of trailing months held out as the test set
log_offset = 0.001  # added before taking the log so zero flows stay finite

# Reset the result holders up front so stale results from a previous run can never
# be saved by the following cells if this loop is interrupted.
RFR_metrics = pd.DataFrame()
RFR_prediction = pd.DataFrame()

# Per-basin results are collected in lists and concatenated once after the loop
# (calling pd.concat inside the loop re-copies everything on every iteration).
metrics_frames = []
forecast_frames = []

# NOTE(review): the treatment applied here is a log transform of the target for every
# basin; the `is_stationary` table loaded earlier is not consulted - confirm intent.
for i in mcs:

    print('\n==================================================================')
    print('MC = %s' % i)
    print('====================================================================\n')

    temp_df = macrodata[macrodata['mc'] == i].copy().reset_index(drop = True)
    temp_df['v_loss_cover_10k'] = temp_df['v_loss_cover'] * 10000

    # Features: lags 0, 1 and 2 of cover loss and rainfall. Missing values, including
    # the first rows that have no lagged history, are filled with 0.
    X = table2lags(temp_df[['v_loss_cover_10k', 'v_rainfall_total']], 2).fillna(0.00)

    # Split by row position: the last `nobs` rows form the test set.
    X_train = X.iloc[:-nobs].reset_index(drop = True)
    y_train = temp_df['v_flow_mean'].iloc[:-nobs].reset_index(drop = True)

    X_test = X.iloc[-nobs:].reset_index(drop = True)
    # The observed test flows stay in their original units; only the training target
    # is transformed, so nothing has to be round-tripped through log/exp.
    y_test = temp_df['v_flow_mean'].iloc[-nobs:].reset_index(drop = True)

    # The model is trained on log(flow + offset).
    # `criterion` is left at its default (squared error). The explicit 'mse' spelling
    # was renamed to 'squared_error' and is rejected by recent scikit-learn releases,
    # so omitting it keeps the same behaviour on old and new versions.
    rfR = RandomForestRegressor(n_estimators = 200, max_depth = 100,
                                random_state = 42, verbose = 0,
                                n_jobs = 5).fit(X_train, np.log(y_train + log_offset))

    # Exact inverse of the training transform: exp() followed by removing the offset.
    # Previously the offset was never subtracted, which shifted both the stored
    # actuals and the predictions by +0.001.
    y_pred = pd.Series(np.exp(rfR.predict(X_test)) - log_offset,
                       name = 'v_flow_mean_mean')

    # Metrics are computed in the original flow units.
    # Bias is the mean signed error (actual - predicted) over the test window.
    bias = (y_test - y_pred).sum() * 1.0 / nobs
    mae = skm.mean_absolute_error(y_test, y_pred)
    mse = skm.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Test-window rows with the prediction next to the observed flow.
    df_forecast = (
        temp_df.iloc[-nobs:]
        .reset_index(drop = True)
        .assign(v_flow_mean_mean = y_pred)
    )
    df_forecast = df_forecast[['date', 'year', 'month', 'mc', 'v_flow_mean_mean',
                               'v_flow_mean', 'v_loss_cover', 'v_rainfall_total']]

    print('\n== y predict =======================================================')
    print(df_forecast.head())
    print('=====================================================================\n')

    metrics = pd.DataFrame([[i, bias, mae, mse, rmse]],
                           columns = ['mc', 'Bias', 'MAE', 'MSE', 'RMSE'])

    print('\n== Metrics =======================================================')
    print(metrics.head())
    print('==================================================================\n')

    metrics_frames.append(metrics)
    forecast_frames.append(df_forecast)

# Single concatenation; pd.concat raises on an empty list, so keep the empty frames then.
if metrics_frames:
    RFR_metrics = pd.concat(metrics_frames, axis = 0)
    RFR_prediction = pd.concat(forecast_frames, axis = 0)

In [None]:
# Persist the Version 2 per-basin error metrics (the dataframe index is not written).
RFR_metrics.to_csv('../model/RFR_results_v2.csv', index = False)
RFR_metrics.head()

In [None]:
# Replace non-positive predicted flows with 0.01; all other values (NaN included)
# are left untouched.
RFR_prediction['v_flow_mean_mean'] = RFR_prediction['v_flow_mean_mean'].mask(
    RFR_prediction['v_flow_mean_mean'] <= 0, 0.01)

# Persist the Version 2 predictions (the dataframe index is not written).
RFR_prediction.to_csv('../model/RFR_predictions_v2.csv', index = False)

RFR_prediction.head()

---------------------------

### Process for all Macro basins (Version 3 : With hyperparameter tuning) - In progress

In [None]:
nobs = 24  # number of trailing months held out as the test set

# Reset the result holders up front so stale results from a previous run can never
# be saved by the following cells if this loop is interrupted.
RFR_metrics = pd.DataFrame()
RFR_prediction = pd.DataFrame()

# Per-basin results are collected in lists and concatenated once after the loop
# (calling pd.concat inside the loop re-copies everything on every iteration).
metrics_frames = []
forecast_frames = []

# NOTE(review): no hyperparameter search is performed yet; the forest below uses
# fixed settings (200 trees, max depth 100).
for i in mcs:

    print('\n==================================================================')
    print('MC = %s' % i)
    print('====================================================================\n')

    temp_df = macrodata[macrodata['mc'] == i].copy().reset_index(drop = True)
    temp_df['v_loss_cover_10k'] = temp_df['v_loss_cover'] * 10000

    # Features: lags 0, 1 and 2 of cover loss and rainfall. Missing values, including
    # the first rows that have no lagged history, are filled with 0.
    X = table2lags(temp_df[['v_loss_cover_10k', 'v_rainfall_total']], 2).fillna(0.00)

    # Split by row position: the last `nobs` rows form the test set.
    X_train = X.iloc[:-nobs].reset_index(drop = True)
    y_train = temp_df['v_flow_mean'].iloc[:-nobs].reset_index(drop = True)

    X_test = X.iloc[-nobs:].reset_index(drop = True)
    y_test = temp_df['v_flow_mean'].iloc[-nobs:].reset_index(drop = True)

    # `criterion` is left at its default (squared error). The explicit 'mse' spelling
    # was renamed to 'squared_error' and is rejected by recent scikit-learn releases,
    # so omitting it keeps the same behaviour on old and new versions.
    rfR = RandomForestRegressor(n_estimators = 200, max_depth = 100,
                                random_state = 42, verbose = 0,
                                n_jobs = 5).fit(X_train, y_train)

    # Predicted flow; the column keeps its historical name 'v_flow_mean_mean'.
    y_pred = pd.Series(rfR.predict(X_test), name = 'v_flow_mean_mean')

    # Bias is the mean signed error (actual - predicted) over the test window.
    bias = (y_test - y_pred).sum() * 1.0 / nobs
    mae = skm.mean_absolute_error(y_test, y_pred)
    mse = skm.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Test-window rows with the prediction next to the observed flow.
    df_forecast = (
        temp_df.iloc[-nobs:]
        .reset_index(drop = True)
        .assign(v_flow_mean_mean = y_pred)
    )
    df_forecast = df_forecast[['date', 'year', 'month', 'mc', 'v_flow_mean_mean',
                               'v_flow_mean', 'v_loss_cover', 'v_rainfall_total']]

    print('\n== y predict =======================================================')
    print(df_forecast.head())
    print('=====================================================================\n')

    metrics = pd.DataFrame([[i, bias, mae, mse, rmse]],
                           columns = ['mc', 'Bias', 'MAE', 'MSE', 'RMSE'])

    print('\n== Metrics =======================================================')
    print(metrics.head())
    print('==================================================================\n')

    metrics_frames.append(metrics)
    forecast_frames.append(df_forecast)

# Single concatenation; pd.concat raises on an empty list, so keep the empty frames then.
if metrics_frames:
    RFR_metrics = pd.concat(metrics_frames, axis = 0)
    RFR_prediction = pd.concat(forecast_frames, axis = 0)

In [None]:
# Persist the Version 3 per-basin error metrics (the dataframe index is not written).
RFR_metrics.to_csv('../model/RFR_results_v3.csv', index = False)
RFR_metrics.head()

In [None]:
# Replace non-positive predicted flows with 0.01; all other values (NaN included)
# are left untouched.
RFR_prediction['v_flow_mean_mean'] = RFR_prediction['v_flow_mean_mean'].mask(
    RFR_prediction['v_flow_mean_mean'] <= 0, 0.01)

# Persist the Version 3 predictions (the dataframe index is not written).
RFR_prediction.to_csv('../model/RFR_predictions_v3.csv', index = False)

RFR_prediction.head()