In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import os
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller


import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import date, timedelta
# Last day of the analysis window (the day list built from it includes this day).
e_date = datetime.datetime.strptime('2020-06-20', '%Y-%m-%d')
# First day of the analysis window.
i_date = datetime.datetime.strptime('2020-04-01', '%Y-%m-%d')

# Relative path of the folder the per-day OD CSV files are read from.
data_path = os.path.join('..','OD')

In [2]:
# Span between the first and last day of the analysis window.
delta = e_date - i_date

# One datetime per calendar day, from i_date through e_date inclusive.
target_days = [i_date + timedelta(days=offset) for offset in range(delta.days + 1)]

In [3]:
def read_OD_fn(source_type):
    """Load the daily OD CSV files of one source into a dict keyed by day.

    For every day in ``target_days`` the file
    ``date_<source_type>_OD_<YYYY-MM-DD>.csv`` inside ``data_path`` is read
    with its first column used as the index.

    Parameters
    ----------
    source_type : str
        Tag embedded in the file names (the notebook passes 'INE' and 'TWT').

    Returns
    -------
    dict
        Maps each entry of ``target_days`` to the DataFrame read for that day.
    """
    frames_by_day = {}
    for day in tqdm_notebook(target_days):
        file_name = 'date_{}_OD_{}.csv'.format(source_type, day.strftime('%Y-%m-%d'))
        frames_by_day[day] = pd.read_csv(os.path.join(data_path, file_name), index_col=0)
    return frames_by_day

In [4]:
ine_ods = read_OD_fn('INE')

  0%|          | 0/81 [00:00<?, ?it/s]

In [5]:
twt_ods = read_OD_fn('TWT')

  0%|          | 0/81 [00:00<?, ?it/s]

# Serializar los datos

In [6]:
# Row labels whose 'total' column is extracted from every daily OD table.
zone_codes = ('11CA', '33PO', '026V', '21BU', '49GI')


def _daily_totals(ods_by_day):
    """Return, for each label in `zone_codes`, the list of its daily 'total' values.

    `ods_by_day` maps every day of `target_days` to that day's OD table.
    The returned lists follow the order of `zone_codes`, and each list follows
    the order of `target_days`.
    """
    totals = {zone: [] for zone in zone_codes}
    # The loop variable is `day`, not `date`: at module level `date` would
    # overwrite the `date` class imported from `datetime`.
    for day in tqdm_notebook(target_days):
        table = ods_by_day[day]
        for zone in zone_codes:
            totals[zone].append(table.loc[zone, 'total'])
    return [totals[zone] for zone in zone_codes]


# One progress bar per source, as before; the numeric suffix follows zone_codes order.
lista_twt1, lista_twt2, lista_twt3, lista_twt4, lista_twt5 = _daily_totals(twt_ods)
lista_ine1, lista_ine2, lista_ine3, lista_ine4, lista_ine5 = _daily_totals(ine_ods)
    

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

In [7]:
def _trips_frame(daily_values):
    """Wrap a list of daily values in a one-column ('Viajes') frame indexed by target_days."""
    return pd.DataFrame(daily_values, columns=['Viajes'], index=target_days)


# One frame per zone and source; each numeric suffix matches the lista_* it is built from.
df_twt1, df_ine1 = _trips_frame(lista_twt1), _trips_frame(lista_ine1)
df_twt2, df_ine2 = _trips_frame(lista_twt2), _trips_frame(lista_ine2)
df_twt3, df_ine3 = _trips_frame(lista_twt3), _trips_frame(lista_ine3)
df_twt4, df_ine4 = _trips_frame(lista_twt4), _trips_frame(lista_ine4)
df_twt5, df_ine5 = _trips_frame(lista_twt5), _trips_frame(lista_ine5)

In [8]:
# Empty placeholders; nothing in the visible part of the notebook fills or reads them.
lista_twt = []
lista_ine = []

# Element-wise sum of the five per-zone frames of each source.
df_twt = df_twt1.add(df_twt2).add(df_twt3).add(df_twt4).add(df_twt5)
df_ine = df_ine1.add(df_ine2).add(df_ine3).add(df_ine4).add(df_ine5)

In [9]:
df_twt.head()

Unnamed: 0,Viajes
2020-04-01,0
2020-04-02,2
2020-04-03,0
2020-04-04,0
2020-04-05,0


In [10]:
df_ine.head()

Unnamed: 0,Viajes
2020-04-01,194
2020-04-02,183
2020-04-03,317
2020-04-04,177
2020-04-05,142


In [11]:
del twt_ods
del ine_ods

# Visualización de datos

In [12]:
print(df_twt.index.min())
print(df_ine.index.max())

2020-04-01 00:00:00
2020-06-20 00:00:00


In [13]:
# Count the rows dated 2020. Row selection by date string must go through .loc:
# DataFrame['2020'] is deprecated since pandas 1.2 and raises KeyError in pandas 2.x.
print(len(df_twt.loc['2020']))

81


In [14]:
# Count the rows dated 2020. Row selection by date string must go through .loc:
# DataFrame['2020'] is deprecated since pandas 1.2 and raises KeyError in pandas 2.x.
print(len(df_ine.loc['2020']))

81


In [15]:
df_twt.describe()

Unnamed: 0,Viajes
count,81.0
mean,0.407407
std,0.862812
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,4.0


In [16]:
df_ine.describe()

Unnamed: 0,Viajes
count,81.0
mean,366.592593
std,108.663446
min,110.0
25%,303.0
50%,390.0
75%,433.0
max,614.0


In [17]:
meses = df_twt.resample('M').mean()
meses

Unnamed: 0,Viajes
2020-04-30,0.366667
2020-05-31,0.387097
2020-06-30,0.5


In [18]:
meses = df_ine.resample('M').mean()
meses

Unnamed: 0,Viajes
2020-04-30,276.233333
2020-05-31,402.096774
2020-06-30,447.1


### Cointegration test

In [19]:
# Joint frame for the bivariate analysis: the TWT totals keep the column name
# 'Viajes' and the INE totals are appended as 'ine_trips'.
trips_df = df_twt.assign(ine_trips=df_ine['Viajes'])
trips_df.head()

Unnamed: 0,Viajes,ine_trips
2020-04-01,0,194
2020-04-02,2,183
2020-04-03,0,317
2020-04-04,0,177
2020-04-05,0,142


In [20]:
def cointegration_test(df, alpha=0.05): 
    """Run Johansen's cointegration test on `df` and print a summary table.

    The test is run with ``coint_johansen(df, -1, 5)``. Each trace statistic
    (``lr1``) is compared with the critical value (``cvt``) of the requested
    significance level.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per time series.
    alpha : float, default 0.05
        Significance level. Only 0.10, 0.05 and 0.01 are available, because
        the result object carries exactly three critical-value columns.

    Raises
    ------
    ValueError
        If `alpha` is not one of the three supported levels.
    """
    # Column of `out.cvt` that holds the critical values of each level.
    # Keys are floats: the previous string lookup `str(1 - alpha)` produced
    # '0.9' for alpha=0.10 and therefore never matched the key '0.90'.
    cv_column = {0.10: 0, 0.05: 1, 0.01: 2}
    level = next((lvl for lvl in cv_column if abs(lvl - alpha) < 1e-9), None)
    if level is None:
        raise ValueError(
            'alpha must be one of 0.10, 0.05 or 0.01, got {!r}'.format(alpha)
        )

    out = coint_johansen(df,-1,5)
    traces = out.lr1
    cvts = out.cvt[:, cv_column[level]]
    confidence = int(round((1 - level) * 100))

    def adjust(val, length= 6): 
        return str(val).ljust(length)

    # Summary
    # NOTE(review): rows are labelled with the column names, but Johansen trace
    # statistics are ordered by hypothesised cointegration rank (r <= 0,
    # r <= 1, ...) rather than by variable. Labels kept for output
    # compatibility; confirm how the table is meant to be read.
    print(f'Name   ::  Test Stat > C({confidence}%)    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(df.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace,2), 9), ">", adjust(cvt, 8), ' =>  ' , trace > cvt)


cointegration_test(trips_df)

Name   ::  Test Stat > C(95%)    =>   Signif  
 ----------------------------------------
Viajes ::  14.68     > 12.3212   =>   True
ine_trips ::  0.68      > 4.1296    =>   False


### Granger test

In [22]:
maxlag=12
test = 'ssr_chi2test'
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, maxlag=maxlag):    
    """Build a matrix of Granger-causality p-values for every pair of series.

    Rows are the response variable (suffix ``_y``) and columns the predictor
    (suffix ``_x``). Each cell holds the smallest p-value of `test` over lags
    1..`maxlag`, rounded to 4 decimals. A p-value below the significance
    level (e.g. 0.05) rejects the null hypothesis that the predictor's past
    values have zero coefficients, i.e. that X does not Granger-cause Y.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the time series variables.
    variables : sequence of str
        Names of the columns of `data` to test against each other.
    test : str, default 'ssr_chi2test'
        Key of the statistic to read from the `grangercausalitytests` result.
    verbose : bool, default False
        If True, print the per-lag p-values of every pair.
    maxlag : int, default the module-level ``maxlag`` at definition time
        Largest lag tested. It used to be read from the global variable and
        is now an explicit argument so a call no longer depends on hidden
        notebook state.

    Returns
    -------
    pandas.DataFrame
        Square frame of minimum p-values, indexed ``<var>_y`` by ``<var>_x``.
    """
    p_matrix = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in p_matrix.columns:
        for r in p_matrix.index:
            # Second column (c) is tested as a cause of the first column (r).
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            # The result dict is keyed by lag, starting at 1.
            p_values = [round(test_result[lag][0][test][1], 4) for lag in range(1, maxlag + 1)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            p_matrix.loc[r, c] = np.min(p_values)
    p_matrix.columns = [var + '_x' for var in variables]
    p_matrix.index = [var + '_y' for var in variables]
    return p_matrix

grangers_causation_matrix(trips_df, variables = trips_df.columns)

Unnamed: 0,Viajes_x,ine_trips_x
Viajes_y,1.0,0.0095
ine_trips_y,0.0256,1.0


### Check stationarity

In [24]:
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run the Augmented Dickey-Fuller test on `series` and print a report.

    Parameters
    ----------
    series : array-like or pandas.Series
        Series to test for a unit root.
    signif : float, default 0.05
        Significance level that the p-value is compared against.
    name : str, default ''
        Label shown in the report header. When empty, ``series.name`` is used
        if it exists, so reports for different series can be told apart.
    verbose : bool, default False
        Accepted for compatibility; the function does not use it.
    """
    if not name:
        inferred = getattr(series, 'name', None)
        if inferred is not None:
            name = inferred

    r = adfuller(series, autolag='AIC')
    output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
    # The decision below uses the p-value rounded to 4 decimals.
    p_value = output['pvalue'] 
    def adjust(val, length= 6): return str(val).ljust(length)

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(f' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {output["test_statistic"]}')
    print(f' No. Lags Chosen       = {output["n_lags"]}')

    # r[4] maps each critical-level label (e.g. '1%') to its critical value.
    for key,val in r[4].items():
        print(f' Critical value {adjust(key)} = {round(val, 3)}')

    if p_value <= signif:
        print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
        print(f" => Series is Stationary.")
    else:
        print(f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.")
        print(f" => Series is Non-Stationary.")

adfuller_test(trips_df['Viajes'])

    Augmented Dickey-Fuller Test on "" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -10.2206
 No. Lags Chosen       = 0
 Critical value 1%     = -3.515
 Critical value 5%     = -2.898
 Critical value 10%    = -2.586
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


In [25]:
adfuller_test(trips_df['ine_trips'])

    Augmented Dickey-Fuller Test on "" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.3786
 No. Lags Chosen       = 12
 Critical value 1%     = -3.53
 Critical value 5%     = -2.905
 Critical value 10%    = -2.59
 => P-Value = 0.0117. Rejecting Null Hypothesis.
 => Series is Stationary.


In [26]:
trips_df_differenced = trips_df.diff().dropna()

adfuller_test(trips_df_differenced['Viajes'])

    Augmented Dickey-Fuller Test on "" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -6.5404
 No. Lags Chosen       = 7
 Critical value 1%     = -3.525
 Critical value 5%     = -2.903
 Critical value 10%    = -2.589
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


In [27]:
adfuller_test(trips_df_differenced['ine_trips'])

    Augmented Dickey-Fuller Test on "" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.9319
 No. Lags Chosen       = 11
 Critical value 1%     = -3.53
 Critical value 5%     = -2.905
 Critical value 10%    = -2.59
 => P-Value = 0.0018. Rejecting Null Hypothesis.
 => Series is Stationary.


### Select order for VAR Model

In [28]:
# No visible effect: this expression is not the last one in the cell, so its value is not displayed.
trips_df_differenced.columns

# VAR model over the first-differenced two-column frame.
model = VAR(trips_df_differenced)

# Compare the information criteria (AIC, BIC, FPE, HQIC) across lag orders, up to 12 lags requested.
x = model.select_order(maxlags=12)
x.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,9.529,9.594,1.375e+04,9.555
1.0,9.040,9.236*,8438.,9.118
2.0,8.934,9.260,7587.,9.063
3.0,8.816,9.273,6748.,8.997*
4.0,8.772,9.359,6470.,9.005
5.0,8.776,9.494,6516.,9.061
6.0,8.723,9.572,6204.,9.060
7.0,8.680*,9.659,5973.*,9.068
8.0,8.693,9.803,6090.,9.132


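The lag order is set to 7 days, the value that minimises both AIC and FPE in the table above (BIC favours 1 and HQIC favours 3).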

In [29]:
model_fitted = model.fit(7)
model_fitted.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 21, Sep, 2021
Time:                     11:40:40
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                    9.57216
Nobs:                     73.0000    HQIC:                   9.00599
Log likelihood:          -492.192    FPE:                    5668.85
AIC:                      8.63087    Det(Omega_mle):         3900.99
--------------------------------------------------------------------
Results for equation Viajes
                  coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------------
const                0.052258         0.109658            0.477           0.634
L1.Viajes           -0.866039         0.120984           -7.158           0.000
L1.ine_trips        -0.001656         0.001706           -0.971

### Forecast test

In [30]:
def mape(actual, pred): 
    """Return the mean absolute percentage error of `pred` against `actual`, in percent.

    Both arguments are converted to NumPy arrays. Each error is divided by the
    corresponding value of `actual`.
    """
    observed = np.array(actual)
    predicted = np.array(pred)
    relative_errors = (observed - predicted) / observed
    return np.abs(relative_errors).mean() * 100


def compute_metrics_fn(y_valid, y_hat):
    """Compute the error metrics of a forecast against the observed values.

    Parameters
    ----------
    y_valid : 1-D sequence of numbers
        Observed values.
    y_hat : 1-D sequence of numbers
        Predicted values, aligned with `y_valid`.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, cvrmse, mape)``. ``cvrmse`` is the RMSE divided by
        the mean of `y_valid`, expressed as a percentage, and ``mape`` is also
        a percentage.
    """
    mae_ = mean_absolute_error(y_valid, y_hat)
    mse_ = mean_squared_error(y_valid, y_hat)
    # RMSE is the square root of the MSE already computed. This avoids the
    # `squared=False` argument, which scikit-learn deprecated in 1.4 and
    # removed in 1.6. For the 1-D inputs used here the value is the same.
    rmse_ = np.sqrt(mse_)
    cvrmse_ = rmse_/np.mean(y_valid)*100 # it is a percentage
    mape_ = mape(y_valid, y_hat)
    
    return mae_, mse_, rmse_, cvrmse_, mape_

trips_df.shape

(81, 2)

In [31]:
# Rolling-origin evaluation of the VAR forecast for 'ine_trips'.
horizon = 7        # days forecast from every origin
lag_order = 7      # VAR lag order, same as the model fitted earlier in the notebook
first_origin = 60  # first row position used as a forecast origin
n_obs = len(trips_df)  # replaces the hard-coded 81 (the frame's row count)

ine_true = []
ine_hat = []
# NOTE(review): the upper bound makes the last origin n_obs - horizon - 1, so the
# final complete window (origin n_obs - horizon) is never evaluated. Kept as is
# so the scores stay unchanged; confirm whether it should be included.
for i in range(first_origin, n_obs - horizon):
    # History available at origin i, and its first differences.
    X = trips_df.iloc[:i]
    X_differenced = X.diff().dropna()

    # The `horizon` rows that follow the origin are the ground truth.
    y = trips_df.iloc[i:i + horizon]

    # NOTE(review): the model is fitted without the last 7 differenced rows of
    # the history, although those same rows then seed the forecast. Confirm
    # this hold-out is intentional.
    model = VAR(X_differenced.iloc[:-horizon])
    model_fitted = model.fit(lag_order)
    y_hat = model_fitted.forecast(y=X_differenced.values[-lag_order:], steps=horizon)

    df_forecast = pd.DataFrame(y_hat, index=y.index, columns=trips_df.columns + '_1d')

    # Undo the differencing: last observed level plus the cumulative forecast differences.
    for col in trips_df.columns:
        df_forecast[str(col)+'_forecast'] = X[col].iloc[-1] + df_forecast[str(col)+'_1d'].cumsum()

    # Select by column name rather than position, and extend in place rather
    # than rebuilding the lists on every iteration.
    ine_true.extend(y['ine_trips'].tolist())
    ine_hat.extend(df_forecast['ine_trips_forecast'].tolist())

testScore_MAE, testScore_MSE, testScore_RMSE, testScore_CVRMSE, testScore_MAPE  =  compute_metrics_fn(ine_true, ine_hat)

print('Resultado del test : %.2f MAE' % ( testScore_MAE))
print('Resultado del test: %.2f MSE' % ( testScore_MSE))
print('Resultado del test: %.2f RMSE' %  (testScore_RMSE))
print('Resultado del test: %.2f CVRMSE' % ( testScore_CVRMSE))
print('Resultado del test: %.2f MAPE \n' % (testScore_MAPE))

Resultado del test : 98.55 MAE
Resultado del test: 15051.25 MSE
Resultado del test: 122.68 RMSE
Resultado del test: 28.11 CVRMSE
Resultado del test: 23.05 MAPE 



In [32]:
# ine_true / ine_hat hold consecutive 7-value forecast windows, so a stride of 7
# starting at `step` collects the values at the same position of every window.
for step in range(7):
    step_true = ine_true[step::7]
    step_hat = ine_hat[step::7]
    scores = compute_metrics_fn(step_true, step_hat)
    testScore_MAE, testScore_MSE, testScore_RMSE, testScore_CVRMSE, testScore_MAPE = scores

    print('Resultado del test dia %d: %.2f MAE' % (step, testScore_MAE))
    print('Resultado del test dia %d: %.2f MSE' % (step, testScore_MSE))
    print('Resultado del test dia %d: %.2f RMSE' % (step, testScore_RMSE))
    print('Resultado del test dia %d: %.2f CVRMSE' % (step, testScore_CVRMSE))
    print('Resultado del test dia %d: %.2f MAPE \n' % (step, testScore_MAPE))

Resultado del test dia 0: 90.14 MAE
Resultado del test dia 0: 12991.07 MSE
Resultado del test dia 0: 113.98 RMSE
Resultado del test dia 0: 26.38 CVRMSE
Resultado del test dia 0: 20.79 MAPE 

Resultado del test dia 1: 89.96 MAE
Resultado del test dia 1: 15742.09 MSE
Resultado del test dia 1: 125.47 RMSE
Resultado del test dia 1: 29.14 CVRMSE
Resultado del test dia 1: 21.18 MAPE 

Resultado del test dia 2: 112.67 MAE
Resultado del test dia 2: 21366.85 MSE
Resultado del test dia 2: 146.17 RMSE
Resultado del test dia 2: 33.73 CVRMSE
Resultado del test dia 2: 26.44 MAPE 

Resultado del test dia 3: 108.14 MAE
Resultado del test dia 3: 18103.37 MSE
Resultado del test dia 3: 134.55 RMSE
Resultado del test dia 3: 30.81 CVRMSE
Resultado del test dia 3: 25.16 MAPE 

Resultado del test dia 4: 100.31 MAE
Resultado del test dia 4: 13542.59 MSE
Resultado del test dia 4: 116.37 RMSE
Resultado del test dia 4: 26.52 CVRMSE
Resultado del test dia 4: 23.53 MAPE 

Resultado del test dia 5: 99.44 MAE
Result

In [33]:
print("That's all folks!")

That's all folks!
