# 3. Regression
In this notebook we will finally run our regression models. For that purpose, we are importing the necessary libraries and functions from our ```modules``` folder. We are also importing our extracted dataframe

In [1]:
# Warnings
import warnings
warnings.filterwarnings("ignore")

# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import seaborn as sns
from scipy import stats
from functools import reduce

# Statsmodels
import statsmodels.api as sm
import pmdarima as pmd
from pmdarima.arima import auto_arima
from statsmodels.tsa.api import VAR
from statsmodels.tsa.vector_ar.var_model import VARResults
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

# Machine Learning models
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import Ridge, Lasso, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    median_absolute_error,
    r2_score,
    precision_score

)

from xgboost import XGBRegressor

In [2]:
# We import our own functions
import sys
sys.path.append('../../../')  # Move two levels up to the project root
from modules.functions import *

In [1]:
df = pd.read_csv('../../../input/df_raw_c19.csv', parse_dates=['Fecha'], index_col='Fecha')
df.tail()

NameError: name 'pd' is not defined

In [None]:
df_lags = pd.read_csv('../../../input/df_lags_c19.csv', parse_dates=['Fecha'], index_col='Fecha')
df_lags.tail()

## 3.1 Benchmark models

In the first section, we first run our benchmark econometric models: ```Random Walk (RW)```,  ```Autoregressive Integrated Moving Average (ARIMA)``` and ```Vector Autoregression (VAR)``` processes

### 3.1.1 Random Walk (RW)

In [5]:
forecast_horizons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# We define our target variable
target = 'CPI Core'

# We only use CPI as Random Walk is an univariate process
df_CPI_Core = pd.DataFrame(df_lags['CPI Core'])

# We create our train and test set
train_set = df_CPI_Core[df_CPI_Core.index < '2019-01-01']
test_set  = df_CPI_Core[df_CPI_Core.index >= '2019-01-01']

predictions = {}

for h in forecast_horizons:
    # We get the values h horizons before
    predicted_value = train_set.iloc[-h, 0]

    # We save it for horizon h
    predictions[h] = predicted_value

predicted = pd.DataFrame([predictions]).transpose().reset_index()

predicted.columns = ['Horizon', 'Prediction']

predicted = predicted.set_index(test_set.index)

predicted

Unnamed: 0_level_0,Horizon,Prediction
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,1,0.202035
2019-02-01,2,0.206436
2019-03-01,3,0.209674
2019-04-01,4,0.211221
2019-05-01,5,0.210862
2019-06-01,6,0.208824
2019-07-01,7,0.205903
2019-08-01,8,0.202773
2019-09-01,9,0.199732
2019-10-01,10,0.196761


In [6]:
# We create our results dataframe, concatenating the predicted and the actual values
results = pd.concat([predicted, test_set[target]], axis=1)
results.rename(columns={'Horizon': 'Horizon', 'Prediction': 'Predicted', 'CPI Core': 'Actual'}, inplace=True)
results

Unnamed: 0_level_0,Horizon,Predicted,Actual
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,1,0.202035,0.19702
2019-02-01,2,0.206436,0.193185
2019-03-01,3,0.209674,0.189774
2019-04-01,4,0.211221,0.186363
2019-05-01,5,0.210862,0.182886
2019-06-01,6,0.208824,0.179341
2019-07-01,7,0.205903,0.175751
2019-08-01,8,0.202773,0.172176
2019-09-01,9,0.199732,0.168668
2019-10-01,10,0.196761,0.165254


In [7]:
# We get our metrics using our function
RMSE_rw, MAPE_rw = get_metrics(results, 'RW')
metrics_rw = pd.concat([RMSE_rw, MAPE_rw], axis = 1)
metrics_rw

Unnamed: 0,RMSE_RW,MAPE_RW
1,0.005014,0.02545
2,0.010019,0.047023
3,0.014104,0.066302
4,0.017426,0.083071
5,0.019986,0.097051
6,0.021858,0.108276
7,0.023225,0.117317
8,0.024269,0.124866
9,0.025115,0.131455
10,0.025826,0.137376


### 3.1.2 Autoregressive Integrated Moving Average (ARIMA)

In [8]:
# We only use CPI as ARIMA is an univariate process
df_CPI_Core = pd.DataFrame(df_lags['CPI Core'])

# We create our train and test set
train_set = df_CPI_Core[df_CPI_Core.index < '2019-01-01']
test_set  = df_CPI_Core[df_CPI_Core.index >= '2019-01-01']

In [9]:
# We find the best SARIMA model
autoarima = pmd.auto_arima(
        y = train_set,
        start_p=1,
        start_q=0,
        seasonal=True,
        max_p=12,
        max_d=1,
        max_q=6,
        max_P=12,
        max_D=1,
        max_Q=6,
        m=4,
        n_jobs=-1,
        suppress_warnings=True,
        )

# We indicate the seasonal order for monthly data
seasonal_order = (1, 1, 1, 12)

# We create our ARIMA model
model = SARIMAX(train_set,
                order=autoarima.order,
                seasonal_order=autoarima.seasonal_order,
                enforce_stationarity = False,
                enforce_invertibility = False)
        
# We fit the model
model_fit = model.fit()

# We forecast for the next 12 horizons
forecast_values = model_fit.get_forecast(steps=12)
predicted = pd.DataFrame(forecast_values.predicted_mean, index = test_set.index)

# We create our results dataframe, concatenating the predicted and the actual values
results = pd.concat([predicted, test_set[target]], axis=1)
results.rename(columns={'predicted_mean': 'Predicted', 'CPI Core': 'Actual'}, inplace=True)
results

Unnamed: 0_level_0,Predicted,Actual
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,0.196829,0.19702
2019-02-01,0.191032,0.193185
2019-03-01,0.184859,0.189774
2019-04-01,0.178525,0.186363
2019-05-01,0.172234,0.182886
2019-06-01,0.166176,0.179341
2019-07-01,0.160523,0.175751
2019-08-01,0.15542,0.172176
2019-09-01,0.150988,0.168668
2019-10-01,0.147321,0.165254


In [10]:
# We get our metrics using our function
RMSE_arima, MAPE_arima = get_metrics(results, 'ARIMA')
metrics_arima= pd.concat([RMSE_arima, MAPE_arima], axis = 1)
metrics_arima

Unnamed: 0,RMSE_ARIMA,MAPE_ARIMA
1,0.000191,0.000971
2,0.001529,0.006058
3,0.0031,0.012672
4,0.004751,0.020019
5,0.006383,0.027664
6,0.007927,0.035287
7,0.009327,0.042624
8,0.010546,0.049461
9,0.011558,0.055612
10,0.012344,0.060903


### 3.1.3 Vector autoregression (VAR)

In [11]:
# We define our target variable, as well as our train and test set
target = 'CPI Core'
train_set = df[df.index < '2019-01-01']
test_set  = df[df.index >= '2019-01-01']

In [12]:
# We model our VAR including up to two lags
model_var = VAR(df)
model_fit = model_var.fit(2)

In [13]:
# We forecast for the next 12 months
preds = model_fit.forecast(df.values[-2:], 12)
preds = pd.DataFrame(preds, index = test_set[target].index)[0]

# We create our results dataframe, concatenating the predicted and the actual values
results = pd.concat([preds, test_set[target]],axis=1)
results.rename(columns={'CPI Core': 'Actual', 0: 'Predicted'}, inplace=True)
results

Unnamed: 0_level_0,Predicted,Actual
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,0.156116,0.19702
2019-02-01,0.15452,0.193185
2019-03-01,0.154181,0.189774
2019-04-01,0.155399,0.186363
2019-05-01,0.158548,0.182886
2019-06-01,0.163977,0.179341
2019-07-01,0.171995,0.175751
2019-08-01,0.182844,0.172176
2019-09-01,0.196667,0.168668
2019-10-01,0.213473,0.165254


In [14]:
# We get our metrics using our function
RMSE_var, MAPE_var = get_metrics(results, 'VAR')
metrics_var= pd.concat([RMSE_var, MAPE_var], axis = 1)
metrics_var

Unnamed: 0,RMSE_VAR,MAPE_VAR
1,0.040905,0.207617
2,0.0398,0.20388
3,0.038449,0.198438
4,0.036721,0.190366
5,0.034601,0.178908
6,0.032203,0.163368
7,0.029848,0.143082
8,0.028174,0.132942
9,0.028154,0.136615
10,0.030756,0.152133


## 3.2 Machine learning models

In the second section, we run our machine learning models: ```Ridge Regression (Ridge)```,  ```Least Absolute Shrinkage and Selection Operator (LASSO)``` and ```Random Forest (RF)``` models

### 3.2.1 Ridge Regression (Ridge)

In [15]:
# def test_models_regression(models, data, pred_vars, target_var ):
#     """
#     Evalúa modelos de regresión utilizando validación cruzada en series temporales.

#     Parámetros:
#     - modelos: Diccionario de modelos de regresión para evaluar.
#     - datos: DataFrame que contiene el conjunto de datos.
#     - variables_predictoras: Lista de nombres de variables predictoras.
#     - variable_objetivo: Nombre de la variable objetivo.

#     Retorna:
#     DataFrame: Resultados de la evaluación del modelo.
#     """       
#     results = {
#         'Model': [],
#         'R2_train': [],
#         'R2_test': [],
#         'MAE_train': [],
#         'MAE_test': [],
#         'MAPE_train': [],
#         'MAPE_test': [],
#         'MSE_train': [],
#         'MSE_test': [],
#         'RMSE_train': [],
#         'RMSE_test': [],
#         'Grid_Search_Params': []
#     }
    
#     X = data[pred_vars]
#     y = data[target_var]
    
#     cv = TimeSeriesSplit(n_splits=5)
    
#     print(f"Entrenando y evaluando modelos...")
    
#     for model_name, model_params in models.items():
#         print(f"Procesando el modelo: {model_name}")
        
#         if 'model' in model_params:
#             model = model_params['model']
#         else:
#             raise ValueError(f'Model is not defined for {model_name}')
        
#         if 'grid_params' in model_params:
#             grid_params = model_params['grid_params']
#         else:
#             grid_params = None
        
#         best_params = None
        
#         for ii, (tr, tt) in enumerate(cv.split(X, y)):
#             X_train, X_test = X.iloc[tr], X.iloc[tt]
#             y_train, y_test = y.iloc[tr], y.iloc[tt]
            
#             if ii == (cv.n_splits - 1):
            
#                 if grid_params is not None:
#                     grid_search = GridSearchCV(model, grid_params, cv=cv)
#                     grid_search.fit(X_train, y_train)
#                     best_model = grid_search.best_estimator_
#                     best_params = grid_search.best_params_

#                     if hasattr( best_model, 'feature_importances_' ):

#                         feature_importances = best_model.feature_importances_
#                         vars_df             = pd.DataFrame( {'Var': pred_vars, 'Importance Score': feature_importances } )
#                         vars_df             = vars_df.reindex( vars_df[ 'Importance Score' ].abs().sort_values( ascending = False ).index )
#                         vars_df.to_excel( f'varlist__{ model_name }.xlsx' )

#                     elif hasattr( best_model, 'coef_' ):

#                         coefficients = best_model.coef_[ 0 ]
#                         vars_df      = pd.DataFrame( {'Var': best_model.feature_names_in_, 'Coefficient': coefficients } )
#                         vars_df      = vars_df.reindex( vars_df[ 'Coefficient' ].abs().sort_values( ascending = False ).index )
#                         vars_df.to_excel( f'varlist_{ model_name }.xlsx' )
#                 else:
#                     best_model = model.fit(X_train, y_train)
#                     coefficients = best_model.coef_[ 0 ]
#                     vars_df      = pd.DataFrame( {'Var': best_model.feature_names_in_, 'Coefficient': coefficients } )
#                     vars_df      = vars_df.reindex( vars_df[ 'Coefficient' ].abs().sort_values( ascending = False ).index )
#                     vars_df.to_excel( f'varlist_{ model_name }.xlsx' )

#                 y_pred_train = best_model.predict(X_train)
#                 y_pred_test = best_model.predict(X_test)

#                 best_model_params = {
#                     'Model': model_name,
#                     'Grid_Search_Params': best_params
#                 }

#         results['Model'].append(best_model_params['Model'])
#         results['Grid_Search_Params'].append(best_model_params['Grid_Search_Params'])
    
#     results_df = pd.DataFrame(results)
#     results_df = results_df.sort_values(by='RMSE_test', ascending=True)

#     return results_df

In [16]:
# import pandas as pd
# from datetime import datetime, timedelta

# # Asumiendo que tu DataFrame tiene una columna 'timestamp' que representa la fecha
# # y que 'target_variable' es la variable que deseas predecir

# # 1. Filtra los datos hasta diciembre de 2018
# train_data = df[df['timestamp'] <= '2018-12-01']

# # 2. Filtra los últimos 12 meses para el conjunto de prueba
# test_start_date = datetime.strptime('2018-12-01', '%Y-%m-%d') + timedelta(days=1)
# test_data = df[(df['timestamp'] >= test_start_date) & (df['timestamp'] <= '2019-12-01')]

# # 3. Divide tus datos en características (X) y variable objetivo (y)
# X_train = train_data.drop('target_variable', axis=1)  # Ajusta 'target_variable' al nombre de tu variable objetivo
# y_train = train_data['target_variable']

# X_test = test_data.drop('target_variable', axis=1)
# y_test = test_data['target_variable']

# # Ahora tienes X_train, y_train para entrenar tu modelo hasta diciembre de 2018,
# # y X_test, y_test para evaluar tu modelo en los últimos 12 meses.
