# Load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error   # para calcular el error cuadratico medio
from math import sqrt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio # settings like template by default

pio.templates.default = 'plotly_dark' # default template

# Modelos de ML
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv('data/airline-passengers.csv', header=0, index_col=0, parse_dates=True)
print(f"Size data: {df.shape}")
df.head(3)

Size data: (144, 1)


Unnamed: 0_level_0,Passengers
Month,Unnamed: 1_level_1
1949-01-01,112
1949-02-01,118
1949-03-01,132


In [5]:
fig_area = px.line(df, x = df.index, y = 'Passengers')
fig_area.update_layout(
    title = "<b>Time series passengers</b>",
    xaxis = dict(title='Date'), yaxis = dict(title='passengers')
)
fig_area.show()

In [8]:
def get_lag_features(serie, numlags):
     if isinstance(serie, (np.ndarray)):  # si la serie es un array numpy 
        return serie[-numlags:].tolist()
     else:  # if it is a list
        return serie[-numlags:]

In [9]:
serie = df.values   # obtiene los valores de la serie
#serie = np.log(serie)  
numlags = 3      # numero de atributos lag a obtener

# construye la data para ML
X = []   # variable donde colocara los atributos de entrada
y = []   # variable objetivo 
for i in range(numlags, len(serie)):  
    X.append( get_lag_features(serie[:i, 0], numlags) )
    y.append(serie[i, 0])

# Print values len(X)
for i in range(6):
    print(f"lags {X[i]} current: {y[i]}")

lags [112, 118, 132] current: 129
lags [118, 132, 129] current: 121
lags [132, 129, 121] current: 135
lags [129, 121, 135] current: 148
lags [121, 135, 148] current: 148
lags [135, 148, 148] current: 136


# Machine learning models

In [10]:
def evaluate_MLmodels_lags(serie, numlags, train_fraction):
    # split train/test
    train_size = int(len(serie) * train_fraction)
    train, test = serie[0:train_size], serie[train_size:]
    
    print(f"size train: {train.shape}")
    print(f"size test: {test.shape}")
    
    X_train = []   # variable donde colocara los atributos de entrada
    y_train = []   # variable target 
    for i in range(numlags, len(train)-1):  
        X_train.append(get_lag_features(train[:i, 0], numlags))
        y_train.append(train[i, 0])         

    print(X_train)
    print(y_train)

    history = train.tolist()  # serie historica. Inicialmente es train    
    predictionsRF = list()  # array para almacenar las predicciones de Random Forest (RF) en la serie de test
    predictionsLR = list()  # array para almacenar las predicciones de Linear Regression (LR) en la serie de test
    
    # Evaluate in walk-forward ML models
    for t in range(len(test)):
        print(f'Training ML models to predict test {t}')
    
        # Random Forest 
        rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0) # 100 trees
        print(X_train)
        print(y_train)
        rf_model.fit(X_train, y_train)
    
        # Linear regression
        lr_model = LinearRegression(n_jobs=-1)
        lr_model.fit(X_train, y_train)
            
        # Obtiene los atributos para hacer la prediccion del punto de test
        X_test = get_lag_features(history, numlags)
        print(X_test)
        # Predict test and save it
        y_RF = rf_model.predict([X_test])[0]
        predictionsRF.append(y_RF)
        
        y_LR = lr_model.predict([X_test])[0]
        predictionsLR.append(y_LR)
    
        # concatena el valor de test actual a la serie histórica. 
        history.append( test[t] ) 
        
        # El ejemplo testado lo agrega a la data de entrenamiento para el siguiente entrenamiento
        X_train.append(X_test)
        y_train.append(test[t, 0])         
    
    # calcula e imprime el RMSE en test   
    rmseRF = sqrt(mean_squared_error(test, predictionsRF))
    rmseLR = sqrt(mean_squared_error(test, predictionsLR))
    print('RMSE de Modelo RF con lag features = %.5f' % rmseRF)
    print('RMSE de Modelo LR con lag features = %.5f' % rmseLR)
    
    # plotea las predicciones versus lo real
    plt.plot(test, color='black')
    plt.plot(predictionsRF, color='green')
    plt.plot(predictionsLR, color='red')
    plt.legend(['real', 'RF_prediccion', 'LR_prediccion'],  loc='upper left')
    plt.show()

In [80]:
numlags = 13 # lags values to get
train_fraction = 0.8  # 80% training

evaluate_MLmodels_lags(serie, numlags, train_fraction )

size train: (115, 1)
size test: (29, 1)
[[112, 118, 132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115], [118, 132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115, 126], [132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115, 126, 141], [129, 121, 135, 148, 148, 136, 119, 104, 118, 115, 126, 141, 135], [121, 135, 148, 148, 136, 119, 104, 118, 115, 126, 141, 135, 125], [135, 148, 148, 136, 119, 104, 118, 115, 126, 141, 135, 125, 149], [148, 148, 136, 119, 104, 118, 115, 126, 141, 135, 125, 149, 170], [148, 136, 119, 104, 118, 115, 126, 141, 135, 125, 149, 170, 170], [136, 119, 104, 118, 115, 126, 141, 135, 125, 149, 170, 170, 158], [119, 104, 118, 115, 126, 141, 135, 125, 149, 170, 170, 158, 133], [104, 118, 115, 126, 141, 135, 125, 149, 170, 170, 158, 133, 114], [118, 115, 126, 141, 135, 125, 149, 170, 170, 158, 133, 114, 140], [115, 126, 141, 135, 125, 149, 170, 170, 158, 133, 114, 140, 145], [126, 141, 135, 125, 149, 170, 170, 158, 133, 114, 140, 145, 150], [141, 135, 125, 149, 

ValueError: X has 12 features, but RandomForestRegressor is expecting 13 features as input.