In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

In [2]:
# One-time setup: uncomment and run if skforecast is missing from the kernel's environment.
# %pip install skforecast

In [3]:
# Load the historical sales records from the CSV file (relative path).
datos_ventas = pd.read_csv(filepath_or_buffer='desafio_modelo_de_demanda.csv')

# Preview the first ten rows of the raw table.
datos_ventas.head(n=10)

Unnamed: 0,fecha,prod_id,precio_promedio,ventas,modelo_actual
0,2017-01-01,0,6651.73792,1786.0,1943.551087
1,2017-02-01,0,4919.698602,2194.0,1853.11185
2,2017-03-01,0,3277.484704,1178.0,1659.733052
3,2017-04-01,0,5906.864332,1010.0,1580.492875
4,2017-05-01,0,4662.644018,1327.0,1552.062093
5,2017-06-01,0,6342.04209,982.0,1108.069873
6,2017-07-01,0,4025.450305,1041.0,1027.946609
7,2017-08-01,0,5271.820417,874.0,1143.970335
8,2017-09-01,0,4657.93386,1042.0,1038.140169
9,2017-10-01,0,6146.444333,618.0,854.462461


In [4]:
# Parse the date column, keep only product 101, order the rows chronologically
# and renumber them from zero.
datos_ventas = (
    datos_ventas
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .loc[lambda frame: frame['prod_id'] == 101]
    .sort_values('fecha')
    .reset_index(drop=True)
)
# Optional year filter (disabled):
# datos_ventas = datos_ventas[datos_ventas['fecha'].dt.strftime('%Y') == '2023']

# `data` is a second name for the same filtered frame.
data = datos_ventas
data.head(12)

Unnamed: 0,fecha,prod_id,precio_promedio,ventas,modelo_actual
0,2012-01-01,101,2993.40836,1600.0,1692.686971
1,2012-02-01,101,1444.03963,2216.0,1844.94934
2,2012-03-01,101,2722.151766,331.0,1337.859171
3,2012-04-01,101,2108.265064,389.0,1159.44116
4,2012-05-01,101,2168.753385,251.0,816.534033
5,2012-06-01,101,2214.057317,396.0,314.40799
6,2012-07-01,101,2651.306993,270.0,313.165977
7,2012-08-01,101,2192.204142,538.0,333.145388
8,2012-09-01,101,1775.795149,491.0,438.788002
9,2012-10-01,101,1861.631503,642.0,457.264761


In [5]:
# Data preprocessing
# ==============================================================================
# Build the modelling frame from `datos_ventas` instead of mutating `data` in
# place: the cell can then be re-run safely (previously a second run failed
# because `fecha` had already been moved into the index) and the filtered frame
# is left untouched.
# The dates are ISO formatted (YYYY-MM-DD), so no explicit `format` is passed;
# the former '%d/%m/%y' pattern did not match them.
data = (
    datos_ventas
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .set_index('fecha')   # use the date as the index
    .asfreq('MS')         # regular month-start frequency
    .sort_index()
)
data.head(3)

Unnamed: 0_level_0,prod_id,precio_promedio,ventas,modelo_actual
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,101,2993.40836,1600.0,1692.686971
2012-02-01,101,1444.03963,2216.0,1844.94934
2012-03-01,101,2722.151766,331.0,1337.859171


In [6]:
# Placeholder frame for the 2024 forecast horizon: one row per month start,
# indexed by `fecha`, with `ventas` left as NaN.
# The frame is created directly on its date index. Assigning a scalar to a
# column of an empty DataFrame stores no rows, so the previous literal `2`
# never reached the frame and the column ended up as NaN anyway; the NaN
# placeholder is now explicit.
data_pred = pd.DataFrame(
    {'ventas': np.nan},
    index=pd.date_range(start='2024-01-01', end='2024-12-31', freq='MS', name='fecha'),
)
data_pred = data_pred.asfreq('MS').sort_index()
data_pred.head(3)

Unnamed: 0_level_0,ventas
fecha,Unnamed: 1_level_1
2024-01-01,
2024-02-01,
2024-03-01,


In [7]:
# Sanity check on the time index
# ==============================================================================
# True when the index matches, position by position, an uninterrupted date
# range built from its own first/last timestamp and frequency.
(pd.date_range(start=data.index.min(), end=data.index.max(), freq=data.index.freq) == data.index).all()

True

In [8]:
# Number of rows that contain at least one missing value.
print(f"Missing values: {data.isna().any(axis='columns').sum()}")

Missing values: 0


In [9]:
# Cut-off timestamps that separate the train / validation / test partitions
end_train = '2015-03-30 23:59:00'
end_validation = '2019-06-30 23:59:00'
# Window covered by the frame of future dates to predict
start_pred = '2024-01-01 00:00:00'
end_pred = '2024-12-31 23:59:00'

# Label-based slices on the date index.
data_train = data.loc[:end_train]
data_val = data.loc[end_train:end_validation]
data_test = data.loc[end_validation:]
data_pred = data_pred.loc[start_pred:end_pred]

# One summary line per partition: first date, last date and row count.
for _label, _frame in (
    ("Training dates   ", data_train),
    ("Validation dates ", data_val),
    ("Test dates       ", data_test),
    ("Predict       ", data_pred),
):
    print(f"{_label}: {_frame.index.min()} --- {_frame.index.max()}  (n={len(_frame)})")

Training dates   : 2012-01-01 00:00:00 --- 2015-03-01 00:00:00  (n=39)
Validation dates : 2015-04-01 00:00:00 --- 2019-06-01 00:00:00  (n=51)
Test dates       : 2019-07-01 00:00:00 --- 2023-12-01 00:00:00  (n=54)
Predict       : 2024-01-01 00:00:00 --- 2024-12-01 00:00:00  (n=12)


In [10]:
# Overlay the train, validation and test segments of the sales series in a
# single interactive line chart.
plot_train = data_train.ventas.hvplot.line(label='train')
plot_val   = data_val.ventas.hvplot.line(label='val')
plot_test  = data_test.ventas.hvplot.line(label='test')

layout = plot_train * plot_val * plot_test
# The series has month-start ('MS') frequency, so the title says "Monthly"
# (it previously read "Daily Sales").
layout = layout.opts(title='Monthly Sales', ylabel='Sales')
layout = layout.opts(height=300, width=550)
layout

In [11]:
# Create and train forecaster
# ==============================================================================
# Autoregressive forecaster: a Ridge regressor fed with the previous 14 values
# of the series, with the target passed through a StandardScaler.
forecaster = ForecasterAutoreg(
    regressor=Ridge(random_state=123),
    lags=14,
    transformer_y=StandardScaler(),
    forecaster_id='Sales',
)

# Fit on the training partition only.
forecaster.fit(y=data_train['ventas'])
forecaster


ForecasterAutoreg 
Regressor: Ridge(random_state=123) 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14] 
Transformer for y: StandardScaler() 
Transformer for exog: None 
Window size: 14 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('2012-01-01 00:00:00'), Timestamp('2015-03-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-02-07 17:56:39 
Last fit date: 2024-02-07 17:56:39 
Skforecast version: 0.11.0 
Python version: 3.11.5 
Forecaster id: Sales 

In [12]:
# Backtest
# ==============================================================================
# Every observation up to `end_validation` forms the initial training window;
# the remaining observations are predicted in folds of 7 steps, without
# refitting the model between folds. The metric is the mean absolute error.
metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=data['ventas'],
    initial_train_size=len(data.loc[:end_validation]),
    steps=7,
    refit=False,
    fixed_train_size=False,
    metric='mean_absolute_error',
    verbose=True,
    show_progress=False,
)

print(f'Backtest error: {metric}')
predictions.head(5)

Information of backtesting process
----------------------------------
Number of observations used for initial training: 90
Number of observations used for backtesting: 54
    Number of folds: 8
    Number of steps per fold: 7
    Number of steps to exclude from the end of each train set before test (gap): 0
    Last fold only includes 5 observations.

Fold: 0
    Training:   2012-01-01 00:00:00 -- 2019-06-01 00:00:00  (n=90)
    Validation: 2019-07-01 00:00:00 -- 2020-01-01 00:00:00  (n=7)
Fold: 1
    Training:   2012-01-01 00:00:00 -- 2019-06-01 00:00:00  (n=90)
    Validation: 2020-02-01 00:00:00 -- 2020-08-01 00:00:00  (n=7)
Fold: 2
    Training:   2012-01-01 00:00:00 -- 2019-06-01 00:00:00  (n=90)
    Validation: 2020-09-01 00:00:00 -- 2021-03-01 00:00:00  (n=7)
Fold: 3
    Training:   2012-01-01 00:00:00 -- 2019-06-01 00:00:00  (n=90)
    Validation: 2021-04-01 00:00:00 -- 2021-10-01 00:00:00  (n=7)
Fold: 4
    Training:   2012-01-01 00:00:00 -- 2019-06-01 00:00:00  (n=90)
    Val

Unnamed: 0,pred
2019-07-01,687.892697
2019-08-01,517.632916
2019-09-01,466.844645
2019-10-01,658.800855
2019-11-01,265.251942


In [13]:
# Compare the backtest predictions with the observed sales of the test partition.
plot_test = data_test.ventas.hvplot.line(label='test')
plot_predict = predictions.hvplot.line(label='prediction')
layout = plot_test * plot_predict
layout = layout.opts(
             title = 'Predictions vs real values',
             # The plotted quantity is `ventas` (sales); the axis previously read
             # 'users'. 'Sales' matches the label used in the partition plot.
             ylabel = 'Sales',
             legend_position = 'bottom_left'
         )
layout = layout.opts(height=300, width=550)
layout

In [15]:
# Save the backtest predictions, including their date index, to an Excel workbook.
predictions.to_excel(excel_writer='proyecciones_AR_v1.xlsx', index=True)

# Preview the first rows of what was exported.
predictions.head()

Unnamed: 0,pred
2019-07-01,687.892697
2019-08-01,517.632916
2019-09-01,466.844645
2019-10-01,658.800855
2019-11-01,265.251942
