In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

In [2]:
#pip install skforecast

In [3]:
# Cargar datos históricos de ventas (reemplaza 'tu_archivo.csv' con tu archivo CSV)
datos_ventas = pd.read_csv('desafio_modelo_de_demanda_2.csv')

datos_ventas.head(10)

Unnamed: 0,fecha,prod_id,precio_promedio,ventas,modelo_actual
0,2024-01-01,998,0.0,0.0,0.0
1,2024-02-01,998,0.0,0.0,0.0
2,2024-03-01,998,0.0,0.0,0.0
3,2024-04-01,998,0.0,0.0,0.0
4,2024-05-01,998,0.0,0.0,0.0
5,2024-06-01,998,0.0,0.0,0.0
6,2024-07-01,998,0.0,0.0,0.0
7,2024-08-01,998,0.0,0.0,0.0
8,2024-09-01,998,0.0,0.0,0.0
9,2024-10-01,998,0.0,0.0,0.0


In [4]:
datos_ventas['fecha'] = pd.to_datetime(datos_ventas['fecha'])
datos_ventas = datos_ventas.loc[datos_ventas['prod_id'] == 998]
datos_ventas = datos_ventas.sort_values('fecha')
datos_ventas = datos_ventas.reset_index(drop=True)
#filtro fecha 
#datos_ventas = datos_ventas[datos_ventas['fecha'].dt.strftime('%Y') == '2012']
data = datos_ventas
data.head(15)

Unnamed: 0,fecha,prod_id,precio_promedio,ventas,modelo_actual
0,2012-01-01,998,10554.620268,955.0,982.324816
1,2012-02-01,998,11452.637321,1039.0,1055.230971
2,2012-03-01,998,9287.956839,599.0,851.884063
3,2012-04-01,998,9980.515771,605.0,752.608072
4,2012-05-01,998,7975.140638,742.0,773.231454
5,2012-06-01,998,8633.787271,723.0,611.791077
6,2012-07-01,998,8092.077706,646.0,670.525059
7,2012-08-01,998,8998.92844,353.0,653.487315
8,2012-09-01,998,9896.525089,389.0,547.505722
9,2012-10-01,998,10993.702476,357.0,443.789458


In [5]:
# Data preprocessing
# ==============================================================================

data['fecha'] = pd.to_datetime(data['fecha'], format='%d/%m/%y')
data = data.set_index('fecha')
data = data.asfreq('MS')
data = data.sort_index()
data.head(3)

Unnamed: 0_level_0,prod_id,precio_promedio,ventas,modelo_actual
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,998,10554.620268,955.0,982.324816
2012-02-01,998,11452.637321,1039.0,1055.230971
2012-03-01,998,9287.956839,599.0,851.884063


In [None]:
data_pred = pd.DataFrame()
data_pred['ventas'] = 0
data_pred['prod_id'] = 0
data_pred['ventas'] = 0
data_pred['modelo_actual'] = 0
data_pred['fecha'] = pd.date_range(start='2024-01-01', end='2024-12-31', freq='MS')


In [7]:
# Check index is complete or there are missing values
# ==============================================================================
(data.index == pd.date_range(
                    start = data.index.min(),
                    end   = data.index.max(),
                    freq  = data.index.freq)).all()

True

In [8]:
print(f"Missing values: {data.isnull().any(axis=1).sum()}")

Missing values: 0


In [9]:
#valores para entrenamiento y test
end_train = '2019-03-30 23:59:00'
end_validation = '2023-12-01 00:00:00'
#valores para predecir
start_pred = '2024-01-01 00:00:00'
end_pred = '2024-12-31 23:59:00'

data_train = data.loc[: end_train, :]
data_test   = data.loc[end_train:end_validation, :]
data_val  = data.loc[start_pred:end_pred, :]


print(f"Training dates   : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Test dates       : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")
print(f"Validation dates : {data_val.index.min()} --- {data_val.index.max()}  (n={len(data_val)})")


Training dates   : 2012-01-01 00:00:00 --- 2019-03-01 00:00:00  (n=87)
Test dates       : 2019-04-01 00:00:00 --- 2023-12-01 00:00:00  (n=57)
Validation dates : 2024-01-01 00:00:00 --- 2024-12-01 00:00:00  (n=12)


In [10]:
plot_train = data_train.ventas.hvplot.line(label='train')
plot_val   = data_val.ventas.hvplot.line(label='val')
plot_test  = data_test.ventas.hvplot.line(label='test')

layout = plot_train * plot_val * plot_test
layout = layout.opts(title='Daily Sales', ylabel='Sales')
layout = layout.opts(height=300, width=550)
layout

In [11]:
# Create and train forecaster
# ==============================================================================
forecaster = ForecasterAutoreg(
                 regressor     = Ridge(random_state=123),
                 lags          = 21,
                 #lags          = 14,
                 transformer_y = StandardScaler(),
                 forecaster_id = 'Sales'
             )

forecaster.fit(y=data_train.ventas)
forecaster


ForecasterAutoreg 
Regressor: Ridge(random_state=123) 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] 
Transformer for y: StandardScaler() 
Transformer for exog: None 
Window size: 21 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('2012-01-01 00:00:00'), Timestamp('2019-03-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-02-07 11:02:54 
Last fit date: 2024-02-07 11:02:54 
Skforecast version: 0.11.0 
Python version: 3.11.5 
Forecaster id: Sales 

In [12]:
# Backtest
# ==============================================================================
metric, predictions = backtesting_forecaster(
                          forecaster         = forecaster,
                          y                  = data.ventas,
                          initial_train_size = len(data.loc[:end_validation]),
                          steps              = 1,
                          refit              = False,
                          fixed_train_size   = False,
                          metric             = 'mean_absolute_error',
                          verbose            = True,
                          show_progress      = False
                      )

print(f'Backtest error: {metric}')
predictions.head(5)

Information of backtesting process
----------------------------------
Number of observations used for initial training: 144
Number of observations used for backtesting: 12
    Number of folds: 12
    Number of steps per fold: 1
    Number of steps to exclude from the end of each train set before test (gap): 0

Fold: 0
    Training:   2012-01-01 00:00:00 -- 2023-12-01 00:00:00  (n=144)
    Validation: 2024-01-01 00:00:00 -- 2024-01-01 00:00:00  (n=1)
Fold: 1
    Training:   2012-01-01 00:00:00 -- 2023-12-01 00:00:00  (n=144)
    Validation: 2024-02-01 00:00:00 -- 2024-02-01 00:00:00  (n=1)
Fold: 2
    Training:   2012-01-01 00:00:00 -- 2023-12-01 00:00:00  (n=144)
    Validation: 2024-03-01 00:00:00 -- 2024-03-01 00:00:00  (n=1)
Fold: 3
    Training:   2012-01-01 00:00:00 -- 2023-12-01 00:00:00  (n=144)
    Validation: 2024-04-01 00:00:00 -- 2024-04-01 00:00:00  (n=1)
Fold: 4
    Training:   2012-01-01 00:00:00 -- 2023-12-01 00:00:00  (n=144)
    Validation: 2024-05-01 00:00:00 -- 2024-

Unnamed: 0,pred
2024-01-01,1901.656923
2024-02-01,1026.569771
2024-03-01,571.117158
2024-04-01,605.561065
2024-05-01,796.918259


In [13]:
plot_test = data_test.ventas.hvplot.line(label='test')
plot_predict = predictions.hvplot.line(label='prediction')
layout = plot_test * plot_predict
layout = layout.opts(
             title = 'Predictions vs real values',
             ylabel = 'users',
             legend_position = 'bottom_left'
         )
layout = layout.opts(height=300, width=550)
layout

In [14]:
predictions.to_excel('proyecciones_AR.xlsx', index=True)
predictions.head()

Unnamed: 0,pred
2024-01-01,1901.656923
2024-02-01,1026.569771
2024-03-01,571.117158
2024-04-01,605.561065
2024-05-01,796.918259
