In [8]:
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import (
    TimeSeriesFold,
    backtesting_forecaster_multiseries,
)

from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor

from src.data import load_training_data, make_exog_features, split_data

In [60]:
data = load_training_data()
data = make_exog_features(data)
data.head()

Unnamed: 0_level_0,ba_AECI,ba_AVA,ba_AZPS,ba_BANC,ba_BPAT,ba_CHPD,ba_CISO,ba_CPLE,ba_CPLW,ba_DOPD,...,ba_TPWR,ba_TVA,ba_WACM,ba_WALC,ba_WAUW,exog_is_holiday,exog_month,exog_day_of_week,exog_is_weekend,exog_season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-01,51628.0,27070.0,96193.0,46398.0,123905.0,3745.0,592567.0,123640.0,11797.0,4617.0,...,10804.0,348678.0,80299.0,24633.0,1820.0,0,10,5,1,4
2022-10-02,53127.0,28039.0,97208.0,42814.0,125317.0,3655.0,560074.0,124418.0,11759.0,4583.0,...,10773.0,345900.0,79702.0,26100.0,1749.0,0,10,6,1,4
2022-10-03,54708.0,30110.0,96570.0,47041.0,133353.0,3790.0,623658.0,132803.0,12155.0,4732.0,...,11228.0,373596.0,80536.0,25310.0,1915.0,0,10,0,0,4
2022-10-04,53345.0,30764.0,88963.0,48332.0,134664.0,3831.0,654561.0,134430.0,12294.0,4829.0,...,10927.0,378071.0,80522.0,23938.0,1838.0,0,10,1,0,4
2022-10-05,53356.0,30421.0,91984.0,50362.0,135464.0,3815.0,664304.0,136348.0,12482.0,4854.0,...,10962.0,376763.0,81362.0,23839.0,1828.0,0,10,2,0,4


In [None]:
# Encoding exog features as categorical for training
data = data.astype({col: "category" for col in data.filter(like="exog_").columns})

In [65]:
data_train, data_test = split_data(data)

data_train.shape=(396, 58)
data_test.shape=(366, 58)
Train dates : 2022-10-01 00:00:00 --- 2023-10-31 00:00:00   (n=396)
Test dates  : 2023-11-01 00:00:00 --- 2024-10-31 00:00:00   (n=366)


In [66]:
# Setting up forecaster
forecaster = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1, categorical_feaure=data_train.filter(like='exog_').columns.tolist()),
    lags=[1, 7, 30, 182, 365],  # lags for 1 d, 1 w, 1m, 6 m, 12 m
    window_features=RollingFeatures(
        stats=["mean", "mean", "mean", "mean", "std", "std", "std", "std"],
        window_sizes=[7, 30, 182, 365, 7, 30, 182, 365],
    ),  # Rolling means and stds for 1w, 1m, 6m, 12m
    encoding="ordinal",
    transformer_series=StandardScaler(),  # Transforms each target series using standard scaler. Tranformations are applied under the hood when predicting and the prediction itself is returned on the original scale.
    transformer_exog=StandardScaler(),
)

In [67]:
# Setting up validation
cv = TimeSeriesFold(
    steps=1,  # We'd like to forecast a week in the future.
    initial_train_size=len(data_train),
    refit=True,
    fixed_train_size=False,
    allow_incomplete_fold=True,
)

In [68]:
metrics, backtest_predictions = (
    backtesting_forecaster_multiseries(
        forecaster=forecaster,
        series=data.filter(like="ba_"),
        exog=data.filter(like="exog_"),
        cv=cv,
        levels=None,
        metric="mean_absolute_error",
        add_aggregated_metric=True,
        n_jobs="auto",
        verbose=False,
        show_progress=True,
        suppress_warnings=False,
    )
)

 


  0%|          | 0/366 [00:00<?, ?it/s]

In [69]:
metrics[metrics['levels'] == 'average']

Unnamed: 0,levels,mean_absolute_error
53,average,7755.546317


This is better than the naive baseline we established. 

Now do a quick check of feature importances.

In [70]:
forecaster.fit(
    series=data.filter(like="ba_"),
    exog=data.filter(like="exog_"),
)

forecaster.get_feature_importances()

Unnamed: 0,feature,importance
0,lag_1,398
16,exog_day_of_week,294
9,roll_std_7,216
1,lag_7,202
10,roll_std_30,190
2,lag_30,178
3,lag_182,177
15,exog_month,174
4,lag_365,167
7,roll_mean_182,165


It seems as though is_weekend is redundant. Let's remove this and see if the score improves. 

In [71]:
data_reduced = data.copy()
data_reduced = data_reduced.drop(
    columns=["exog_is_weekend"],
)

data_train_reduced, data_test_reduced = split_data(data_reduced)

data_train.shape=(396, 57)
data_test.shape=(366, 57)
Train dates : 2022-10-01 00:00:00 --- 2023-10-31 00:00:00   (n=396)
Test dates  : 2023-11-01 00:00:00 --- 2024-10-31 00:00:00   (n=366)


In [72]:
forecaster_reduced = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1, categorical_feaure=data_train_reduced.filter(like='exog_').columns.tolist()),
    lags=[1, 7, 30, 182, 365],  # lags for 1 d, 1 w, 1m, 6 m, 12 m
    window_features=RollingFeatures(
        stats=["mean", "mean", "mean", "mean", "std", "std", "std", "std"],
        window_sizes=[7, 30, 182, 365, 7, 30, 182, 365],
    ),  # Rolling means and stds for 1w, 1m, 6m, 12m
    encoding="ordinal",
    transformer_series=StandardScaler(),  # Transforms each target series using standard scaler. Tranformations are applied under the hood when predicting and the prediction itself is returned on the original scale.
    transformer_exog=StandardScaler(),
)

metrics_reduced, backtest_predictions_reduced = backtesting_forecaster_multiseries(
    forecaster=forecaster_reduced,
    series=data_reduced.filter(like="ba_"),
    exog=data_reduced.filter(like="exog_"),
    cv=cv,
    levels=None,
    metric="mean_absolute_error",
    add_aggregated_metric=True,
    n_jobs="auto",
    verbose=False,
    show_progress=True,
    suppress_warnings=False,
)

 


  0%|          | 0/366 [00:00<?, ?it/s]

In [73]:
metrics_reduced[metrics_reduced["levels"] == "average"]

Unnamed: 0,levels,mean_absolute_error
53,average,7760.946846


In [76]:
forecaster_reduced.fit(
    series=data_reduced.filter(like="ba_"),
    exog=data_reduced.filter(like="exog_"),
)

forecaster_reduced.get_feature_importances()

Unnamed: 0,feature,importance
0,lag_1,398
16,exog_day_of_week,294
9,roll_std_7,216
1,lag_7,202
10,roll_std_30,190
2,lag_30,178
3,lag_182,177
15,exog_month,174
4,lag_365,167
7,roll_mean_182,165


The score actually got slightly worse, so there may be some interaction effects with the other features. We will in this case retain the is_weekend feature.