In [1]:
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import (
    TimeSeriesFold,
    bayesian_search_forecaster_multiseries,
    backtesting_forecaster_multiseries,
)

from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor

from src.data import load_training_data, make_exog_features, split_data

In [2]:
data = load_training_data()
data = make_exog_features(data)
# Encoding exog features as categorical for training
data = data.astype({col: "category" for col in data.filter(like="exog_").columns})
data_train, data_test = split_data(data)

data_train.shape=(396, 58)
data_test.shape=(366, 58)
Train dates : 2022-10-01 00:00:00 --- 2023-10-31 00:00:00   (n=396)
Test dates  : 2023-11-01 00:00:00 --- 2024-10-31 00:00:00   (n=366)


In [7]:
# Setting up forecaster
forecaster = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(
        random_state=123,
        verbose=-1,
        categorical_feaure=data_train.filter(like="exog_").columns.tolist(),
    ),
    lags = 7, # Placeholder, the value will be overwritten
    window_features=RollingFeatures(
        stats=["mean", "mean", "mean", "mean", "std", "std", "std", "std"],
        window_sizes=[7, 30, 182, 365, 7, 30, 182, 365],
    ),  # Rolling means and stds for 1w, 1m, 6m, 12m
    encoding="ordinal",
    transformer_series=StandardScaler(),  # Transforms each target series using standard scaler. Tranformations are applied under the hood when predicting and the prediction itself is returned on the original scale.
    transformer_exog=StandardScaler(),
)


# Search space
def search_space(trial):
    return {
        "lags": trial.suggest_categorical("lags", [1, 7, 30, 182]),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 1.0
        ), 
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

cv = TimeSeriesFold(
    steps=1,
    initial_train_size=len(data_train),
    refit=False,
    fixed_train_size=True,
    allow_incomplete_fold=True,
)

results, best_trial = bayesian_search_forecaster_multiseries(
    forecaster=forecaster,
    series=data.filter(like="ba_"),
    exog=data.filter(like="exog_"),
    search_space=search_space,
    cv=cv,
    metric="mean_absolute_error",
    aggregate_metric="average",
    n_trials=200, 
    random_state=123,
    n_jobs="auto",
    verbose=False,
    show_progress=True,
)

  0%|          | 0/200 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182] 
  Parameters: {'num_leaves': 37, 'feature_fraction': 0.9800975145540853, 'bagging_fraction': 0.6868697806837848, 'min_child_samples': 6}
  Backtesting metric: 115

In [8]:
best_params = best_trial.params

best_lags = best_params.pop("lags")

tuned_forecaster = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(
        random_state=123,
        verbose=-1,
        categorical_feaure=data_train.filter(like="exog_").columns.tolist(),
        **best_params,
    ),
    lags=best_lags,
    window_features=RollingFeatures(
        stats=["mean", "mean", "mean", "mean", "std", "std", "std", "std"],
        window_sizes=[7, 30, 182, 365, 7, 30, 182, 365],
    ),
    encoding="ordinal",
    transformer_series=StandardScaler(),
    transformer_exog=StandardScaler(),
)

In [9]:
cv = TimeSeriesFold(
    steps=1,
    initial_train_size=len(data_train),
    refit=True,
    fixed_train_size=True,
    allow_incomplete_fold=True,
)

metrics, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster=tuned_forecaster,
    series=data.filter(like="ba_"),
    exog=data.filter(like="exog_"),
    cv=cv,
    levels=None,
    metric="mean_absolute_error",
    add_aggregated_metric=True,
    n_jobs="auto",
    verbose=False,
    show_progress=True,
    suppress_warnings=False,
)

 


  0%|          | 0/366 [00:00<?, ?it/s]

In [10]:
metrics[metrics["levels"] == "average"]

Unnamed: 0,levels,mean_absolute_error
53,average,7877.053637
