In [37]:
import pandas as pd

from src.model import forwardfill_missing_values
from src.paths import TRANSFORMED_DATA_DIR
from src.config import BAS
from src.model import FeatureEngineerByBA

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

from feature_engine.timeseries.forecasting import (
    LagFeatures,
    WindowFeatures,
)


In [20]:
# Load the tabular demand series and prepare it in a single chain:
# read -> keep the configured ba_code values -> forward-fill gaps.
data = (
    pd.read_csv(
        TRANSFORMED_DATA_DIR / "ts_tabular_2020_12_to_2025_2.csv",
        parse_dates=["datetime"],
    )
    # Keep only the rows whose `ba_code` appears in BAS.
    .loc[lambda frame: frame["ba_code"].isin(BAS)]
    # Forward-fill NaNs with the project helper.
    .pipe(forwardfill_missing_values)
)

data.head()

Unnamed: 0,datetime,demand,ba_code
0,2020-12-01,75397.0,AECI
1,2020-12-02,71967.0,AECI
2,2020-12-03,69654.0,AECI
3,2020-12-04,67352.0,AECI
4,2020-12-05,65905.0,AECI


### Build benchmark models

Build several benchmark models to compare model performance against. 

- Naive (lag 1): the previous day's demand as the forecast.
- Lag 7: demand from 7 days earlier (the same weekday one week ago) as the forecast.
- Rolling mean 7: the 7-day rolling mean of demand as the forecast.
- Lag 30: demand from 30 days earlier (roughly one month ago) as the forecast.
- Rolling mean 30: the 30-day rolling mean of demand as the forecast.
- Lag 365: demand from 365 days earlier (roughly one year ago) as the forecast.
- Rolling mean 365: the 365-day rolling mean of demand as the forecast.

In [44]:
# Evaluation cutoff: only rows strictly after this timestamp are kept below.
train_end = pd.Timestamp("2024-01-01")
lags = [1, 7, 30, 365]
windows = [7, 30, 365]

# Work on a copy so the loaded frame `data` is not touched by this cell.
demand = data.copy()

# Lagged copies of `demand`, one per entry in `lags`; the original column stays.
# NOTE(review): FeatureEngineerByBA presumably applies the wrapped transformer
# separately per `ba_code` — confirm in src.model.
lag_transformer = FeatureEngineerByBA(
    LagFeatures(variables=["demand"], periods=lags, drop_original=False)
)

# Mean of `demand` for each window length in `windows`.
window_transformer = FeatureEngineerByBA(
    WindowFeatures(
        variables=["demand"],
        window=windows,
        freq=None,
        functions=["mean"],
        missing_values="ignore",
    )
)

pipe = Pipeline(
    steps=[
        ("lags", lag_transformer),
        ("windf", window_transformer),
    ]
)

# Build every benchmark column on the full history, then keep only the
# rows dated after the cutoff.
demand_ = pipe.fit_transform(demand).loc[
    lambda frame: frame["datetime"] > train_end
]

demand_.head()

Unnamed: 0,datetime,demand,ba_code,demand_lag_1,demand_lag_7,demand_lag_30,demand_lag_365,demand_window_7_mean,demand_window_30_mean,demand_window_365_mean
1127,2024-01-02,81919.0,AECI,78611.0,73005.0,64811.0,58056.0,76881.142857,68614.866667,65072.594521
1128,2024-01-03,78631.0,AECI,81919.0,75775.0,66383.0,58551.0,78154.571429,69185.133333,65137.972603
1129,2024-01-04,78007.0,AECI,78631.0,77685.0,69418.0,73122.0,78562.571429,69593.4,65192.986301
1130,2024-01-05,80639.0,AECI,78007.0,79943.0,72026.0,77401.0,78608.571429,69879.7,65206.369863
1131,2024-01-06,77412.0,AECI,80639.0,75292.0,63252.0,75265.0,78708.0,70166.8,65215.241096


In [45]:
# Every benchmark forecast column produced by the feature pipeline carries the
# "demand_" prefix. Select them by name instead of by position (`columns[3:]`)
# so the cell keeps scoring the right columns if the column order changes.
prefix = "demand_"
benchmark_cols = [col for col in demand_.columns if col.startswith(prefix)]

# MAE of each benchmark column against the actual demand, computed over all
# rows of `demand_` at once (i.e. pooled across every ba_code in the frame).
maes = {
    col: mean_absolute_error(demand_["demand"], demand_[col])
    for col in benchmark_cols
}

# One-row summary table; only the leading prefix is stripped from the labels.
pd.DataFrame({col[len(prefix):]: [mae] for col, mae in maes.items()})

Unnamed: 0,lag_1,lag_7,lag_30,lag_365,window_7_mean,window_30_mean,window_365_mean
0,9323.132523,19273.106322,26481.084637,20414.824063,14191.049961,17487.92177,26331.820495


The best benchmark is the naive forecast (lag 1), which uses the previous day's demand as the prediction for the current day; it achieves the lowest MAE, about 9,323.