In [1]:
import pandas as pd

from typing import Optional

from src.model import forwardfill_missing_values
from src.paths import TRANSFORMED_DATA_DIR
from src.config import BAS
from src.model import FeatureEngineerByBA
from src.data import split_data

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin

from feature_engine.timeseries.forecasting import (
    LagFeatures,
    WindowFeatures,
)
from feature_engine.imputation import DropMissingData


In [2]:
data = pd.read_csv(
    TRANSFORMED_DATA_DIR / "ts_tabular_2020_12_to_2025_2.csv", parse_dates=["datetime"]
)

# Clean up data
data = data[data["ba_code"].isin(BAS)]
# Forward filling NaNs
data = forwardfill_missing_values(data)

data.head()

Unnamed: 0,datetime,demand,ba_code
0,2020-12-01,75397.0,AECI
1,2020-12-02,71967.0,AECI
2,2020-12-03,69654.0,AECI
3,2020-12-04,67352.0,AECI
4,2020-12-05,65905.0,AECI


### Build benchmark models

Build several benchmark models to compare model performance against. 

- Naive: previous day's demand as forecast. 
- Lag 7: same day a week ago as forecast. 
- Rolling mean 7: rolling weekly average as forecast. 
- Lag 30: use same day one month ago as forecast. 
- Rolling mean 30: rolling monthly average as forecast. 
- Lag 365: use same day one year ago as forecast. 
- Rolling mean 365: rolling annual average as forecast. 

In [None]:
demand = data.copy()

train_end = "2024-01-01"
lags = [1]
windows = [7, 30, 365]

lag_transformer = FeatureEngineerByBA(
    LagFeatures(
        variables=["demand"],
        periods=lags,
        drop_original=False,
    )
)

window_transformer = FeatureEngineerByBA(
    WindowFeatures(
        variables=["demand"],
        window=windows,
        freq=None,
        functions=["mean"],
        missing_values="ignore",
    )
)

pipe = Pipeline(
    [
        ("lags", lag_transformer), 
        # ("windf", window_transformer),
        ("drop", DropMissingData())
    ]
)

demand_ = pipe.fit_transform(demand)
train_end = pd.Timestamp(train_end)

demand_ = demand_.loc[demand_["datetime"] > train_end]

display(demand_.head())
display(demand_.tail())

Unnamed: 0,datetime,demand,ba_code,demand_lag_1
1127,2024-01-02,81919.0,AECI,78611.0
1128,2024-01-03,78631.0,AECI,81919.0
1129,2024-01-04,78007.0,AECI,78631.0
1130,2024-01-05,80639.0,AECI,78007.0
1131,2024-01-06,77412.0,AECI,80639.0


Unnamed: 0,datetime,demand,ba_code,demand_lag_1
103778,2025-02-22,2342.0,WAUW,2589.0
103779,2025-02-23,2161.0,WAUW,2342.0
103780,2025-02-24,2088.0,WAUW,2161.0
103781,2025-02-25,2064.0,WAUW,2088.0
103782,2025-02-26,2080.0,WAUW,2064.0


In [4]:
maes = {}
for col in demand_.columns[3:]:
    mae = mean_absolute_error(demand_["demand"], demand_[col])
    maes[col] = mae

pd.DataFrame({k.replace("demand_", ""): [v] for k, v in maes.items()})

Unnamed: 0,lag_1,lag_7,lag_30,lag_365,window_7_mean,window_30_mean,window_365_mean
0,9323.132523,19273.106322,26481.084637,20414.824063,14191.049961,17487.92177,26331.820495


Best score came from using the previous day's demand as the prediction for the next, with an MAE of 9323.

In [58]:
train_end = "2024-01-01"

class BenchmarkLagOneDay:

    def __init__(self, X: pd.DataFrame, train_end: str):
        self.transformer = FeatureEngineerByBA(
            LagFeatures(
                variables=["demand"],
                periods=[1],
                drop_original=False,
            )
        )
        self.X = X.copy()
        self.train_end = pd.Timestamp(train_end)

    def fit(self) -> "BenchmarkLagOneDay":
        pipe = Pipeline(
            [
                ("transformer", self.transformer),
                ("drop", DropMissingData()),
            ]
        )

        self.X = pipe.fit_transform(self.X)
        self.X = self.X.loc[self.X["datetime"] > self.train_end]

        return self

    def predict(self) -> pd.Series:
        return self.X["demand_lag_1"]

In [None]:
demand = data.copy()

train_end = "2024-01-01"

# To obtain same test set as for a model transformed by a set of steps in a pipeline, 
# need to fix the index of this test set to match that of the transformed test set, y_test_t
y_test = demand.loc[demand["datetime"] > train_end]["demand"]

model = BenchmarkLagOneDay(demand, train_end)
model.fit()
preds = model.predict()

mae = mean_absolute_error(y_test, preds)
mae

9323.132522578915