In [97]:
"""
TODO: This notebook currently uses every BA in the dataset; it should keep
    only the BAs listed in BAS in src/config.py.
"""

from lightgbm import LGBMRegressor
import pandas as pd

from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import (
    LagFeatures,
    WindowFeatures,
)

from feature_engine.encoding import OrdinalEncoder

from sklearn.pipeline import Pipeline

from src.data import (
    make_exog_features,
    split_data,
    prepare_feature_store_data_for_training,
)

from src.model import forwardfill_missing_values
from src.paths import TRANSFORMED_DATA_DIR

# Load the transformed demand table from TRANSFORMED_DATA_DIR, parsing the
# "datetime" column into datetime64 values at read time.
data = pd.read_csv(
    TRANSFORMED_DATA_DIR / "ts_tabular_2022_10_to_2025_2.csv",
    parse_dates=["datetime"])
# Preview the first rows (columns: datetime, demand, ba_code).
data.head()

Unnamed: 0,datetime,demand,ba_code
0,2022-10-01,51628,AECI
1,2022-10-02,53127,AECI
2,2022-10-03,54708,AECI
3,2022-10-04,53345,AECI
4,2022-10-05,53356,AECI


In [33]:
# Work on a copy so the raw `data` frame stays untouched; later cells start
# again from `data.copy()`.
demand = data.copy()
# Inspect the column dtypes before building features.
demand.dtypes

datetime    datetime64[ns]
demand               int64
ba_code             object
dtype: object

In [34]:
# Transformer that derives calendar features from the "datetime" column.
# The same `dtf` object is reused later as the first step of the pipeline.
dtf = DatetimeFeatures(
    # the datetime variable
    variables="datetime",
    # the features we want to create
    features_to_extract=[
        "month",
        "week",
        "day_of_week",
        "day_of_month",
        "weekend",
    ],
    # keep the original "datetime" column next to the derived features
    drop_original=False
)

# Adds one "datetime_<feature>" column per requested feature and rebinds
# `demand` to the transformed frame.
demand = dtf.fit_transform(demand)
demand.head()

Unnamed: 0,datetime,demand,ba_code,datetime_month,datetime_week,datetime_day_of_week,datetime_day_of_month,datetime_weekend
0,2022-10-01,51628,AECI,10,39,5,1,1
1,2022-10-02,53127,AECI,10,39,6,2,1
2,2022-10-03,54708,AECI,10,40,0,3,0
3,2022-10-04,53345,AECI,10,40,1,4,0
4,2022-10-05,53356,AECI,10,40,2,5,0


In [None]:
from sklearn.base import TransformerMixin

def wrapper(df: pd.DataFrame, transformer: "TransformerMixin") -> pd.DataFrame:
    """Apply ``transformer.fit_transform`` separately to each BA's rows.

    The frame is split on the ``ba_code`` column, each subset is passed
    through ``transformer.fit_transform`` on its own (so row-wise operations
    such as lags or rolling windows never cross from one BA into the next),
    and the transformed subsets are stacked back together.

    Args:
        df: Input frame; must contain a ``ba_code`` column.
        transformer: Any object exposing ``fit_transform(DataFrame)`` that
            returns a DataFrame. The same object is refit for every BA, so
            after the call it holds the fitted state of the last BA only.

    Returns:
        The transformed subsets concatenated in order of first appearance of
        each ``ba_code``. Index labels returned by the transformer are kept
        (the index is not reset). An empty ``DataFrame`` is returned when
        ``df`` has no rows.
    """
    # Collect the per-BA results and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated frame every time.
    transformed_parts = []
    for ba_code in df["ba_code"].unique():
        ba_rows = df.loc[df["ba_code"] == ba_code, :].copy()
        ba_rows = transformer.fit_transform(ba_rows)
        # Set the BA code on the transformed subset so the column is present
        # on the output whatever the transformer returned.
        ba_rows["ba_code"] = ba_code
        transformed_parts.append(ba_rows)

    # pd.concat raises ValueError on an empty list, so keep returning an
    # empty frame for empty input.
    if not transformed_parts:
        return pd.DataFrame()

    return pd.concat(transformed_parts)

In [87]:
def get_lag_features(
    df: pd.DataFrame,
    lags: list[int] = [1, 2, 3],
    variables: list[str] = ["demand"],
) -> pd.DataFrame:
    """Add lagged copies of ``variables`` to ``df``, computed per BA.

    Args:
        df: Input frame; passed to ``wrapper``, which splits it on ``ba_code``.
        lags: Lag periods handed to ``LagFeatures`` as ``periods``.
        variables: Columns to lag.

    Returns:
        The frame produced by ``wrapper`` with the lag columns added; the
        original ``variables`` columns are kept (``drop_original=False``).
    """
    # Build the lag transformer and let `wrapper` fit/apply it BA by BA so a
    # lag never reaches back into a different BA's rows.
    lag_transformer = LagFeatures(variables=variables, periods=lags, drop_original=False)
    return wrapper(df, transformer=lag_transformer)


def get_window_features(
    df: pd.DataFrame,
    window: list[int] = [3, 5, 7],
    freq: str = None,
    functions: list[str] = ["mean"],
    variables: list[str] = ["demand"],
) -> pd.DataFrame:
    """Add rolling-window aggregates of ``variables`` to ``df``, computed per BA.

    Args:
        df: Input frame; passed to ``wrapper``, which splits it on ``ba_code``.
        window: Window sizes forwarded to ``WindowFeatures``.
        freq: Forwarded unchanged to ``WindowFeatures`` (``None`` by default).
        functions: Aggregations to compute over each window.
        variables: Columns to aggregate.

    Returns:
        The frame produced by ``wrapper`` with the window columns added.
    """
    # Gather the transformer settings in one place; missing values are
    # ignored rather than raising.
    window_settings = {
        "variables": variables,
        "window": window,
        "freq": freq,
        "functions": functions,
        "missing_values": "ignore",
    }
    window_transformer = WindowFeatures(**window_settings)

    # `wrapper` fits/applies the transformer BA by BA so a window never spans
    # two different BAs.
    return wrapper(df, transformer=window_transformer)


In [83]:
# Try the lag helper on a copy of `demand` so the working frame is not modified.
df_ = demand.copy()

# Adds demand_lag_1 / demand_lag_2 / demand_lag_3, computed within each BA.
df_ = get_lag_features(df_, lags=[1,2,3])

# Show both ends of the frame: the first rows of the first BA (where the lags
# are still NaN) and the last rows of the last BA.
display(df_.head())
display(df_.tail())

Unnamed: 0,datetime,demand,ba_code,datetime_month,datetime_week,datetime_day_of_week,datetime_day_of_month,datetime_weekend,demand_lag_1,demand_lag_2,demand_lag_3
0,2022-10-01,51628,AECI,10,39,5,1,1,,,
1,2022-10-02,53127,AECI,10,39,6,2,1,51628.0,,
2,2022-10-03,54708,AECI,10,40,0,3,0,53127.0,51628.0,
3,2022-10-04,53345,AECI,10,40,1,4,0,54708.0,53127.0,51628.0
4,2022-10-05,53356,AECI,10,40,2,5,0,53345.0,54708.0,53127.0


Unnamed: 0,datetime,demand,ba_code,datetime_month,datetime_week,datetime_day_of_week,datetime_day_of_month,datetime_weekend,demand_lag_1,demand_lag_2,demand_lag_3
58955,2025-02-22,2342,WAUW,2,8,5,22,1,2589.0,3026.0,3287.0
58956,2025-02-23,2161,WAUW,2,8,6,23,1,2342.0,2589.0,3026.0
58957,2025-02-24,2088,WAUW,2,9,0,24,0,2161.0,2342.0,2589.0
58958,2025-02-25,2064,WAUW,2,9,1,25,0,2088.0,2161.0,2342.0
58959,2025-02-26,2080,WAUW,2,9,2,26,0,2064.0,2088.0,2161.0


In [88]:
# Try the window helper on a fresh copy of `demand`.
# NOTE(review): the saved output below also shows demand_lag_* columns, which
# suggests `demand` already carried lag features when this cell was last run;
# confirm on a fresh kernel (Restart & Run All).
df_ = demand.copy()

# Rolling mean / median / std of demand over windows of 3, 5 and 7 rows,
# computed within each BA.
df_ = get_window_features(df_, window=[3, 5, 7], functions=["mean", "median", "std"])

df_.head(10)

Unnamed: 0,datetime,demand,ba_code,datetime_month,datetime_week,datetime_day_of_week,datetime_day_of_month,datetime_weekend,demand_lag_1,demand_lag_2,demand_lag_3,demand_window_3_mean,demand_window_3_median,demand_window_3_std,demand_window_5_mean,demand_window_5_median,demand_window_5_std,demand_window_7_mean,demand_window_7_median,demand_window_7_std
0,2022-10-01,51628,AECI,10,39,5,1,1,,,,,,,,,,,,
1,2022-10-02,53127,AECI,10,39,6,2,1,51628.0,,,,,,,,,,,
2,2022-10-03,54708,AECI,10,40,0,3,0,53127.0,51628.0,,,,,,,,,,
3,2022-10-04,53345,AECI,10,40,1,4,0,54708.0,53127.0,51628.0,53154.333333,53127.0,1540.181916,,,,,,
4,2022-10-05,53356,AECI,10,40,2,5,0,53345.0,54708.0,53127.0,53726.666667,53345.0,856.821063,,,,,,
5,2022-10-06,54014,AECI,10,40,3,6,0,53356.0,53345.0,54708.0,53803.0,53356.0,783.772288,53232.8,53345.0,1094.36726,,,
6,2022-10-07,50246,AECI,10,40,4,7,0,54014.0,53356.0,53345.0,53571.666667,53356.0,383.111385,53710.0,53356.0,649.393948,,,
7,2022-10-08,50391,AECI,10,40,5,8,1,50246.0,54014.0,53356.0,52538.666667,53356.0,2012.580764,53133.8,53356.0,1709.256037,52917.714286,53345.0,1507.030935
8,2022-10-09,51900,AECI,10,40,6,9,1,50391.0,50246.0,54014.0,51550.333333,50391.0,2134.829345,52270.4,53345.0,1803.037243,52741.0,53345.0,1738.254872
9,2022-10-10,52578,AECI,10,41,0,10,0,51900.0,50391.0,50246.0,50845.666667,50391.0,915.953238,51981.4,51900.0,1700.632177,52565.714286,53345.0,1754.631463


In [57]:
# Manual check of the 5-row window mean: the average of the first five AECI
# demand values (2022-10-01 .. 2022-10-05) equals demand_window_5_mean on the
# 2022-10-06 row above, i.e. the window covers the five preceding rows and
# excludes the current one.
(51628 + 53127 + 54708 + 53345 + 53356) / 5

53232.8

In [None]:
from sklearn.preprocessing import FunctionTransformer


# Wrap the per-BA helpers so they can be used as pipeline steps; they are
# called with their default arguments (lags 1-3, window means over 3/5/7 rows).
lagf = FunctionTransformer(get_lag_features, validate=False)
windf = FunctionTransformer(get_window_features, validate=False)
# Lags and windows introduce missing data at the start of each BA's series,
# so the rows containing these NaNs need to be dropped
drop_missing = DropMissingData()

# Ordinal encoding for BA feature
ordinal_enc = OrdinalEncoder(variables=["ba_code"], encoding_method="arbitrary")

# Also drop the target from the training set
drop_target = DropFeatures(features_to_drop=["demand"])

In [107]:
# Feature-engineering pipeline assembled from the transformers defined in the
# cells above. Steps run in the order listed: datetime features, lags and
# window aggregates first, then rows with NaNs are dropped, "ba_code" is
# ordinal-encoded and the "demand" target column is removed.
pipe = Pipeline(
    [
        ("datetime", dtf),
        ("lags", lagf),
        ("windf", windf),
        ("drop_missing", drop_missing),
        ("ordinal_enc", ordinal_enc),
        ("drop_target", drop_target),
    ]
)

In [108]:
# Start again from the raw frame so the pipeline input does not depend on
# whatever earlier cells did to `demand`.
demand = data.copy()

# Fit and apply the full feature pipeline; `demand` now holds the feature
# matrix (the "demand" target column itself is dropped by the last step).
demand = pipe.fit_transform(demand)

demand.head()

Unnamed: 0,datetime,ba_code,datetime_month,datetime_week,datetime_day_of_week,datetime_day_of_month,datetime_weekend,demand_lag_1,demand_lag_2,demand_lag_3,demand_window_3_mean,demand_window_5_mean,demand_window_7_mean
7,2022-10-08,0,10,40,5,8,1,50246.0,54014.0,53356.0,52538.666667,53133.8,52917.714286
8,2022-10-09,0,10,40,6,9,1,50391.0,50246.0,54014.0,51550.333333,52270.4,52741.0
9,2022-10-10,0,10,41,0,10,0,51900.0,50391.0,50246.0,50845.666667,51981.4,52565.714286
10,2022-10-11,0,10,41,1,11,0,52578.0,51900.0,50391.0,51623.0,51825.8,52261.428571
11,2022-10-12,0,10,41,2,12,0,53944.0,52578.0,51900.0,52807.333333,51811.8,52347.0
