# Feature Engineering

In this notebook we will select and create features to feed our ML model. For now, we will focus on the consumption data.
<br>
We will use `MLForecast` to create the features.

In [50]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from pathlib import Path

In [51]:
import sys
sys.path.append("/Users/gabriel/Documents/Git/End-to-end MLOps for Time Series")
from utils import load_config

In [52]:
default_config_path = "../config/development/pipeline.yaml"
config = load_config(default_config_path)

# Feature Engineering function

In [53]:
def feature_engineering(
    df,
    freq,
    id_col,
    time_col,
    target_col,
    forecast_horizon,
    n_lags,
    rolling_mean_window_size,
    n_lag_transforms,
    date_features,
    static_features,
    on_test=False, # TODO: if True, stack last rows of train on top of test data
):
    lags = [i for i in range(forecast_horizon, forecast_horizon + n_lags)]
    lag_transforms = {
        i: [ExpandingMean(), RollingMean(window_size=rolling_mean_window_size)]
        for i in range(forecast_horizon, forecast_horizon + n_lag_transforms)
    }
    fcst = MLForecast(
        models=[],
        freq=freq,
        lags=lags,
        lag_transforms=lag_transforms,
        date_features=date_features,
    )
    df_transformed = fcst.preprocess(
        df,
        id_col=id_col,
        time_col=time_col,
        target_col=target_col,
        static_features=static_features,
    )
    return df_transformed

# Testing the function

In [54]:
df = pd.read_csv("../data/preprocessed/consumption_train.csv", parse_dates=["datetime"])
print(df.shape)
df.head()

(760650, 9)


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,96.59,1,2021-09-01,0,1,0
1,3,0,3,39.241,1,2021-09-01,0,25,12
2,7,1,3,453.023,1,2021-09-01,0,61,30
3,8,0,1,9.787,1,2021-09-01,0,63,31
4,3,0,1,14.964,1,2021-09-01,0,23,11


In [56]:
X = feature_engineering(
    df=df,
    freq=config["freq"],
    id_col=config["id_col"],
    time_col=config["time_col"],
    target_col=config["target_col"],
    forecast_horizon=config["forecast_horizon"],
    n_lags=config["n_lags"],
    rolling_mean_window_size=config["rolling_mean_window_size"],
    n_lag_transforms=config["n_lag_transforms"],
    date_features=config["date_features"],
    static_features=config["static_features"],
    on_test=on_test,
)
X[X["prediction_unit_id"] == 0].head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,lag48,...,rolling_mean_lag48_window_size4,expanding_mean_lag49,rolling_mean_lag49_window_size4,expanding_mean_lag50,rolling_mean_lag50_window_size4,expanding_mean_lag51,rolling_mean_lag51_window_size4,month,dayofweek,hour
3046,0,0,1,109.17,1,2021-09-03 06:00:00,2,6589,0,96.481,...,90.60025,88.6325,89.3785,88.4028,86.356,88.4575,88.4575,9,4,6
3105,0,0,1,108.439,1,2021-09-03 07:00:00,2,6711,0,94.592,...,92.2595,89.753714,90.60025,88.6325,89.3785,88.4028,86.356,9,4,7
3187,0,0,1,95.669,1,2021-09-03 08:00:00,2,6833,0,77.308,...,89.5405,90.3585,92.2595,89.753714,90.60025,88.6325,89.3785,9,4,8
3247,0,0,1,73.07,1,2021-09-03 09:00:00,2,6955,0,54.211,...,80.648,88.908444,89.5405,90.3585,92.2595,89.753714,90.60025,9,4,9
3284,0,0,1,50.981,1,2021-09-03 10:00:00,2,7077,0,36.071,...,65.5455,85.4387,80.648,88.908444,89.5405,90.3585,92.2595,9,4,10


In [57]:
for file in files:
    on_test = True if file.split(".")[0].split("_")[1] == "test" else False
    print(file.split(".")[0].split("_")[1], ":", on_test)

train : False
test : True
train : False
test : True


In [59]:
processed_path = "../data/processed"

files = [
    "consumption_train.csv",
    "consumption_test.csv",
    "production_train.csv",
    "production_test.csv",
]
for file in files:
    df = pd.read_csv(Path(preprocessed_path, file), parse_dates=["datetime"])
    on_test = True if file.split(".")[0].split("_")[1] == "test" else False
    df_transformed = feature_engineering(
        df=df,
        freq=config["freq"],
        id_col=config["id_col"],
        time_col=config["time_col"],
        target_col=config["target_col"],
        forecast_horizon=config["forecast_horizon"],
        n_lags=config["n_lags"],
        rolling_mean_window_size=config["rolling_mean_window_size"],
        n_lag_transforms=config["n_lag_transforms"],
        date_features=config["date_features"],
        static_features=config["static_features"],
        on_test=on_test,
    )
    # df_transformed.to_csv(Path(processed_path, file), index=False)
    print(f"{file} processed and saved to {processed_path}")

consumption_train.csv processed and saved to ../data/processed
consumption_test.csv processed and saved to ../data/processed
production_train.csv processed and saved to ../data/processed
production_test.csv processed and saved to ../data/processed


# Stack last train rows on top of test

When applying feature engineering we remove the first rows of the dataframe.
First determine the value of `step` for which `X` and `df` are aligned according to "datetime".
Once it's done, stack the last `n_step` values of `train` on top of `test` so that we don't lose
the first rows of test after preprocessing.

In [87]:
# TODO: determine the value of `step` for which `X` and `df` are aligned according to "datetime".
# Once it's done, stack the last `n_step` values of `train` on top of `test` so that we don't lose
# the first rows of test after preprocessing.
step = forecast_horizon + max(n_lags, n_lag_transforms) - 1
step

71

In [88]:
X_ = X.sort_values(by=["prediction_unit_id", "datetime"])
df_ = df.sort_values(by=["prediction_unit_id", "datetime"])
print((X_["datetime"].head().values == df_["datetime"].iloc[step:step+5].values).sum() == 5)
print((X_["datetime"].tail().values == df_["datetime"].tail().values).sum() == 5)

False
True


In [89]:
import random
random.seed(0)

forecast_horizon_list = random.choices(range(1, 10), k=3)
n_lags_list = random.choices(range(1, 10), k=3)
rolling_mean_window_size_list = random.choices(range(1, 10), k=3)
n_lag_transforms_list = random.choices(range(1, 10), k=3)
forecast_horizon_list

[8, 7, 4]

In [90]:
import itertools
combinations = itertools.product(forecast_horizon_list, n_lags_list, rolling_mean_window_size_list, n_lag_transforms_list)
# len(list(combinations))

In [91]:
# print(f"forecast_horizon_list : {forecast_horizon_list}")
# print(f"n_lags_list : {n_lags_list}")
# print(f"rolling_mean_window_size_list : {rolling_mean_window_size_list}")
# print(f"n_lag_transforms_list : {n_lag_transforms_list}")

In [92]:
# for _ in combinations:
#     forecast_horizon, n_lags, rolling_mean_window_size, n_lag_transforms = _
#     print(_)
#     break

# Testing the function on test set

In [93]:
# df_test = pd.read_csv("../data/preprocessed/consumption_test.csv", parse_dates=["datetime"])
# print(df_test.shape)
# df_test.head()

In [94]:
# X = feature_engineering(df_test, on_test=True)
# print(X.shape)
# X.head()

In [95]:
# X.tail()

In [96]:
# X.isna().sum().sum()