# Feature Engineering

In this notebook we will select and create features to feed our ML model. For now, we will focus on the consumption data.
<br>
We will use `MLForecast` to create the features.

In [82]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean

In [83]:
df = pd.read_csv("../data/preprocessed/consumption_train.csv", parse_dates=["datetime"])
print(df.shape)
df.head()

(760650, 9)


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,96.59,1,2021-09-01,0,1,0
1,3,0,3,39.241,1,2021-09-01,0,25,12
2,7,1,3,453.023,1,2021-09-01,0,61,30
3,8,0,1,9.787,1,2021-09-01,0,63,31
4,3,0,1,14.964,1,2021-09-01,0,23,11


In [84]:
import sys
sys.path.append("/Users/gabriel/Documents/Git/End-to-end MLOps for Time Series")
from utils import load_config
config = load_config("../config/config.yaml")

# Feature Engineering function

In [85]:
forecast_horizon = config["forecast_horizon"]
n_lags = config["n_lags"]
rolling_mean_window_size = config["rolling_mean_window_size"]
n_lag_transforms = n_lags
date_features = config["date_features"]

def feature_engineering(
    df,
    id_col="prediction_unit_id",
    time_col="datetime",
    target_col="target",
    forecast_horizon=forecast_horizon,
    n_lags=n_lags,
    rolling_mean_window_size=rolling_mean_window_size,
    date_features=date_features,
    static_features=["county", "is_business", "product_type", "is_consumption", "prediction_unit_id"],
    on_test=False, # if True, stack last rows of train on top of test data
):
    fcst = MLForecast(
        models=[],
        freq="h",
        lags=[i for i in range(forecast_horizon, forecast_horizon + n_lags)],
        lag_transforms={
            i: [ExpandingMean(), RollingMean(window_size=rolling_mean_window_size)]
            for i in range(forecast_horizon, forecast_horizon + n_lag_transforms)
        },
        date_features=date_features,
    )
    id_columns = [id_col, time_col, target_col]
    return fcst.preprocess(df, id_col=id_col, time_col=time_col, target_col=target_col, static_features=static_features)

# Testing the function

In [86]:
X = feature_engineering(df)
X[X["prediction_unit_id"] == 0].head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,lag48,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
5286,0,0,1,165.499,1,2021-09-04 22:00:00,3,11469,0,134.986,...,81.8605,81.756654,81.308,81.1456,80.502083,79.96975,79.96975,9,5,22
5348,0,0,1,133.299,1,2021-09-04 23:00:00,3,11591,0,120.54,...,82.505417,82.612111,81.8605,81.756654,81.308,81.1456,80.502083,9,5,23
5387,0,0,1,123.214,1,2021-09-05 00:00:00,4,11713,0,107.129,...,83.241833,83.355714,82.505417,82.612111,81.8605,81.756654,81.308,9,6,0
5478,0,0,1,117.082,1,2021-09-05 01:00:00,4,11835,0,81.92,...,83.893958,84.131655,83.241833,83.355714,82.505417,82.612111,81.8605,9,6,1
5535,0,0,1,109.178,1,2021-09-05 02:00:00,4,11957,0,96.193,...,84.539375,84.841667,83.893958,84.131655,83.241833,83.355714,82.505417,9,6,2


# Stack last train rows on top of test

When applying feature engineering we remove the first rows of the dataframe.
First determine the value of `step` for which `X` and `df` are aligned according to "datetime".
Once it's done, stack the last `n_step` values of `train` on top of `test` so that we don't lose
the first rows of test after preprocessing.

In [87]:
# TODO: determine the value of `step` for which `X` and `df` are aligned according to "datetime".
# Once it's done, stack the last `n_step` values of `train` on top of `test` so that we don't lose
# the first rows of test after preprocessing.
step = forecast_horizon + max(n_lags, n_lag_transforms) - 1
step

71

In [88]:
X_ = X.sort_values(by=["prediction_unit_id", "datetime"])
df_ = df.sort_values(by=["prediction_unit_id", "datetime"])
print((X_["datetime"].head().values == df_["datetime"].iloc[step:step+5].values).sum() == 5)
print((X_["datetime"].tail().values == df_["datetime"].tail().values).sum() == 5)

False
True


In [89]:
import random
random.seed(0)

forecast_horizon_list = random.choices(range(1, 10), k=3)
n_lags_list = random.choices(range(1, 10), k=3)
rolling_mean_window_size_list = random.choices(range(1, 10), k=3)
n_lag_transforms_list = random.choices(range(1, 10), k=3)
forecast_horizon_list

[8, 7, 4]

In [90]:
import itertools
combinations = itertools.product(forecast_horizon_list, n_lags_list, rolling_mean_window_size_list, n_lag_transforms_list)
# len(list(combinations))

In [91]:
# print(f"forecast_horizon_list : {forecast_horizon_list}")
# print(f"n_lags_list : {n_lags_list}")
# print(f"rolling_mean_window_size_list : {rolling_mean_window_size_list}")
# print(f"n_lag_transforms_list : {n_lag_transforms_list}")

In [92]:
# for _ in combinations:
#     forecast_horizon, n_lags, rolling_mean_window_size, n_lag_transforms = _
#     print(_)
#     break

# Testing the function on test set

In [93]:
# df_test = pd.read_csv("../data/preprocessed/consumption_test.csv", parse_dates=["datetime"])
# print(df_test.shape)
# df_test.head()

In [94]:
# X = feature_engineering(df_test, on_test=True)
# print(X.shape)
# X.head()

In [95]:
# X.tail()

In [96]:
# X.isna().sum().sum()