# Feature Engineering

In this notebook we will select and create features to feed our ML model. For now, we will focus on the consumption data.
<br>
We will use `MLForecast` to create the features.

In [1]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from pathlib import Path

In [2]:
import sys

# Make the repository root importable so that `utils` resolves on any machine,
# instead of relying on one developer's absolute checkout path.
# Assumes the kernel's working directory sits one level below the repository
# root, which is the same assumption the "../configs" and "../data" paths in
# this notebook already make.
sys.path.append(str(Path.cwd().parent))
from utils import load_config

In [3]:
# Load the development configuration. The relative path assumes the kernel's
# working directory is the folder containing this notebook.
# `config` supplies the feature-engineering settings read further down
# (freq, id_col, time_col, target_col, lags, ...).
default_config_path = "../configs/development/config.yaml"
config = load_config(default_config_path)

# Feature Engineering function


In [4]:
def feature_engineering(
    df,
    freq,
    id_col,
    time_col,
    target_col,
    lags,
    rolling_mean_window_sizes,
    lag_transforms_index,
    date_features,
    static_features,
    on_test=False,
):
    """Create lag, lag-transform and date features with ``MLForecast.preprocess``.

    Parameters
    ----------
    df :
        Input frame handed to ``MLForecast.preprocess``.
    freq :
        Series frequency forwarded to ``MLForecast``.
    id_col, time_col, target_col :
        Names of the series-id, timestamp and target columns in ``df``.
    lags :
        Lags forwarded to ``MLForecast(lags=...)``.
    rolling_mean_window_sizes :
        Window sizes; one ``RollingMean`` is created per size for every entry
        of ``lag_transforms_index``.
    lag_transforms_index :
        Lags on which the transforms are computed. Each one receives an
        ``ExpandingMean`` followed by the rolling means.
    date_features :
        Date features forwarded to ``MLForecast(date_features=...)``.
    static_features :
        Forwarded to ``MLForecast.preprocess(static_features=...)``.
    on_test :
        Currently ignored.
        TODO: if True, stack last rows of train on top of test data to keep
        all test data.

    Returns
    -------
    The transformed frame returned by ``MLForecast.preprocess``.
    """
    # Map every requested lag to its list of transforms:
    # one expanding mean, then one rolling mean per window size.
    lag_transforms = {}
    for lag in lag_transforms_index:
        transforms = [ExpandingMean()]
        transforms.extend(
            RollingMean(window_size) for window_size in rolling_mean_window_sizes
        )
        lag_transforms[lag] = transforms

    # No models are attached: the forecaster is used purely as a feature builder.
    feature_builder = MLForecast(
        models=[],
        freq=freq,
        lags=lags,
        lag_transforms=lag_transforms,
        date_features=date_features,
    )
    return feature_builder.preprocess(
        df,
        id_col=id_col,
        time_col=time_col,
        target_col=target_col,
        static_features=static_features,
    )

# Testing the function

In [5]:
# Load the preprocessed consumption training set, parsing "datetime" as
# timestamps, then show its shape and first rows as a quick sanity check.
df = pd.read_csv("../data/preprocessed/consumption_train.csv", parse_dates=["datetime"])
print(df.shape)
df.head()

(760650, 9)


Unnamed: 0,target,county,is_business,product_type,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,96.59,0,0,1,1,2021-09-01,0,1,0
1,32.354,8,0,3,1,2021-09-01,0,65,32
2,129.063,8,1,3,1,2021-09-01,0,67,33
3,13.922,9,0,1,1,2021-09-01,0,69,34
4,91.447,9,0,3,1,2021-09-01,0,71,35


In [6]:
# Every setting except the data frame and `on_test` comes straight from the
# config, under a key named exactly like the corresponding parameter.
X = feature_engineering(
    df=df,
    on_test=False,
    **{
        name: config[name]
        for name in (
            "freq",
            "id_col",
            "time_col",
            "target_col",
            "lags",
            "rolling_mean_window_sizes",
            "lag_transforms_index",
            "date_features",
            "static_features",
        )
    },
)
X.head()

Unnamed: 0,target,county,is_business,product_type,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,lag48,...,lag144,lag168,expanding_mean_lag48,rolling_mean_lag48_window_size6,rolling_mean_lag48_window_size12,rolling_mean_lag48_window_size24,rolling_mean_lag48_window_size48,month,dayofweek,hour
9408,232.64,11,0,3,1,2021-09-08,7,20587,45,260.052,...,257.347,229.602,196.218595,370.763667,265.144083,228.315958,215.394042,9,2,0
9409,22.824,11,0,1,1,2021-09-08,7,20583,43,26.557,...,24.06,21.099,20.751099,34.5955,26.141,23.435333,22.742208,9,2,0
9410,924.467,10,1,3,1,2021-09-08,7,20581,42,822.525,...,833.737,927.349,922.493322,821.747833,646.00825,641.936083,690.803396,9,2,0
9411,36.727,10,1,1,1,2021-09-08,7,20577,40,39.887,...,34.783,43.957,30.143909,42.943833,24.458167,24.878667,25.404521,9,2,0
9412,77.04,10,0,3,1,2021-09-08,7,20575,39,106.057,...,73.25,77.071,63.335521,123.578667,76.412583,71.224708,73.752542,9,2,0


Explanation of the `lag_transforms` features:
- expanding_mean_lag**N**: expanding mean until lag **N**.
- rolling_mean_lag**N**_window_size**M**: rolling mean starting from lag **N** over the previous **M** values.
<br>
Example: rolling_mean_lag**5**_window_size**3** corresponds to the rolling mean starting from lag **5** over the previous **3** values, i.e. the mean of lags 5, 6 and 7.

In [7]:
# Datasets to run through feature engineering.
files = [
    "consumption_train.csv",
    "consumption_test.csv",
    "production_train.csv",
    "production_test.csv",
]

# Preview which files would be flagged as test sets, based on their name.
for file in files:
    on_test = "test" in file
    print(file, f": on_test={on_test}")

consumption_train.csv : on_test=False
consumption_test.csv : on_test=True
production_train.csv : on_test=False
production_test.csv : on_test=True


In [8]:
preprocessed_path = "../data/preprocessed"
processed_path = "../data/processed"

# Apply the same feature engineering to every dataset listed in `files`.
for file in files:
    df = pd.read_csv(Path(preprocessed_path, file), parse_dates=["datetime"])
    # File names follow "<kind>_<split>.csv"; the split part tells us whether
    # this is a test set.
    on_test = file.split(".")[0].split("_")[1] == "test"
    df_transformed = feature_engineering(
        df=df,
        freq=config["freq"],
        id_col=config["id_col"],
        time_col=config["time_col"],
        target_col=config["target_col"],
        lags=config["lags"],
        rolling_mean_window_sizes=config["rolling_mean_window_sizes"],
        lag_transforms_index=config["lag_transforms_index"],
        date_features=config["date_features"],
        static_features=config["static_features"],
        # Forward the flag computed above instead of a hard-coded False, so the
        # test files are handled correctly once `on_test` is implemented.
        on_test=on_test,
    )
    # NOTE(review): writing is disabled, yet the message below still says
    # "saved". Re-enable the line or adjust the message once the intent is clear.
    # df_transformed.to_csv(Path(processed_path, file), index=False)
    print(f"{file} processed and saved to {processed_path}")

consumption_train.csv processed and saved to ../data/processed
consumption_test.csv processed and saved to ../data/processed
production_train.csv processed and saved to ../data/processed
production_test.csv processed and saved to ../data/processed


# Stack last train rows on top of test

When applying feature engineering, the first rows of each series are dropped because their lag features cannot be computed.
First determine the value of `step` for which `X` and `df` are aligned according to "datetime".
Once that is done, stack the last `step` rows of `train` on top of `test` so that we don't lose
the first rows of test after preprocessing.

In [9]:
# # TODO: determine the value of `step` for which `X` and `df` are aligned according to "datetime".
# # Once it's done, stack the last `n_step` values of `train` on top of `test` so that we don't lose
# # the first rows of test after preprocessing.
# step = forecast_horizon + max(n_lags, n_lag_transforms) - 1
# step

In [10]:
# X_ = X.sort_values(by=["prediction_unit_id", "datetime"])
# df_ = df.sort_values(by=["prediction_unit_id", "datetime"])
# print(
#     (X_["datetime"].head().values == df_["datetime"].iloc[step : step + 5].values).sum()
#     == 5
# )
# print((X_["datetime"].tail().values == df_["datetime"].tail().values).sum() == 5)

In [11]:
# import random

# random.seed(0)

# forecast_horizon_list = random.choices(range(1, 10), k=3)
# n_lags_list = random.choices(range(1, 10), k=3)
# rolling_mean_window_size_list = random.choices(range(1, 10), k=3)
# n_lag_transforms_list = random.choices(range(1, 10), k=3)
# forecast_horizon_list

In [12]:
# import itertools

# combinations = itertools.product(
#     forecast_horizon_list,
#     n_lags_list,
#     rolling_mean_window_size_list,
#     n_lag_transforms_list,
# )
# # len(list(combinations))

In [13]:
# print(f"forecast_horizon_list : {forecast_horizon_list}")
# print(f"n_lags_list : {n_lags_list}")
# print(f"rolling_mean_window_size_list : {rolling_mean_window_size_list}")
# print(f"n_lag_transforms_list : {n_lag_transforms_list}")

In [14]:
# for _ in combinations:
#     forecast_horizon, n_lags, rolling_mean_window_size, n_lag_transforms = _
#     print(_)
#     break

# Testing the function on test set

In [15]:
# df_test = pd.read_csv("../data/preprocessed/consumption_test.csv", parse_dates=["datetime"])
# print(df_test.shape)
# df_test.head()

In [16]:
# X = feature_engineering(df_test, on_test=True)
# print(X.shape)
# X.head()

In [17]:
# X.tail()

In [18]:
# X.isna().sum().sum()