# Feature Engineering

In this notebook we will select and create features to feed our ML model. For now, we will focus on the consumption data.
<br>
We will use `MLForecast` to create the features.

In [None]:
import itertools
import random
import sys
from pathlib import Path

import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean

In [None]:
# Load the preprocessed consumption training set; the "datetime" column is parsed
# into timestamps at read time.
df = pd.read_csv("../data/preprocessed/consumption_train.csv", parse_dates=["datetime"])
print(df.shape)
df.head()

In [None]:
# Make the project root importable so the local `utils` module can be found.
# NOTE(review): this assumes the notebook runs from a directory one level below the
# project root (the data and config paths in this notebook are relative to ".." as
# well). The previous hard-coded absolute path only worked on a single machine.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:  # keep re-runs of this cell from piling up duplicates
    sys.path.append(project_root)

from utils import load_config

config = load_config("../config/config.yaml")

In [None]:
# Inspect the "n_lag_transforms" entry of the loaded config.
config["n_lag_transforms"]

# Feature Engineering function

In [None]:
# Unpack the feature-engineering settings from the loaded config. These names are used
# as the default argument values of `feature_engineering` below.
forecast_horizon = config["forecast_horizon"]
n_lags = config["n_lags"]
rolling_mean_window_size = config["rolling_mean_window_size"]
# NOTE(review): an earlier cell reads config["n_lag_transforms"], but that entry is not
# used here; the number of lag transforms simply mirrors `n_lags`. Confirm this is intended.
n_lag_transforms = n_lags
date_features = config["date_features"]

def feature_engineering(
    df,
    id_col="prediction_unit_id",
    time_col="datetime",
    target_col="target",
    forecast_horizon=forecast_horizon,
    n_lags=n_lags,
    rolling_mean_window_size=rolling_mean_window_size,
    date_features=date_features,
    on_test=False,  # accepted for API compatibility; not used yet (see docstring)
    n_lag_transforms=None,
):
    """Build lag, lag-transform and date features for `df` with `MLForecast.preprocess`.

    Only the `id_col`, `time_col` and `target_col` columns of `df` are passed on.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing at least `id_col`, `time_col` and `target_col`.
    id_col, time_col, target_col : str
        Column names forwarded to `MLForecast.preprocess`.
    forecast_horizon : int
        Smallest lag used; lags start at `forecast_horizon`.
    n_lags : int
        Number of consecutive lags, i.e. lags
        `forecast_horizon .. forecast_horizon + n_lags - 1`.
    rolling_mean_window_size : int
        `window_size` given to `RollingMean` for every transformed lag.
    date_features : list
        Forwarded unchanged to `MLForecast(date_features=...)`.
    on_test : bool
        Currently has no effect. It is intended to stack the last rows of train on top
        of the test data, which is not implemented in this function.
    n_lag_transforms : int or None
        Number of consecutive lags (starting at `forecast_horizon`) that receive an
        `ExpandingMean` and a `RollingMean` transform. When None, the notebook-level
        variable `n_lag_transforms` is looked up at call time (the behaviour before
        this parameter existed), falling back to `n_lags` if it is not defined.

    Returns
    -------
    The result of `MLForecast.preprocess` for the selected columns.
    """
    if n_lag_transforms is None:
        # Backward compatibility: the function used to read this value from the
        # notebook namespace at call time instead of taking it as an argument.
        n_lag_transforms = globals().get("n_lag_transforms", n_lags)

    lags = list(range(forecast_horizon, forecast_horizon + n_lags))
    transformed_lags = range(forecast_horizon, forecast_horizon + n_lag_transforms)

    fcst = MLForecast(
        models=[],  # no model is fitted; the object is only used to build features
        freq="h",
        lags=lags,
        lag_transforms={
            lag: [ExpandingMean(), RollingMean(window_size=rolling_mean_window_size)]
            for lag in transformed_lags
        },
        date_features=date_features,
    )
    model_columns = [id_col, time_col, target_col]
    return fcst.preprocess(df[model_columns], id_col=id_col, time_col=time_col, target_col=target_col)

# Testing the function

In [None]:
# Override the configured values with small numbers for the checks below.
# NOTE: `feature_engineering` bound its keyword defaults when it was defined, so
# reassigning `forecast_horizon`, `n_lags` and `rolling_mean_window_size` here does not
# change what a bare `feature_engineering(df)` call uses. `n_lag_transforms` is the
# exception: when it is not passed explicitly it is looked up at call time.
forecast_horizon = 2
n_lags = 3
rolling_mean_window_size = 2
n_lag_transforms = 0

In [None]:
# Pass the test values explicitly: the function's defaults were captured from the config
# when it was defined, so the overrides from the previous cell would otherwise be ignored
# and the `step` computed further down would not describe this `X`.
X = feature_engineering(
    df,
    forecast_horizon=forecast_horizon,
    n_lags=n_lags,
    rolling_mean_window_size=rolling_mean_window_size,
)
X[X["prediction_unit_id"] == 0].head()

In [None]:
# First rows of the raw data for prediction unit 0, for comparison with `X` above.
df[df["prediction_unit_id"] == 0].head()

In [None]:
# Load the processed training data unless it is already in the notebook namespace.
# An explicit existence check replaces the bare `except:`, which would also have hidden
# unrelated errors (or a keyboard interrupt) behind a silent reload.
if "df_processed" not in globals():
    df_processed = pd.read_csv("../data/processed/consumption_train_processed.csv")
display(df_processed.head())

In [None]:
# Sort both frames identically so they can be compared row by row.
X_ = X.sort_values(by=["prediction_unit_id", "datetime"])
df_ = df.sort_values(by=["prediction_unit_id", "datetime"])
# TODO: determine the value of `step` for which `X` and `df` are aligned according to "datetime".
# Once it's done, stack the last `step` values of `train` on top of `test` so that we don't lose
# the first rows of test after preprocessing.
step = forecast_horizon + max(n_lags, n_lag_transforms) - 1
# Compare the underlying values rather than the Series themselves: `==` between two
# Series requires identically labelled indexes, which is not guaranteed between `X_`
# and a slice of `df_`.
x_head = X_["datetime"].head().to_numpy()
df_window = df_["datetime"].iloc[step:step + 5].to_numpy()
len(x_head) == 5 and len(df_window) == 5 and bool((x_head == df_window).all())

# Stack last train rows on top of test

When applying feature engineering, the first rows of the dataframe are removed.
First, determine the value of `step` for which `X` and `df` are aligned on "datetime".
Then stack the last `step` rows of `train` on top of `test` so that we don't lose
the first rows of `test` after preprocessing.

In [None]:
import random

# Seed the generator so the sampled parameter candidates are the same on every run.
random.seed(0)

candidate_values = range(1, 10)  # integers 1 through 9
n_candidates = 3                 # how many values to draw for each parameter

# The draw order matters for reproducibility: one list per parameter, in this sequence.
forecast_horizon_list = random.choices(candidate_values, k=n_candidates)
n_lags_list = random.choices(candidate_values, k=n_candidates)
rolling_mean_window_size_list = random.choices(candidate_values, k=n_candidates)
n_lag_transforms_list = random.choices(candidate_values, k=n_candidates)
forecast_horizon_list

In [None]:
# Every combination of the sampled parameter values (Cartesian product).
# Materialised as a list because `itertools.product` returns a single-use iterator:
# once a later cell has consumed it, re-running that cell would silently continue from
# where the previous run stopped.
combinations = list(
    itertools.product(
        forecast_horizon_list,
        n_lags_list,
        rolling_mean_window_size_list,
        n_lag_transforms_list,
    )
)
len(combinations)

In [None]:
# print(f"forecast_horizon_list : {forecast_horizon_list}")
# print(f"n_lags_list : {n_lags_list}")
# print(f"rolling_mean_window_size_list : {rolling_mean_window_size_list}")
# print(f"n_lag_transforms_list : {n_lag_transforms_list}")

In [None]:
# for _ in combinations:
#     print(_)
#     break

In [None]:
# Take the next parameter combination and unpack it into the notebook-level settings.
# A named variable is used instead of `_`, which IPython reserves for the last cell
# output. `iter()` makes this work whether `combinations` is an iterator or a list.
first_combination = next(iter(combinations), None)
if first_combination is not None:
    forecast_horizon, n_lags, rolling_mean_window_size, n_lag_transforms = first_combination
    print(first_combination)

In [None]:
# Element-wise comparison of the last timestamps of both sorted frames. Values are
# compared instead of the Series themselves because `==` between two Series requires
# identically labelled indexes.
X_["datetime"].tail().to_numpy() == df_["datetime"].tail().to_numpy()

In [None]:
# Re-sort the engineered features by unit and timestamp and preview the first rows.
X_ = X.sort_values(by=["prediction_unit_id", "datetime"])
X_.head()

In [None]:
# Show the rows of the sorted raw data starting at offset `step`.
# NOTE(review): `step` uses the current notebook-level parameters, while `X` was computed
# in an earlier cell; re-run `feature_engineering` with these parameters before comparing.
df_ = df.sort_values(by=["prediction_unit_id", "datetime"])
step = forecast_horizon + max(n_lags, n_lag_transforms) - 1
df_.iloc[step:step+5]

In [None]:
# Last rows of every prediction unit in the engineered features (`tail()` keeps five rows
# per group by default), then narrowed down to unit 0.
X_grouped = X.groupby(by="prediction_unit_id").tail()
X_grouped[X_grouped["prediction_unit_id"] == 0]

In [None]:
# Same view on the raw data: last rows of every prediction unit, narrowed down to unit 0.
grouped = df.groupby(by="prediction_unit_id").tail()
grouped[grouped["prediction_unit_id"] == 0]

# Testing the function on test set

In [None]:
# `feature_engineering` has no `inference` parameter (passing it raises a TypeError);
# the flag it defines for this purpose is `on_test`.
X = feature_engineering(df, on_test=True)
print(X.shape)

In [None]:
# First rows of the engineered features.
X.head()

In [None]:
# Last rows of the engineered features.
X.tail()

In [None]:
# Total number of missing values across all columns of `X`.
X.isna().sum().sum()

In [None]:
# Keep only the final 24 rows recorded for prediction unit 0.
unit_zero_rows = df.loc[df["prediction_unit_id"] == 0]
ts = unit_zero_rows.tail(24)
ts

In [None]:
# Renumber the rows from 0. Reassigning avoids mutating, in place, an object that was
# obtained by slicing `df`.
ts = ts.reset_index(drop=True)
ts

In [None]:
# `feature_engineering` has no `inference` parameter (passing it raises a TypeError);
# the flag it defines for this purpose is `on_test`.
X = feature_engineering(ts, on_test=True)
print(X.shape)

In [None]:
# First rows of the features computed from the 24-row single-unit slice.
X.head()