# Feature Engineering

In this notebook we will select and create features to feed our ML model. For now, we will focus on the consumption data.
<br>
We will use `MLForecast` to create the lag and time-related features.

In [86]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean

In [87]:
df = pd.read_csv("../data/preprocessed/consumption_train.csv", parse_dates=["datetime"])
print(df.shape)
df.head()

(760650, 9)


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,96.59,1,2021-09-01,0,1,0
1,3,0,3,39.241,1,2021-09-01,0,25,12
2,7,1,3,453.023,1,2021-09-01,0,61,30
3,8,0,1,9.787,1,2021-09-01,0,63,31
4,3,0,1,14.964,1,2021-09-01,0,23,11


# Feature engineering function

In [161]:
ids = [0, 1]
ts = df[df["prediction_unit_id"].isin(ids)].tail(24*len(ids))
ts.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
759180,0,0,1,737.878,1,2023-01-25 06:00:00,511,1615273,0
759192,0,0,2,28.838,1,2023-01-25 06:00:00,511,1615275,1
759241,0,0,2,35.224,1,2023-01-25 07:00:00,511,1615407,1
759242,0,0,1,800.512,1,2023-01-25 07:00:00,511,1615405,0
759293,0,0,1,797.651,1,2023-01-25 08:00:00,511,1615537,0


In [162]:
ts.tail()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
760508,0,0,2,20.013,1,2023-01-26 03:00:00,512,1618047,1
760557,0,0,2,21.796,1,2023-01-26 04:00:00,512,1618179,1
760565,0,0,1,646.03,1,2023-01-26 04:00:00,512,1618177,0
760619,0,0,2,21.953,1,2023-01-26 05:00:00,512,1618311,1
760641,0,0,1,654.215,1,2023-01-26 05:00:00,512,1618309,0


np.int64(0)

In [301]:
def feature_engineering(
    df,
    id_col="prediction_unit_id",
    time_col="datetime",
    target_col="target",
    forecast_horizon=4,
    n_lags=24,
    rolling_mean_window_size=24,
    inference=False,
):
    if inference == True:  # add rows between the last recorded value and the target_col
        unique_ids = df[id_col].unique()
        for id in unique_ids:
            county = df.loc[df["prediction_unit_id"]==id, "county"].iloc[0]
            is_business = df.loc[df["prediction_unit_id"]==id, "is_business"].iloc[0]
            product_type = df.loc[df["prediction_unit_id"]==id, "product_type"].iloc[0]
            new_rows = pd.DataFrame(
                {
                    id_col: id,
                    time_col: pd.date_range(
                        df[time_col].iloc[-1], periods=forecast_horizon, freq="h"
                    ),
                    target_col: -99,  # can't be None
                    "county": county,
                    "is_business": is_business,
                    "product_type": product_type,
                },
                index=range(df.index.stop, df.index.stop+forecast_horizon)
            )
            df = pd.concat((df, new_rows))

    fcst = MLForecast(
        models=[],
        freq="h",
        lags=[i + forecast_horizon for i in range(n_lags)],
        lag_transforms={
            i + forecast_horizon: [ExpandingMean(), RollingMean(window_size=rolling_mean_window_size)]
            for i in range(24)
        },
        date_features=["month", "dayofweek", "hour"],
    )

    id_columns = [id_col, time_col, target_col]
    X = fcst.preprocess(df[id_columns], id_col=id_col, time_col=time_col, target_col=target_col)
    columns_to_drop = id_columns + ["data_block_id", "row_id", "is_consumption"]
    X = pd.concat([df[df.columns.drop(columns_to_drop)], X], axis=1, join="inner")
    if inference == True:
        return X.drop(columns=id_columns)
    else:
        X, y = X.drop(columns=id_columns), X[target_col]
        return X, y

# Testing the function for training

In [99]:
X, y = feature_engineering(df, inference=False)
print(X.shape)
print(y.shape)

(754728, 78)
(754728,)


In [100]:
X.head()

Unnamed: 0,county,is_business,product_type,lag48,lag49,lag50,lag51,lag52,lag53,lag54,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
5264,4,1,3,356.405,510.314,593.243,498.008,651.211,665.971,702.769,...,630.763833,608.930462,630.926875,620.69836,631.689917,632.698667,632.698667,9,5,22
5265,2,0,3,31.637,41.316,40.195,29.904,21.693,14.567,8.766,...,17.781792,18.205423,17.69125,17.90728,17.676583,17.693208,17.693208,9,5,22
5266,5,1,1,101.242,104.646,127.011,149.387,144.964,133.411,191.672,...,130.350083,126.109423,130.608667,128.76856,131.103667,130.774417,130.774417,9,5,22
5267,0,1,3,5070.692,5401.186,5844.822,6041.072,6060.432,6165.16,5353.664,...,4990.877875,5026.974692,5024.7095,5064.05332,5060.261542,5095.1675,5095.1675,9,5,22
5268,7,1,0,763.137,890.89,849.272,880.601,732.112,856.077,401.599,...,641.494667,633.809077,641.823208,637.72728,641.517083,641.470167,641.470167,9,5,22


In [101]:
y.head()

5264     368.268
5265      41.177
5266      57.411
5267    5039.402
5268     305.908
Name: target, dtype: float64

# Testing the function for inference

In [304]:
X = feature_engineering(df, inference=True)
print(X.shape)

(757752, 78)


In [305]:
X.head()

Unnamed: 0,county,is_business,product_type,lag4,lag5,lag6,lag7,lag8,lag9,lag10,...,rolling_mean_lag24_window_size24,expanding_mean_lag25,rolling_mean_lag25_window_size24,expanding_mean_lag26,rolling_mean_lag26_window_size24,expanding_mean_lag27,rolling_mean_lag27_window_size24,month,dayofweek,hour
2800,3,1,3,791.336,826.717,846.936,847.394,820.862,696.927,711.533,...,703.663,705.797,703.872042,705.41824,704.432542,704.513083,704.513083,9,4,2
2801,9,1,1,41.932,45.853,42.147,53.52,37.915,34.934,32.839,...,87.547833,83.346423,87.386125,85.09128,87.173583,87.068958,87.068958,9,4,2
2802,14,0,1,14.338,22.362,19.745,15.053,7.036,6.546,8.48,...,9.909208,10.278231,9.892167,10.13376,9.913583,9.992625,9.992625,9,4,2
2803,10,1,1,48.539,48.337,54.458,41.462,36.052,23.948,20.82,...,27.8505,28.944308,27.984417,28.40316,27.755083,28.137333,28.137333,9,4,2
2804,0,0,3,920.535,1000.499,988.047,877.168,656.989,489.864,278.374,...,507.921667,513.582731,504.19975,507.23952,501.005375,497.7205,497.7205,9,4,2


In [306]:
X.tail()

Unnamed: 0,county,is_business,product_type,lag4,lag5,lag6,lag7,lag8,lag9,lag10,...,rolling_mean_lag24_window_size24,expanding_mean_lag25,rolling_mean_lag25_window_size24,expanding_mean_lag26,rolling_mean_lag26_window_size24,expanding_mean_lag27,rolling_mean_lag27_window_size24,month,dayofweek,hour
760897,2,1,1,11.143,10.912,11.102,11.385,11.115,11.167,12.983,...,13.092583,53.622564,13.14425,53.626694,13.188292,53.630831,13.182833,2,3,23
760898,11,1,0,537.549,532.796,531.7,396.956,385.78,390.776,364.798,...,457.775625,336.092077,458.726125,336.101742,459.245333,336.112464,459.808833,2,3,23
760899,11,1,0,435.205,537.549,532.796,531.7,396.956,385.78,390.776,...,457.337792,336.086865,457.775625,336.092077,458.726125,336.101742,459.245333,2,4,0
760900,11,1,0,239.252,435.205,537.549,532.796,531.7,396.956,385.78,...,459.81975,336.087833,457.337792,336.086865,457.775625,336.092077,458.726125,2,4,1
760901,11,1,0,259.745,239.252,435.205,537.549,532.796,531.7,396.956,...,460.0845,336.108153,459.81975,336.087833,457.337792,336.086865,457.775625,2,4,2


In [308]:
X.isna().sum().sum()

np.int64(0)