# Feature Engineering

In this notebook we will select and create features to feed our ML model. For now, we will focus on the consumption data.
<br>
We will use `MLForecast` to create the lag and time-related features.

In [1]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean

In [2]:
df = pd.read_csv("../data/preprocessed/consumption_train.csv", parse_dates=["datetime"])
print(df.shape)
df.head()

(760650, 9)


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,96.59,1,2021-09-01,0,1,0
1,3,0,3,39.241,1,2021-09-01,0,25,12
2,7,1,3,453.023,1,2021-09-01,0,61,30
3,8,0,1,9.787,1,2021-09-01,0,63,31
4,3,0,1,14.964,1,2021-09-01,0,23,11


# Feature engineering function

In [3]:
def feature_engineering(
    df,
    id_col="prediction_unit_id",
    time_col="datetime",
    target_col="target",
    forecast_horizon=48,
    n_lags=24,
    rolling_mean_window_size=24,
    inference=False,
):
    if inference == True:  # add rows between the last recorded value and the target_col
        unique_ids = df[id_col].unique()
        for id in unique_ids:
            county = df.loc[df["prediction_unit_id"]==id, "county"].iloc[0]
            is_business = df.loc[df["prediction_unit_id"]==id, "is_business"].iloc[0]
            product_type = df.loc[df["prediction_unit_id"]==id, "product_type"].iloc[0]
            new_rows = pd.DataFrame(
                {
                    id_col: id,
                    time_col: pd.date_range(
                        df[time_col].iloc[-1], periods=forecast_horizon, freq="h"
                    ),
                    target_col: -99,  # can't be None
                    "county": county,
                    "is_business": is_business,
                    "product_type": product_type,
                },
                index=range(df.index.stop, df.index.stop+forecast_horizon)
            )
            df = pd.concat((df, new_rows))

    fcst = MLForecast(
        models=[],
        freq="h",
        lags=[i + forecast_horizon for i in range(n_lags)],
        lag_transforms={
            i + forecast_horizon: [ExpandingMean(), RollingMean(window_size=rolling_mean_window_size)]
            for i in range(24)
        },
        date_features=["month", "dayofweek", "hour"],
    )

    id_columns = [id_col, time_col, target_col]
    X = fcst.preprocess(df[id_columns], id_col=id_col, time_col=time_col, target_col=target_col)
    columns_to_drop = id_columns + ["data_block_id", "row_id", "is_consumption"]
    X = pd.concat([df[df.columns.drop(columns_to_drop)], X], axis=1, join="inner")
    if inference == True:
        return X.drop(columns=id_columns)
    else:
        X, y = X.drop(columns=id_columns), X[target_col]
        return X, y

# Testing the function for training

In [4]:
X, y = feature_engineering(df, inference=False)
print(X.shape)
print(y.shape)

(754728, 78)
(754728,)


In [5]:
X.head()

Unnamed: 0,county,is_business,product_type,lag48,lag49,lag50,lag51,lag52,lag53,lag54,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
5264,4,1,3,356.405,510.314,593.243,498.008,651.211,665.971,702.769,...,630.763833,608.930462,630.926875,620.69836,631.689917,632.698667,632.698667,9,5,22
5265,2,0,3,31.637,41.316,40.195,29.904,21.693,14.567,8.766,...,17.781792,18.205423,17.69125,17.90728,17.676583,17.693208,17.693208,9,5,22
5266,5,1,1,101.242,104.646,127.011,149.387,144.964,133.411,191.672,...,130.350083,126.109423,130.608667,128.76856,131.103667,130.774417,130.774417,9,5,22
5267,0,1,3,5070.692,5401.186,5844.822,6041.072,6060.432,6165.16,5353.664,...,4990.877875,5026.974692,5024.7095,5064.05332,5060.261542,5095.1675,5095.1675,9,5,22
5268,7,1,0,763.137,890.89,849.272,880.601,732.112,856.077,401.599,...,641.494667,633.809077,641.823208,637.72728,641.517083,641.470167,641.470167,9,5,22


In [6]:
y.head()

5264     368.268
5265      41.177
5266      57.411
5267    5039.402
5268     305.908
Name: target, dtype: float64

# Testing the function for inference

In [7]:
X = feature_engineering(df, inference=True)
print(X.shape)

(757752, 78)


In [8]:
X.head()

Unnamed: 0,county,is_business,product_type,lag48,lag49,lag50,lag51,lag52,lag53,lag54,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
5264,4,1,3,356.405,510.314,593.243,498.008,651.211,665.971,702.769,...,630.763833,608.930462,630.926875,620.69836,631.689917,632.698667,632.698667,9,5,22
5265,2,0,3,31.637,41.316,40.195,29.904,21.693,14.567,8.766,...,17.781792,18.205423,17.69125,17.90728,17.676583,17.693208,17.693208,9,5,22
5266,5,1,1,101.242,104.646,127.011,149.387,144.964,133.411,191.672,...,130.350083,126.109423,130.608667,128.76856,131.103667,130.774417,130.774417,9,5,22
5267,0,1,3,5070.692,5401.186,5844.822,6041.072,6060.432,6165.16,5353.664,...,4990.877875,5026.974692,5024.7095,5064.05332,5060.261542,5095.1675,5095.1675,9,5,22
5268,7,1,0,763.137,890.89,849.272,880.601,732.112,856.077,401.599,...,641.494667,633.809077,641.823208,637.72728,641.517083,641.470167,641.470167,9,5,22


In [9]:
X.tail()

Unnamed: 0,county,is_business,product_type,lag48,lag49,lag50,lag51,lag52,lag53,lag54,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
763669,11,1,0,532.796,531.7,396.956,385.78,390.776,364.798,509.426,...,458.726125,336.101742,459.245333,336.112464,459.808833,336.093559,460.309583,5,0,10
763670,11,1,0,537.549,532.796,531.7,396.956,385.78,390.776,364.798,...,457.775625,336.092077,458.726125,336.101742,459.245333,336.112464,459.808833,5,0,11
763671,11,1,0,435.205,537.549,532.796,531.7,396.956,385.78,390.776,...,457.337792,336.086865,457.775625,336.092077,458.726125,336.101742,459.245333,5,0,12
763672,11,1,0,239.252,435.205,537.549,532.796,531.7,396.956,385.78,...,459.81975,336.087833,457.337792,336.086865,457.775625,336.092077,458.726125,5,0,13
763673,11,1,0,259.745,239.252,435.205,537.549,532.796,531.7,396.956,...,460.0845,336.108153,459.81975,336.087833,457.337792,336.086865,457.775625,5,0,14


In [10]:
X.isna().sum().sum()

np.int64(0)

In [11]:
ts = df[df["prediction_unit_id"]==0].iloc[-24:]
ts

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
759180,0,0,1,737.878,1,2023-01-25 06:00:00,511,1615273,0
759242,0,0,1,800.512,1,2023-01-25 07:00:00,511,1615405,0
759293,0,0,1,797.651,1,2023-01-25 08:00:00,511,1615537,0
759385,0,0,1,784.166,1,2023-01-25 09:00:00,511,1615669,0
759404,0,0,1,741.292,1,2023-01-25 10:00:00,511,1615801,0
759468,0,0,1,690.67,1,2023-01-25 11:00:00,511,1615933,0
759552,0,0,1,646.51,1,2023-01-25 12:00:00,511,1616065,0
759623,0,0,1,673.457,1,2023-01-25 13:00:00,511,1616197,0
759658,0,0,1,679.117,1,2023-01-25 14:00:00,511,1616329,0
759727,0,0,1,732.858,1,2023-01-25 15:00:00,511,1616461,0


In [12]:
ts.reset_index(drop=True, inplace=True)
ts

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,737.878,1,2023-01-25 06:00:00,511,1615273,0
1,0,0,1,800.512,1,2023-01-25 07:00:00,511,1615405,0
2,0,0,1,797.651,1,2023-01-25 08:00:00,511,1615537,0
3,0,0,1,784.166,1,2023-01-25 09:00:00,511,1615669,0
4,0,0,1,741.292,1,2023-01-25 10:00:00,511,1615801,0
5,0,0,1,690.67,1,2023-01-25 11:00:00,511,1615933,0
6,0,0,1,646.51,1,2023-01-25 12:00:00,511,1616065,0
7,0,0,1,673.457,1,2023-01-25 13:00:00,511,1616197,0
8,0,0,1,679.117,1,2023-01-25 14:00:00,511,1616329,0
9,0,0,1,732.858,1,2023-01-25 15:00:00,511,1616461,0


In [13]:
X = feature_engineering(ts, inference=True)
print(X.shape)

(1, 78)


In [14]:
X.head()

Unnamed: 0,county,is_business,product_type,lag48,lag49,lag50,lag51,lag52,lag53,lag54,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
71,0,0,1,654.215,646.03,634.848,652.737,664.992,710.865,755.605,...,780.05175,778.680333,778.680333,769.195,769.195,737.878,737.878,1,5,4
