# Feature_Engineering (ML_Forecast)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gogati/Modern-Time-Series-Forecasting-with-Python-2E/edit/main/notebooks/Chapter06/01-Feature_Engineering%28ML_Forecast%29.ipynb)



In [1]:
%cd ../..

c:\Work\Modern Time Series Forecasting _ 2E\Modern-Time-Series-Forecasting-with-Python-2E


In [2]:
import math
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from src.utils.general import LogTime
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML

# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
tqdm.pandas()

  warn(


In [3]:
# Make sure the folder that receives this chapter's figures exists
Path("imgs/chapter_6").mkdir(parents=True, exist_ok=True)
# Folder holding the preprocessed London Smart Meters parquet files
preprocessed = Path("data/london_smart_meters/preprocessed")

# Reading the preprocessed files

In [32]:
# Read the missing-value-imputed train, validation and test splits
try:
    train_df = pd.read_parquet(preprocessed/"selected_blocks_train_missing_imputed.parquet")
    val_df = pd.read_parquet(preprocessed/"selected_blocks_val_missing_imputed.parquet")
    test_df = pd.read_parquet(preprocessed/"selected_blocks_test_missing_imputed.parquet")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Setting up Experiment Harness.ipynb in Chapter04
    </div>
    """))
    # Re-raise after showing the hint: without the data every later cell would
    # fail with an unrelated NameError, so stop here with the real cause
    # (the traceback names the missing file).
    raise

## Combining the train, validation and test datasets for Feature Engineering

Some of the features that we are creating need the train, validation and test datasets to be combined into a single dataset with continuous time. In real-life operations, where we will not have the test set, it is recommended to create the test-period dataset with zeroed or nulled-out actual observations and continue.

In [33]:
# Tag every row with the split it came from so the splits can be separated again when saving
for split_label, split_frame in zip(("train", "val", "test"), (train_df, val_df, test_df)):
    split_frame["type"] = split_label
# Stack the three splits and order the rows by LCLid and timestamp
full_df = pd.concat([train_df, val_df, test_df])
full_df = full_df.sort_values(["LCLid", "timestamp"])
# Drop every remaining reference to the individual splits (including the loop variables)
del train_df, test_df, val_df, split_frame, split_label

In [34]:
from mlforecast.lag_transforms import (
    RollingMean,
    RollingStd,
    RollingMin,
    RollingMax,
    SeasonalRollingMean,
    SeasonalRollingMin,
    SeasonalRollingMax,
    SeasonalRollingStd,
    ExponentiallyWeightedMean,
)

from collections import defaultdict

In [35]:
# Maps a lag offset (int) to the list of mlforecast lag transforms computed at that offset.
# defaultdict(list) lets the following cells append with `lag_transforms[offset] += [...]`
# without creating the key first.
lag_transforms = defaultdict(list)

## Lag Features

In [36]:
# Three runs of five consecutive lags: the most recent steps (1-5), and two runs
# centred on lags 48 and 48 * 7 (the seasonal lengths used later in this notebook)
lag_run_starts = (1, 46, (48 * 7) - 2)
lags = [start + step for start in lag_run_starts for step in range(5)]
lags

[1, 2, 3, 4, 5, 46, 47, 48, 49, 50, 334, 335, 336, 337, 338]

In [37]:
# with LogTime():
#     full_df, added_features = add_lags(
#         full_df, lags=lags, column="energy_consumption", ts_id="LCLid", use_32_bit=True
#     )
# print(f"Features Created: {','.join(added_features)}")

## Rolling
---
All the other features apart from lags are added as "Lag Transforms" in `mlforecast`. This is because of an issue we have seen in the book: when computing rolling means and similar features, the current time step must be excluded, because otherwise it amounts to data leakage. Therefore, in `mlforecast` these transformations are added as a dictionary `{offset: [list of functions]}`. This way, if we add transformations as `{1: [Transformer for Rolling Mean, Transformer for Exponential Mean]}`, it will move back one time step and then calculate the transformations.

In [38]:
# Rolling mean and rolling standard deviation over several window sizes,
# all registered under offset 1 (i.e. computed one timestep back)
rolling_windows = (3, 6, 12, 48)
lag_transforms[1].extend(RollingMean(window_size=w) for w in rolling_windows)
lag_transforms[1].extend(RollingStd(window_size=w) for w in rolling_windows)

## Seasonal Rolling
----
 This is slightly different from the implementation in the repository and the explanation in the book. This goes back one seasonal length and calculates the mean, max, etc. in a window from that starting point. For the equivalent of the book's feature (Rolling Seasonal Mean), use the same function we have been using in the book.

 P.S: I advise you to play around with this feature to understand how to configure the seasonal length and window size using the code below:

 ```python
 from mlforecast import MLForecast
 from mlforecast.lag_transforms import SeasonalRollingMax
 from mlforecast.utils import generate_daily_series

# generate sequential data to verify seasonal rolling
data = generate_daily_series(1,min_length=50, max_length=500, seed=42)
data['y'] = np.arange(1, len(data)+1)

season_length = 8
window_size=1

# Defining the Rolling window we want to test
seasonal_rolling_window = SeasonalRollingMax(window_size=window_size, season_length=season_length)

fcst = MLForecast(
    models=[],
    freq='D',
    lag_transforms={
    season_length: [seasonal_rolling_window],
    },
)
data_t = fcst.preprocess(data)
data_t.head()
```

In [39]:
# For each seasonal period (48 and 48 * 7 timesteps) register a seasonal rolling
# mean and a seasonal rolling std with window_size=3, offset by that same period
for seasonal_period in (48, 48 * 7):
    lag_transforms[seasonal_period].extend(
        [
            SeasonalRollingMean(season_length=seasonal_period, window_size=3),
            SeasonalRollingStd(season_length=seasonal_period, window_size=3),
        ]
    )

## EWMA

In [40]:
# Visualise how an exponentially weighted mean spreads its weight over past timesteps
t = np.arange(25).tolist()
plot_df = pd.DataFrame({"Timesteps behind t": t})
for alpha in [0.3, 0.5, 0.8]:
    # Weight of the observation i steps back: alpha * (1 - alpha) ** i
    weights = [alpha * math.pow((1 - alpha), i) for i in t]
    span = (2 - alpha) / alpha
    # Steps needed for the weight to halve: (1 - alpha) ** halflife == 0.5.
    # Kept for reference only; it is not shown in the plot labels.
    halflife = math.log(0.5) / math.log(1 - alpha)
    plot_df[f"Alpha={alpha} | Span={span:.2f}"] = weights

# One facet per alpha, each showing the weight as a function of the lag
fig = px.line(
    pd.melt(plot_df, id_vars="Timesteps behind t", var_name="Parameters"),
    x="Timesteps behind t",
    y="value",
    facet_col="Parameters",
)
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    # `title.font` replaces the deprecated `titlefont` attribute, which
    # recent plotly releases no longer accept
    yaxis=dict(
        title=dict(text="Weights", font=dict(size=15)),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        title=dict(font=dict(size=15)),
        tickfont=dict(size=15),
    ),
)
fig.update_annotations(font=dict(size=16))
fig.write_image("imgs/chapter_6/ewma_weights.png")
fig.show()

In [41]:
# Adding Exponentially Weighted Means for three smoothing factors, with an offset of one timestep
ewm_alphas = (0.2, 0.5, 0.9)
lag_transforms[1].extend(ExponentiallyWeightedMean(alpha=a) for a in ewm_alphas)

## Temporal Features

In [49]:
# Define the date features you need in the model.
# Each entry should either be a string (the name of a pandas date attribute) or a
# function that takes the dates as its argument. The list is passed as
# `date_features` to MLForecast when the features are calculated.
temporal_features = [
    "month",
    "quarter",
    "is_quarter_end",
    "is_quarter_start",
    "is_year_end",
    "is_year_start",
    "is_month_start",
    "is_month_end",
    "week",
    "day",
    "dayofweek",
    "dayofyear",
    "hour",
    "minute",
]

In [17]:
# with LogTime():
#     full_df, added_features = add_temporal_features(
#         full_df,
#         field_name="timestamp",
#         frequency="30min",
#         add_elapsed=True,
#         drop=False,
#         use_32_bit=True,
#     )
# print(f"Features Created: {','.join(added_features)}")

Time Elapsed: 2 seconds
Features Created: timestamp_Month,timestamp_Quarter,timestamp_Is_quarter_end,timestamp_Is_quarter_start,timestamp_Is_year_end,timestamp_Is_year_start,timestamp_Is_month_start,timestamp_WeekDay,timestamp_Dayofweek,timestamp_Dayofyear,timestamp_Hour,timestamp_Minute,timestamp_Elapsed


## Calculating the Features

In [43]:
from mlforecast import MLForecast

In [50]:
# MLForecast is used here purely as a feature generator, so no models are attached
fcst = MLForecast(
    models=[],
    # The readings are half-hourly (48 steps per day, as the lags above assume),
    # so the series frequency is 30 minutes rather than daily
    freq="30min",
    lags=lags, # Defining the Lags we need to create
    # Defining some transformations we need to do to the lags (offsets)
    lag_transforms=lag_transforms,
    date_features=temporal_features, # Defining the date features we need
)
# Compute lags, lag transforms and date features for every LCLid series
with LogTime():
    full_df = fcst.preprocess(
        full_df,
        time_col="timestamp",
        id_col="LCLid",
        target_col="energy_consumption",
    )

Time Elapsed: 1 second


In [52]:
# Inspect the columns present after the lag, lag-transform and date features were generated
full_df.columns

Index(['timestamp', 'LCLid', 'energy_consumption', 'frequency',
       'series_length', 'stdorToU', 'Acorn', 'Acorn_grouped', 'file',
       'holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
       'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
       'humidity', 'summary', 'type', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
       'lag46', 'lag47', 'lag48', 'lag49', 'lag50', 'lag334', 'lag335',
       'lag336', 'lag337', 'lag338', 'rolling_mean_lag1_window_size3',
       'rolling_mean_lag1_window_size6', 'rolling_mean_lag1_window_size12',
       'rolling_mean_lag1_window_size48', 'rolling_std_lag1_window_size3',
       'rolling_std_lag1_window_size6', 'rolling_std_lag1_window_size12',
       'rolling_std_lag1_window_size48',
       'exponentially_weighted_mean_lag1_alpha0.2',
       'exponentially_weighted_mean_lag1_alpha0.5',
       'exponentially_weighted_mean_lag1_alpha0.9',
       'seasonal_rolling_mean_lag48_season_length48_window_size3',
    

## Fourier Terms

In [53]:
from src.feature_engineering.temporal_features import (
    add_fourier_features,
    bulk_add_fourier_features,
)

In [54]:
# Check the available columns again before adding the Fourier terms
full_df.columns

Index(['timestamp', 'LCLid', 'energy_consumption', 'frequency',
       'series_length', 'stdorToU', 'Acorn', 'Acorn_grouped', 'file',
       'holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
       'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
       'humidity', 'summary', 'type', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
       'lag46', 'lag47', 'lag48', 'lag49', 'lag50', 'lag334', 'lag335',
       'lag336', 'lag337', 'lag338', 'rolling_mean_lag1_window_size3',
       'rolling_mean_lag1_window_size6', 'rolling_mean_lag1_window_size12',
       'rolling_mean_lag1_window_size48', 'rolling_std_lag1_window_size3',
       'rolling_std_lag1_window_size6', 'rolling_std_lag1_window_size12',
       'rolling_std_lag1_window_size48',
       'exponentially_weighted_mean_lag1_alpha0.2',
       'exponentially_weighted_mean_lag1_alpha0.5',
       'exponentially_weighted_mean_lag1_alpha0.9',
       'seasonal_rolling_mean_lag48_season_length48_window_size3',
    

In [19]:
# train_df, added_features = add_fourier_features(train_df, "timestamp_Month", max_value=12, n_fourier_terms=1)

In [55]:
# Calendar columns to encode with Fourier terms, paired with the max value passed for each
fourier_max_values = {"month": 12, "hour": 24, "minute": 60}
with LogTime():
    full_df, added_features = bulk_add_fourier_features(
        full_df,
        list(fourier_max_values.keys()),
        max_values=list(fourier_max_values.values()),
        n_fourier_terms=5,
        use_32_bit=True,
    )
# Report the names of the newly created columns
created_columns = ",".join(added_features)
print(f"Features Created: {created_columns}")

Time Elapsed: 4 seconds
Features Created: month_sin_1,month_sin_2,month_sin_3,month_sin_4,month_sin_5,month_cos_1,month_cos_2,month_cos_3,month_cos_4,month_cos_5,hour_sin_1,hour_sin_2,hour_sin_3,hour_sin_4,hour_sin_5,hour_cos_1,hour_cos_2,hour_cos_3,hour_cos_4,hour_cos_5,minute_sin_1,minute_sin_2,minute_sin_3,minute_sin_4,minute_sin_5,minute_cos_1,minute_cos_2,minute_cos_3,minute_cos_4,minute_cos_5


### Plotting Fourier Terms

In [56]:
# One row per calendar month with its first sine Fourier term
month_cycle = (
    full_df[["month", "month_sin_1"]]
    .drop_duplicates()
    .sort_values("month")
    .rename(columns={"month": "calendar", "month_sin_1": "fourier"})
)

# Repeat the 12-month cycle three times and number the steps from 1
repeated_cycles = pd.concat([month_cycle] * 3, ignore_index=True)
repeated_cycles.insert(0, "index", repeated_cycles.index + 1)

# Long format: one row per (step, representation) pair for faceting
plot_df = repeated_cycles.melt(
    id_vars="index", var_name="month", value_name="Representation"
)

In [57]:
# Compare the calendar month (step function) with its Fourier representation,
# one facet row per representation
fig = px.line(plot_df, x="index", y="Representation", facet_row="month")
fig.update_layout(
    autosize=False,
    width=900,
    height=800,
    # Text, position and font are set together under `title`; the separate
    # `titlefont` attribute is deprecated and rejected by recent plotly releases
    title={
        "text": "Step Function vs Continuous Function",
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
        "font": {"size": 20},
    },
    legend_title=None,
    xaxis=dict(
        title_text="Time",
    ),
)
# The two representations live on different scales, so do not share the y-axis
fig.update_yaxes(matches=None)
# One tick per x position. plot_df is in long format (each position appears once
# per facet), so use the unique positions rather than the number of rows.
tick_positions = plot_df["index"].unique()
fig.update_xaxes(
    # Label every position with its month number (1-12, repeating)
    ticktext=((tick_positions - 1) % 12 + 1).tolist(),
    tickvals=tick_positions,
)
# fig.write_image("imgs/chapter_6/fourier.png")
fig.show()

# Saving the feature engineered file

In [58]:
# Summarise the dtypes and the deep memory footprint of the feature-engineered frame;
# verbose=False suppresses the per-column listing
full_df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 4560240 entries, 1008 to 32687
Columns: 96 entries, timestamp to minute_cos_5
dtypes: bool(6), category(10), datetime64[ns](1), float32(68), int32(2), object(1), uint16(1), uint8(7)
memory usage: 1.6 GB


In [25]:
# Output file for each split; the helper "type" column is dropped before writing
split_output_files = {
    "train": "selected_blocks_train_missing_imputed_feature_engg_mlforecast.parquet",
    "val": "selected_blocks_val_missing_imputed_feature_engg_mlforecast.parquet",
    "test": "selected_blocks_test_missing_imputed_feature_engg_mlforecast.parquet",
}
for split_name, file_name in split_output_files.items():
    split_rows = full_df.loc[full_df["type"] == split_name]
    split_rows.drop(columns="type").to_parquet(preprocessed / file_name)