# 2.Feature Engineering

This notebook constructs model-ready features for all datasets.

Feature categories:
- Lag-based features
- Rolling statistics
- Calendar-based features
- External (ERA5) weather features

Rules:
- Feature logic is identical across Phase 1, Phase 2, and Test.
- No future information is used.
- ERA5 is treated as an exogenous input.


## 2.1 Import & Feature Configuration

In [1]:
import pandas as pd
import numpy as np

LAGS = [1, 4, 96]            # 15 min, 1 hour, 1 day
ROLLING_WINDOWS = [96]      # 24 hours

## 2.2 Load Dataset

In [2]:
# -------------------------
# Load data
# -------------------------
phase1 = pd.read_csv("../data/processed/phase1.csv", parse_dates=["timestamp"])
phase2 = pd.read_csv("../data/processed/phase2.csv", parse_dates=["timestamp"])
test   = pd.read_csv("../data/processed/test.csv",   parse_dates=["timestamp"])

# Load ERA5 Weather Data
era5 = pd.read_csv(
    "../data/raw/ERA5_Weather_Data_Monash.csv"
)

# Rename datetime column to match forecasting datasets
era5 = era5.rename(columns={"datetime (UTC)": "timestamp"})

# Parse datetime properly
era5["timestamp"] = (
    pd.to_datetime(era5["timestamp"], utc=True)
      .dt.tz_localize(None)
)

# Drop non-informative metadata columns
era5 = era5.drop(columns=[
    "coordinates (lat,lon)",
    "model (name)",
    "model elevation (surface)",
    "utc_offset (hrs)"
])

era5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100057 entries, 0 to 100056
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   timestamp                          100057 non-null  datetime64[ns]
 1   temperature (degC)                 100057 non-null  float64       
 2   dewpoint_temperature (degC)        100057 non-null  float64       
 3   wind_speed (m/s)                   100057 non-null  float64       
 4   mean_sea_level_pressure (Pa)       100057 non-null  float64       
 5   relative_humidity ((0-1))          100057 non-null  float64       
 6   surface_solar_radiation (W/m^2)    100057 non-null  float64       
 7   surface_thermal_radiation (W/m^2)  100057 non-null  float64       
 8   total_cloud_cover (0-1)            100057 non-null  float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 6.9 MB


## 2.3 Time-Based Feature Engineering

In [3]:
def add_time_features(df):
    df = df.copy()
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    return df

phase1 = add_time_features(phase1)
phase2 = add_time_features(phase2)
test   = add_time_features(test)

## 2.4 Lag Features

In [4]:
LAGS = [1, 4, 96]  
# 1 = 15 min
# 4 = 1 hour
# 96 = 1 day

def add_lag_features(df, lags):
    df = df.sort_values(["series_id", "timestamp"])
    for lag in lags:
        df[f"lag_{lag}"] = (
            df.groupby("series_id")["value"]
              .shift(lag)
        )
    return df

phase1 = add_lag_features(phase1, LAGS)
phase2 = add_lag_features(phase2, LAGS)
test   = add_lag_features(test, LAGS)

## 2.5 Merge ERA5 (Exogeneous Variable)

In [5]:
# Filter training window: Jan 2016 - Sep 2020
phase1 = phase1[phase1["timestamp"] < "2020-10-01"]

# Filter validation window: October 2020 saja
phase2 = phase2[
    (phase2["timestamp"] >= "2020-10-01") &
    (phase2["timestamp"] < "2020-11-01")
]

# Filter test window: November 2020 saja
test = test[
    (test["timestamp"] >= "2020-11-01") &
    (test["timestamp"] < "2020-12-01")
]

def merge_era5(ts_df, era5_df):
    ts_df = ts_df.sort_values("timestamp")
    era5_df = era5_df.sort_values("timestamp")

    merged = pd.merge_asof(
        ts_df,
        era5_df,
        on="timestamp",
        direction="backward"
    )
    return merged

phase1 = merge_era5(phase1, era5)
phase2 = merge_era5(phase2, era5)
test   = merge_era5(test, era5)

In [6]:
phase1[[
    "timestamp",
    "temperature (degC)",
    "wind_speed (m/s)",
    "surface_solar_radiation (W/m^2)"
]].head(10)



Unnamed: 0,timestamp,temperature (degC),wind_speed (m/s),surface_solar_radiation (W/m^2)
0,2016-03-02 02:01:00,30.74,1.09,847.8
1,2016-03-02 02:16:00,30.74,1.09,847.8
2,2016-03-02 02:31:00,30.74,1.09,847.8
3,2016-03-02 02:46:00,30.74,1.09,847.8
4,2016-03-02 03:01:00,31.03,1.22,627.78
5,2016-03-02 03:16:00,31.03,1.22,627.78
6,2016-03-02 03:31:00,31.03,1.22,627.78
7,2016-03-02 03:46:00,31.03,1.22,627.78
8,2016-03-02 04:01:00,31.59,1.68,808.03
9,2016-03-02 04:16:00,31.59,1.68,808.03


## 2.6 Handle Missing Value

In [8]:
def clean_features(df):
    df = df.copy()

    # Drop rows with incomplete lag history
    lag_cols = [c for c in df.columns if c.startswith("lag_")]
    df = df.dropna(subset=lag_cols)

    # Forward-fill exogenous weather variables
    weather_cols = [
        "temperature (degC)",
        "wind_speed (m/s)",
        "surface_solar_radiation (W/m^2)"
    ]
    df[weather_cols] = df[weather_cols].fillna(method="ffill")

    return df

phase1 = clean_features(phase1)
phase2 = clean_features(phase2)
test   = clean_features(test)

def missing_summary(df):
    return (
        df.isna()
          .sum()
          .sort_values(ascending=False)
    )

print('Missing Summary Phase 1:\n', missing_summary(phase1))
print('\nMissing Summary Phase 2:\n', missing_summary(phase2))
print('\nMissing Summary Test:\n', missing_summary(test))

Missing Summary Phase 1:
 series_id                            0
timestamp                            0
value                                0
hour                                 0
dayofweek                            0
day                                  0
month                                0
is_weekend                           0
lag_1                                0
lag_4                                0
lag_96                               0
temperature (degC)                   0
dewpoint_temperature (degC)          0
wind_speed (m/s)                     0
mean_sea_level_pressure (Pa)         0
relative_humidity ((0-1))            0
surface_solar_radiation (W/m^2)      0
surface_thermal_radiation (W/m^2)    0
total_cloud_cover (0-1)              0
dtype: int64

Missing Summary Phase 2:
 series_id                            0
timestamp                            0
value                                0
hour                                 0
dayofweek                            

  df[weather_cols] = df[weather_cols].fillna(method="ffill")
  df[weather_cols] = df[weather_cols].fillna(method="ffill")
  df[weather_cols] = df[weather_cols].fillna(method="ffill")


## 2.7 Save Dataset for Modeling

In [9]:
phase1.to_csv("../data/modeling/train.csv", index=False)
phase2.to_csv("../data/modeling/validation.csv", index=False)
test.to_csv("../data/modeling/test.csv", index=False)