# Feature Engineering Toolkit

- This notebook builds reusable time-series features for retail forecasting.

Cell 1 — Imports + Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three raw tables from the baseline module's data folder.
train = pd.read_csv("../baseline_prophet_forecast/data/train.csv")
features = pd.read_csv("../baseline_prophet_forecast/data/features.csv")
stores = pd.read_csv("../baseline_prophet_forecast/data/stores.csv")

# Join step by step: first the rows of `features` matching on
# Store/Date/IsHoliday, then the rows of `stores` matching on Store.
df = train.merge(features, on=["Store", "Date", "IsHoliday"])
df = df.merge(stores, on="Store")

# Parse the date strings, then order rows by Store, Dept and Date.
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values(["Store", "Dept", "Date"])
df.head()


Cell 2 — Select a Store–Dept Series (for demonstration)

In [None]:
store = 1
dept = 1

# Keep only the rows of the chosen Store/Dept pair and index them by date.
ts = df[(df["Store"] == store) & (df["Dept"] == dept)].copy()
ts = ts.set_index("Date").sort_index()

# Weekly alignment.
# The bare "W" alias means W-SUN in pandas, so asfreq("W") reindexes onto
# Sundays; if the observations fall on any other weekday none of them match
# and every row comes back NaN. Anchor the weekly grid on the weekday of the
# first observation instead (falls back to SUN, i.e. plain "W", when the
# selection is empty).
weekday_codes = ("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
week_anchor = weekday_codes[ts.index[0].dayofweek] if len(ts) else "SUN"
ts = ts.asfreq(f"W-{week_anchor}")
ts[["Weekly_Sales", "IsHoliday"]].head()


Cell 3 — Define the Feature Function (Reusable)

This is the core deliverable. It is designed to avoid leakage by shifting before rolling.

In [None]:
def make_features(d: pd.DataFrame, target: str = "Weekly_Sales") -> pd.DataFrame:
    """Build leakage-safe features for weekly retail demand forecasting.

    Every feature derived from ``target`` only uses values from earlier rows
    (an explicit lag, or ``shift(1)`` before any window/change computation),
    so the value being predicted never feeds its own features.

    Args:
        d: Frame indexed by a ``DatetimeIndex`` in ascending order at a
            regular weekly frequency (lags and windows are counted in rows).
            Must contain ``target``; an ``IsHoliday`` column is optional.
        target: Name of the column to forecast.

    Returns:
        A copy of ``d`` (the input is not modified) with these columns added:

        - ``weekofyear``, ``month``, ``year``: calendar fields of the index.
        - ``lag_{1,2,4,8,13,26,52}``: target shifted by that many rows.
        - ``roll_{mean,std,min,max}_{4,8,12,24}``: rolling statistics over
          the previous-week series.
        - ``diff_1``, ``pct_change_1``: change between the two preceding rows.
        - ``yoy_change``: previous row relative to the row 52 rows before it.
        - ``holiday_window_3``: 1 if the row or either neighbour is a holiday.
        - ``fourier_{sin,cos}_{1,2,3}``: harmonics of ``weekofyear`` / 52.

        Rows without enough history hold NaN in the affected columns.
    """
    x = d.copy()
    y = x[target]
    # Target as known at the end of the previous week; every window/change
    # feature is built on this so the current row is never consulted.
    prev = y.shift(1)

    # Time features
    x["weekofyear"] = x.index.isocalendar().week.astype(int)
    x["month"] = x.index.month
    x["year"] = x.index.year

    # Lag features (weeks)
    for lag in [1, 2, 4, 8, 13, 26, 52]:
        x[f"lag_{lag}"] = y.shift(lag)

    # Rolling window features over the previous-week series
    for win in [4, 8, 12, 24]:
        window = prev.rolling(win)
        x[f"roll_mean_{win}"] = window.mean()
        x[f"roll_std_{win}"] = window.std()
        x[f"roll_min_{win}"] = window.min()
        x[f"roll_max_{win}"] = window.max()

    # Momentum / change features.
    # Computed on `prev`, not on the raw target: target[t] - target[t-1]
    # together with lag_1 would reveal target[t] exactly.
    # A zero base value yields +/-inf; store NaN instead so the usual
    # missing-value handling applies.
    infinities = [np.inf, -np.inf]
    x["diff_1"] = prev.diff(1)
    x["pct_change_1"] = prev.pct_change(1).replace(infinities, np.nan)

    # Holiday proximity window (captures pre/post holiday effects).
    # eq(True) maps missing or non-boolean entries to 0 so the rolling max
    # also works on an object-dtype column with NaN gaps; min_periods=1 keeps
    # the first and last rows from being zeroed just because they lack a
    # neighbour on one side.
    if "IsHoliday" in x.columns:
        holiday = x["IsHoliday"].eq(True).astype(int)
        x["holiday_window_3"] = (
            holiday.rolling(3, center=True, min_periods=1).max().astype(int)
        )
    else:
        x["holiday_window_3"] = 0

    # Year-over-year: previous week vs. 52 weeks before it (53 rows back),
    # again avoiding the current-week target.
    year_before_prev = y.shift(53)
    x["yoy_change"] = ((prev - year_before_prev) / year_before_prev).replace(
        infinities, np.nan
    )

    # Fourier terms (yearly cycle expressed in weeks; boosting models benefit)
    for k in [1, 2, 3]:
        angle = 2 * np.pi * x["weekofyear"] * k / 52
        x[f"fourier_sin_{k}"] = np.sin(angle)
        x[f"fourier_cos_{k}"] = np.cos(angle)

    return x


Cell 4 — Apply Features + Create Model-Ready Table

In [None]:
feat = make_features(ts, target="Weekly_Sales")

target_col = "Weekly_Sales"

# Drop only the rows where the target or an engineered column is missing
# (the warm-up period of the lags and rolling windows). A blanket dropna()
# would also discard rows because of gaps in unrelated columns carried over
# from the merged input tables.
engineered_cols = [c for c in feat.columns if c not in ts.columns]
feat = feat.dropna(subset=[target_col, *engineered_cols])

feature_cols = [c for c in feat.columns if c != target_col]

feat[feature_cols].head()


Save a “ready” dataset (useful for the LightGBM module):

In [None]:
# Derive the file name from the selected series so it stays in sync with
# `store` / `dept` (for store=1, dept=1 this is "features_store1_dept1.parquet").
feat.to_parquet(f"features_store{store}_dept{dept}.parquet")


Cell 5 — Quick Diagnostics: Correlation Heatmap (Top Features)

A correlation matrix across all features can be large and hard to read; restrict it to a focused subset.

In [None]:
import os

# Candidate columns for the heatmap; keep only those present in `feat`.
corr_cols = ["Weekly_Sales", "lag_1", "lag_2", "lag_4", "roll_mean_4", "roll_mean_8", "roll_std_8", "yoy_change", "holiday_window_3"]
corr_cols = [c for c in corr_cols if c in feat.columns]

plt.figure(figsize=(10, 6))
sns.heatmap(feat[corr_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Feature Correlation (Selected)")
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/feature_correlation_selected.png")
plt.show()


Cell 6 — Add a “No Leakage” Checklist (Markdown Cell)

Add a Markdown cell with:

- Lags use `.shift(lag)`
- Rolling windows use `.shift(1).rolling(win)`
- No future columns (no forward-looking labels)
- The same logic can be reused for training and inference (feature store concept)