# 02 — Feature Engineering

Build ML-ready features for the M5 sales data: per-series lags, rolling means, calendar/time fields, and sell prices.

## Load raw data (last 365 days)

In [1]:
from pathlib import Path

import pandas as pd

# --- Load the raw M5 tables, keeping only the most recent 365 daily columns ---

# Header-only read (nrows=0): discover the column names without parsing any rows.
cols = pd.read_csv("../data/raw/m5/sales_train_validation.csv", nrows=0).columns

# Identifier columns, followed by the trailing 365 "d_*" columns in file order.
id_cols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
day_cols = [name for name in cols if name.startswith("d_")]
day_cols = day_cols[-365:]
use_cols = id_cols + day_cols

# Parse just the selected columns of the sales file; the lookup tables are read whole.
sales = pd.read_csv("../data/raw/m5/sales_train_validation.csv", usecols=use_cols)
calendar = pd.read_csv("../data/raw/m5/calendar.csv")
prices = pd.read_csv("../data/raw/m5/sell_prices.csv")

sales.head()


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1549,d_1550,d_1551,d_1552,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,1,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,3,0,2,1,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,2,1,2,...,2,1,1,0,1,1,2,2,2,4


In [2]:
# Keep a reproducible random subset of 1,000 series (fixed seed) so the
# remaining cells run quickly; row order of the kept series is unchanged.
sample_ids = sales["id"].sample(n=1000, random_state=42)
in_sample = sales["id"].isin(sample_ids)
sales = sales.loc[in_sample]

## Melt & merge

In [3]:
# Reshape wide -> long (one row per series and day), attach the calendar row
# for each day label and the weekly sell price for each store/item, then parse
# the date column and order rows by series and date.
# Both merges are left joins, so unmatched rows are kept with NaN in the
# joined columns.
df = (
    sales
    .melt(
        id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
        var_name="d",
        value_name="sales",
    )
    .merge(calendar, how="left", on="d")
    .merge(prices, how="left", on=["store_id", "item_id", "wm_yr_wk"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by=["id", "date"])
)


In [4]:
# Sanity check: 1,000 sampled series x 365 days should give 365,000 rows
# if the left merges did not duplicate any rows.
df.shape

(365000, 22)

## Time features

In [5]:
# Calendar-position features derived from the parsed date column.
date_parts = df["date"].dt

df["dow"] = date_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df["week"] = date_parts.isocalendar()["week"].astype(int)  # ISO week number
# NOTE(review): the merged calendar table appears to already carry a `month`
# column (the saved frame has 30 columns and no `month` after `week`), so this
# assignment overwrites it in place rather than adding a new column — confirm.
df["month"] = date_parts.month


## Lag features

In [6]:
# Per-series lagged sales: lag_k holds the value from k rows earlier within
# the same id, so the first k rows of every series are NaN.
# Relies on rows being in date order within each id.
sales_by_id = df.groupby("id")["sales"]
for lag in (1, 7, 14, 28):
    df[f"lag_{lag}"] = sales_by_id.shift(lag)


## Rolling statistics

In [7]:
# Trailing means of past sales, computed independently for every series.
#
# Both the shift and the rolling window are evaluated inside each id group, so
# a window can never mix values from two different series, however the rows of
# df happen to be ordered across ids. Rows within one id must still be in date
# order for the window to mean "the previous N days".
#
# shift(1) drops the current day from its own window, so each feature uses
# only earlier days. rolling() needs a full window by default, so the first
# `window` rows of every series are NaN.
grouped_sales = df.groupby("id")["sales"]
for window in (7, 28):
    df[f"rmean_{window}"] = grouped_sales.transform(
        lambda s, w=window: s.shift(1).rolling(w).mean()
    )


## Drop NA & save

In [8]:
# Columns the model consumes. A row is kept only when every one of them is
# present: the lag/rolling columns are NaN at the start of each series, and
# sell_price is NaN where the left merge found no matching price row.
FEATURE_COLS = [f"lag_{k}" for k in (1, 7, 14, 28)] + [
    "rmean_7",
    "rmean_28",
    "sell_price",
]

has_all_features = df[FEATURE_COLS].notna().all(axis=1)
df_model = df[has_all_features]

df_model.shape


(335758, 30)

In [9]:
# Persist the model-ready table, then preview it.
# The output directory is created first so the notebook also runs on a fresh
# checkout where ../data/processed does not exist yet.
features_path = Path("../data/processed/features.parquet")
features_path.parent.mkdir(parents=True, exist_ok=True)

df_model.to_parquet(features_path)
df_model.head()


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,snap_WI,sell_price,dow,week,lag_1,lag_7,lag_14,lag_28,rmean_7,rmean_28
28668,FOODS_1_003_TX_3_validation,FOODS_1_003,FOODS_1,FOODS,TX_3,TX,d_1577,0,2015-05-24,11517,...,0,3.23,6,21,0.0,0.0,0.0,0.0,0.0,0.035714
29668,FOODS_1_003_TX_3_validation,FOODS_1_003,FOODS_1,FOODS,TX_3,TX,d_1578,0,2015-05-25,11517,...,0,3.23,0,22,0.0,0.0,0.0,0.0,0.0,0.035714
30668,FOODS_1_003_TX_3_validation,FOODS_1_003,FOODS_1,FOODS,TX_3,TX,d_1579,0,2015-05-26,11517,...,0,3.23,1,22,0.0,0.0,0.0,0.0,0.0,0.035714
31668,FOODS_1_003_TX_3_validation,FOODS_1_003,FOODS_1,FOODS,TX_3,TX,d_1580,0,2015-05-27,11517,...,0,3.23,2,22,0.0,0.0,0.0,0.0,0.0,0.035714
32668,FOODS_1_003_TX_3_validation,FOODS_1_003,FOODS_1,FOODS,TX_3,TX,d_1581,0,2015-05-28,11517,...,0,3.23,3,22,0.0,0.0,0.0,0.0,0.0,0.035714


In [10]:
# Number of rows left after dropping rows with missing features.
len(df_model)


335758

In [11]:
# Distribution of surviving rows per series. A series with no other gaps keeps
# 337 = 365 - 28 rows (the 28-day lag and rolling mean are undefined for its
# first 28 days). Shorter series lost further rows, presumably to missing
# sell_price — TODO confirm.
df_model.groupby("id").size().describe()

count    1000.000000
mean      335.758000
std        13.874362
min        72.000000
25%       337.000000
50%       337.000000
75%       337.000000
max       337.000000
dtype: float64

In [12]:
# First and last calendar date present in the saved feature table.
df_model["date"].min(), df_model["date"].max()


(Timestamp('2015-05-24 00:00:00'), Timestamp('2016-04-24 00:00:00'))