In [9]:
import pandas as pd
from pathlib import Path

# Input (silver) and output (gold) data directories. Both are relative
# paths, so they resolve against the kernel's working directory.
DATA_SILVER = Path("../data/silver")
DATA_GOLD = Path("../data/gold")

# Load the cleaned training table that the feature cells below extend.
silver_train_path = DATA_SILVER.joinpath("train_clean.parquet")
df = pd.read_parquet(silver_train_path)

# (rows, columns) of the frame as loaded, before any feature is added.
df.shape

(1017209, 18)

In [10]:
# Calendar features derived from the "Date" column (must be datetime-typed
# for the .dt accessor to work).
date_parts = df["Date"].dt

df["Year"] = date_parts.year
df["Month"] = date_parts.month
# NOTE: this is the ISO-8601 week number, while "Year" above is the calendar
# year. Around New Year the two can disagree (late-December dates may fall in
# ISO week 1), so (Year, Week) is not a unique week key.
df["Week"] = date_parts.isocalendar()["week"].astype(int)
df["Day"] = date_parts.day
df["DayOfYear"] = date_parts.dayofyear
# weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are Saturday and Sunday.
df["IsWeekend"] = (date_parts.weekday >= 5).astype(int)

# Preview a few of the new columns next to the source date.
preview_cols = ["Date", "Year", "Month", "Week", "IsWeekend"]
df[preview_cols].head()

Unnamed: 0,Date,Year,Month,Week,IsWeekend
0,2015-07-31,2015,7,31,0
1,2015-07-31,2015,7,31,0
2,2015-07-31,2015,7,31,0
3,2015-07-31,2015,7,31,0
4,2015-07-31,2015,7,31,0


In [11]:
# Binary promo flags: 1 where the source column equals 1, otherwise 0.
df["HasPromo"] = df["Promo"].eq(1).astype(int)
df["HasPromo2"] = df["Promo2"].eq(1).astype(int)

# 1 where "PromoInterval" holds any non-null value, otherwise 0.
df["HasPromoInterval"] = (~df["PromoInterval"].isna()).astype(int)

# Show the source columns beside the derived flags.
promo_preview = ["Promo", "Promo2", "HasPromo", "HasPromo2", "HasPromoInterval"]
df[promo_preview].head()

Unnamed: 0,Promo,Promo2,HasPromo,HasPromo2,HasPromoInterval
0,1,0,1,0,0
1,1,1,1,1,1
2,1,1,1,1,1
3,1,0,1,0,0
4,1,0,1,0,0


In [12]:
import numpy as np

In [13]:
# Log-transform the competition distance to compress its scale.
# log1p(x) = log(1 + x), so a distance of 0 maps to 0; missing values stay NaN.
competition_distance = df["CompetitionDistance"]
df["CompetitionDistanceLog"] = np.log1p(competition_distance)

# Compare the raw and transformed values side by side.
df.loc[:, ["CompetitionDistance", "CompetitionDistanceLog"]].head()

Unnamed: 0,CompetitionDistance,CompetitionDistanceLog
0,1270.0,7.147559
1,570.0,6.347389
2,14130.0,9.556126
3,620.0,6.431331
4,29910.0,10.305982


In [14]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, leaving k-1 indicator columns per k-level category.
# NOTE: this cell consumes the source columns and rebinds `df`, so it is not
# re-runnable without first re-running the cells above it.
cat_cols = [
    "StateHoliday",
    "StoreType",
    "Assortment",
]

df = df.pipe(pd.get_dummies, columns=cat_cols, drop_first=True)

# Column count changes: the originals are replaced by their indicator columns.
df.shape

(1017209, 33)

In [15]:
# Persist the engineered feature table to the gold layer.
gold_path = DATA_GOLD / "train_features.parquet"

# Create the output directory first: on a fresh checkout it may not exist
# yet, in which case the parquet write below would fail. exist_ok=True makes
# this a no-op when the directory is already there.
gold_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the DataFrame index into the file.
df.to_parquet(gold_path, index=False)

print("Saved:", gold_path)
print("Shape:", df.shape)

Saved: ../data/gold/train_features.parquet
Shape: (1017209, 33)
