In [1]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 1/8 - Imports + setup
# ============================================================

import numpy as np
import pandas as pd

from pathlib import Path
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# Optional (only if you use scipy distributions)
from scipy.stats import randint

RANDOM_STATE = 42
TARGET = "Sales"  # adjust if needed

REPORTS_METRICS = Path("../reports/metrics")
MODELS_DIR = Path("../models")
REPORTS_METRICS.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 2/8 - Load dataset
# ============================================================

# Prefer using your "gold" features (already engineered)
DATA_PATH = Path("../data/gold/train_features.parquet")

df = pd.read_parquet(DATA_PATH)
print("Loaded:", df.shape)
df.head(3)

Loaded: (1017209, 33)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,...,HasPromoInterval,CompetitionDistanceLog,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_b,StoreType_c,StoreType_d,Assortment_b,Assortment_c
0,1,5,2015-07-31,5263,555,1,1,1,1270.0,9,...,0,7.147559,False,False,False,False,True,False,False,False
1,2,5,2015-07-31,6064,625,1,1,1,570.0,11,...,1,6.347389,False,False,False,False,False,False,False,False
2,3,5,2015-07-31,8314,821,1,1,1,14130.0,12,...,1,9.556126,False,False,False,False,False,False,False,False


In [3]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 3/8 - Date handling
# ============================================================

# If Date exists, make it numeric and drop original datetime from X later
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df["Date_ts"] = df["Date"].astype("int64")  # NaT becomes min int64

print("Date in df?", "Date" in df.columns)
print("Date_ts in df?", "Date_ts" in df.columns)

Date in df? True
Date_ts in df? True


In [4]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 4/8 - Build X/y
# ============================================================

drop_cols = [TARGET]
if "Date" in df.columns:
    drop_cols.append("Date")  # remove datetime column

X = df.drop(columns=drop_cols, errors="ignore").copy()
y = df[TARGET].copy()

print("X:", X.shape, "| y:", y.shape)
print("Non-numeric cols (before encoding):", X.select_dtypes(exclude=["number"]).columns.tolist())

X: (1017209, 32) | y: (1017209,)
Non-numeric cols (before encoding): ['PromoInterval', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_b', 'Assortment_c']


In [5]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 5/8 - Encode categoricals (fix PromoInterval)
# ============================================================

# 1) Fix known culprit: PromoInterval (strings like 'Jan,Apr,Jul,Oct')
if "PromoInterval" in X.columns:
    X["PromoInterval"] = X["PromoInterval"].fillna("None").astype(str)
    X = pd.get_dummies(X, columns=["PromoInterval"], drop_first=True)

# 2) Encode any other remaining object columns
obj_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("Object cols BEFORE final dummies:", obj_cols)

if obj_cols:
    X = pd.get_dummies(X, columns=obj_cols, drop_first=True)

obj_cols_after = X.select_dtypes(include=["object"]).columns.tolist()
print("Object cols AFTER dummies:", obj_cols_after)

# Hard stop if anything still non-numeric
if obj_cols_after:
    for c in obj_cols_after[:10]:
        print(f"Still object: {c} | sample:", X[c].dropna().astype(str).unique()[:5])
    raise ValueError(f"Still have object cols: {obj_cols_after}")

# Final numeric cleanup
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median(numeric_only=True))
X = X.astype("float32")

print("✅ X ready:", X.shape)

Object cols BEFORE final dummies: []
Object cols AFTER dummies: []
✅ X ready: (1017209, 34)


In [6]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 6/8 - Split
# ============================================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("Train:", X_train.shape, "| Test:", X_test.shape)

Train: (813767, 34) | Test: (203442, 34)


In [7]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 7/8 - RandomizedSearchCV
# ============================================================

rf = RandomForestRegressor(
    random_state=RANDOM_STATE,
    n_jobs=-1
)

param_dist = {
    "n_estimators": randint(100, 400),
    "max_depth": randint(10, 40),
    "min_samples_split": randint(2, 30),
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="neg_root_mean_squared_error",
    random_state=RANDOM_STATE,
    n_jobs=2,
    verbose=2,
    error_score="raise"  # important: surfaces the real error immediately
)

search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("Best CV RMSE:", -search.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, max_depth=29, max_features=sqrt, min_samples_leaf=11, min_samples_split=9, n_estimators=288; total time= 1.4min
[CV] END bootstrap=True, max_depth=29, max_features=sqrt, min_samples_leaf=11, min_samples_split=9, n_estimators=288; total time= 1.5min




[CV] END bootstrap=True, max_depth=16, max_features=log2, min_samples_leaf=3, min_samples_split=24, n_estimators=187; total time=  42.4s
[CV] END bootstrap=True, max_depth=13, max_features=None, min_samples_leaf=6, min_samples_split=22, n_estimators=357; total time= 7.0min
[CV] END bootstrap=False, max_depth=21, max_features=log2, min_samples_leaf=6, min_samples_split=3, n_estimators=291; total time= 1.7min
[CV] END bootstrap=False, max_depth=21, max_features=log2, min_samples_leaf=6, min_samples_split=3, n_estimators=291; total time= 1.7min
[CV] END bootstrap=False, max_depth=21, max_features=log2, min_samples_leaf=6, min_samples_split=3, n_estimators=291; total time= 1.9min
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=12, min_samples_split=27, n_estimators=121; total time=  50.7s
[CV] END bootstrap=True, max_depth=29, max_features=sqrt, min_samples_leaf=11, min_samples_split=9, n_estimators=288; total time= 1.4min
[CV] END bootstrap=True, max_depth=16, 

In [9]:
# ============================================================
# 07 - Model Tuning (Random Forest) | Cell 8/8 - Evaluate + Save
# ============================================================

best_model = search.best_estimator_

y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("Test MAE :", mae)
print("Test RMSE:", rmse)

# Save model
model_path = MODELS_DIR / "random_forest_tuned.joblib"
joblib.dump(best_model, model_path)
print("Saved ->", model_path.resolve())

# Save metrics
metrics_path = REPORTS_METRICS / "tuning_results.csv"
pd.DataFrame([{
    "model": "RandomForest_Tuned",
    "mae": mae,
    "rmse": rmse,
    "best_params": str(search.best_params_),
    "n_features": X.shape[1],
    "n_train": X_train.shape[0],
    "n_test": X_test.shape[0],
}]).to_csv(metrics_path, index=False)

print("Saved ->", metrics_path.resolve())

Test MAE : 312.8663452627772
Test RMSE: 514.4791848342323
Saved -> /Users/edonaire/Documents/governance-first-sales-prediction/models/random_forest_tuned.joblib
Saved -> /Users/edonaire/Documents/governance-first-sales-prediction/reports/metrics/tuning_results.csv
