# 04 — Modeling

**Purpose:** Train Random Forest and Ridge with time‑aware CV and last‑5‑years hold‑out. Save predictions/metrics and permutation importance for **both** models.

**Inputs:** `../data/processed/countries_features.csv` (the path read by the Setup cell, relative to the notebook's working directory)

**Outputs (CSV only):**
- `./reports/metrics/cv_summary.csv` (RF)
- `./reports/metrics/holdout_metrics.csv` (RF)
- `./reports/predictions/cv_predictions.csv` (RF)
- `./reports/predictions/holdout_predictions.csv` (RF)
- `./reports/predictions/residuals_all_data.csv` (RF; optional)
- `./reports/metrics/holdout_permutation_importance.csv` (RF; optional)
- `./reports/metrics/cv_summary_ridge.csv` (Ridge; optional)
- `./reports/metrics/holdout_metrics_ridge.csv` (Ridge; optional)
- `./reports/predictions/holdout_predictions_ridge.csv` (Ridge; optional)
- `./reports/metrics/holdout_permutation_importance_ridge.csv` (Ridge; optional)

**Key decisions/assumptions:** Countries only; no target imputation; median imputer in‑pipeline; time‑aware CV; last‑5y hold‑out.

## 1) Setup

In [None]:
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Input feature table and the output directory tree for this notebook.
INP = Path("../data/processed/countries_features.csv")
REPORTS = Path("./reports")
REPORTS_METRICS = REPORTS / "metrics"
REPORTS_PRED = REPORTS / "predictions"

# Create the report directories up front so later cells can write CSVs directly.
REPORTS.mkdir(exist_ok=True)
REPORTS_METRICS.mkdir(parents=True, exist_ok=True)
REPORTS_PRED.mkdir(parents=True, exist_ok=True)

print("Features:", INP.resolve())

## 2) Load features & assemble sets

In [None]:
# Load the processed feature table and build the modeling matrices
# (X, y, years, codes) plus the time-ordered CV splitter.
df = pd.read_csv(INP)
TARGET = "cereal_yield"

# Identifier / bookkeeping columns that must never be used as predictors.
id_cols = ["Country Name", "Country Code"]
non_feature = {TARGET, "year", *id_cols}
numeric_like = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
# Also drop numeric columns named lag0_* or *_future
# (presumably same-year / forward-looking values that would leak the target —
# TODO confirm against the feature-engineering step).
exclude = set(non_feature)
exclude |= {c for c in numeric_like if c.startswith("lag0_") or c.endswith("_future")}
FEATS = [c for c in numeric_like if c not in exclude]

# Keep labeled rows only; the target is never imputed.
mask = df[TARGET].notna()
dfm = df.loc[mask].copy()
if "year" in dfm.columns:
    dfm["year"] = pd.to_numeric(dfm["year"], errors="coerce").astype("Int64")
    # Sort chronologically BEFORE slicing X / y / years / codes.
    # TimeSeriesSplit builds folds purely from row position, so the rows it
    # sees must already be in time order for the CV to be time-aware.
    # A stable sort keeps the file's original row order within each year;
    # rows with a missing year end up last. Index labels are preserved so
    # later label-based lookups into dfm still line up.
    dfm = dfm.sort_values("year", kind="mergesort")

X = dfm[FEATS]
y = dfm[TARGET]
years = dfm["year"] if "year" in dfm.columns else pd.Series([np.nan]*len(dfm), index=dfm.index)
codes = dfm["Country Code"] if "Country Code" in dfm.columns else None

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

print("n_features:", len(FEATS), "| labeled rows:", X.shape[0])

## 3) Random Forest — CV & Hold‑out

In [None]:
# Random Forest: median imputation inside the pipeline so imputation statistics
# are learned on each training split only.
rf_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("rf", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)),
])

# --- Cross-validation over the folds produced by tscv ---
cv_rows = []       # one metrics row per fold
cv_pred_rows = []  # one row per validation prediction
for fold, (tr, va) in enumerate(tscv.split(X), start=1):
    X_tr, y_tr = X.iloc[tr], y.iloc[tr]
    X_va, y_va = X.iloc[va], y.iloc[va]
    y_va_np = y_va.to_numpy(dtype=float)

    rf_pipe.fit(X_tr, y_tr)
    p = rf_pipe.predict(X_va)

    r2 = r2_score(y_va_np, p)
    mae = mean_absolute_error(y_va_np, p)
    # RMSE as sqrt(MSE): works on every scikit-learn version, whereas the
    # `squared=False` keyword was deprecated and later removed.
    rmse = mean_squared_error(y_va_np, p) ** 0.5
    cv_rows.append({"fold": fold, "R2": r2, "MAE": mae, "RMSE": rmse})

    yrs_va = years.iloc[va].values
    codes_va = codes.iloc[va].values if codes is not None else np.array([None]*len(va))
    # pd.isna handles both float NaN and pd.NA; the nullable Int64 year column
    # yields pd.NA, for which an `x == x` self-comparison cannot be used in `if`.
    cv_pred_rows.extend(
        {
            "fold": fold,
            "year": None if pd.isna(yr) else int(yr),
            "Country Code": None if codes is None else code,
            "y_true": float(y_true_i),
            "y_pred": float(y_pred_i),
        }
        for yr, code, y_true_i, y_pred_i in zip(yrs_va, codes_va, y_va_np, p)
    )

pd.DataFrame(cv_rows).to_csv(REPORTS_METRICS / "cv_summary.csv", index=False)
pd.DataFrame(cv_pred_rows).to_csv(REPORTS_PRED / "cv_predictions.csv", index=False)

# --- Hold-out: the most recent HOLDOUT_YEARS calendar years ---
HOLDOUT_YEARS = 5
if years.isna().all():
    raise ValueError("Hold-out split needs at least one non-missing 'year' value.")
last_year = int(years.max())
# Inclusive window [last_year - (HOLDOUT_YEARS - 1), last_year].
ho_mask = years >= (last_year - (HOLDOUT_YEARS - 1)); tr_mask = ~ho_mask
X_tr, y_tr = X.loc[tr_mask], y.loc[tr_mask]
X_ho, y_ho = X.loc[ho_mask], y.loc[ho_mask]

# Refit on everything before the hold-out window, then score the window.
rf_pipe.fit(X_tr, y_tr)
p_ho = rf_pipe.predict(X_ho)

rmse = mean_squared_error(y_ho, p_ho) ** 0.5

pd.DataFrame([{
    "R2": r2_score(y_ho, p_ho), "MAE": mean_absolute_error(y_ho, p_ho), "RMSE": rmse,
    "years": f"{int(years[ho_mask].min())}-{int(years[ho_mask].max())}"
}]).to_csv(REPORTS_METRICS / "holdout_metrics.csv", index=False)

# Re-attach identifiers by index label so each prediction is traceable to a
# country/year, then keep only the identifier and prediction columns.
df_ho = X_ho.copy()
for c in ["Country Name","Country Code","year"]:
    if c in dfm.columns:
        df_ho[c] = dfm.loc[df_ho.index, c].values
df_ho["y_true"] = y_ho.values
df_ho["y_pred"] = p_ho
ho_cols = [c for c in ["Country Name","Country Code","year","y_true","y_pred"] if c in df_ho.columns]
df_ho[ho_cols].to_csv(REPORTS_PRED / "holdout_predictions.csv", index=False)
print("Saved RF metrics & predictions.")

## 4) Ridge — CV & Hold‑out

In [None]:
from sklearn.preprocessing import StandardScaler

# Ridge: impute, then standardize, so the L2 penalty acts on comparable scales.
ridge_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("ridge", Ridge(alpha=1.0, solver="auto"))
])

# --- Cross-validation over the same folds used for the Random Forest ---
ridge_cv_rows = []
for fold, (tr, va) in enumerate(tscv.split(X), start=1):
    X_tr, y_tr = X.iloc[tr], y.iloc[tr]
    X_va, y_va = X.iloc[va], y.iloc[va]
    ridge_pipe.fit(X_tr, y_tr)
    p = ridge_pipe.predict(X_va)
    # RMSE as sqrt(MSE): independent of the removed `squared=False` keyword.
    rmse = mean_squared_error(y_va, p) ** 0.5
    ridge_cv_rows.append({"fold": fold, "R2": r2_score(y_va, p), "MAE": mean_absolute_error(y_va, p), "RMSE": rmse})

# Per-fold metrics plus a final "mean" summary row.
ridge_cv = pd.DataFrame(ridge_cv_rows)
ridge_cv.loc["mean"] = {"fold": "mean", **ridge_cv[["R2","MAE","RMSE"]].mean().to_dict()}
ridge_cv.to_csv(REPORTS_METRICS / "cv_summary_ridge.csv", index=False)

# --- Hold-out ---
# The CV loop above rebinds X_tr / y_tr to the last fold's training slice.
# Rebuild the train / hold-out sets from the year masks so Ridge is fit on
# exactly the pre-hold-out rows and never sees hold-out rows during training.
X_tr, y_tr = X.loc[tr_mask], y.loc[tr_mask]
X_ho, y_ho = X.loc[ho_mask], y.loc[ho_mask]

ridge_pipe.fit(X_tr, y_tr)
p_ho_r = ridge_pipe.predict(X_ho)
rmse_r = mean_squared_error(y_ho, p_ho_r) ** 0.5

pd.DataFrame([{
    "R2": r2_score(y_ho, p_ho_r), "MAE": mean_absolute_error(y_ho, p_ho_r), "RMSE": rmse_r,
    "years": f"{int(years[ho_mask].min())}-{int(years[ho_mask].max())}"
}]).to_csv(REPORTS_METRICS / "holdout_metrics_ridge.csv", index=False)

# Re-attach identifiers by index label, then keep only identifier and
# prediction columns in the saved file.
df_ho_r = X_ho.copy()
for c in ["Country Name","Country Code","year"]:
    if c in dfm.columns:
        df_ho_r[c] = dfm.loc[df_ho_r.index, c].values
df_ho_r["y_true"] = y_ho.values
df_ho_r["y_pred"] = p_ho_r
ho_cols_r = [c for c in ["Country Name","Country Code","year","y_true","y_pred"] if c in df_ho_r.columns]
df_ho_r[ho_cols_r].to_csv(REPORTS_PRED / "holdout_predictions_ridge.csv", index=False)
print("Saved Ridge metrics & predictions.")

## 5) Permutation importance on hold‑out (both models)

In [None]:
from sklearn.inspection import permutation_importance

# Settings shared by both permutation-importance runs.
perm_kwargs = dict(n_repeats=5, random_state=42, n_jobs=1)

# Random Forest: best-effort — a failure is reported and the notebook continues.
try:
    result_rf = permutation_importance(rf_pipe, X_ho, y_ho, **perm_kwargs)
    imp_rf = pd.DataFrame({
        "feature": X.columns,
        "perm_importance": result_rf.importances_mean,
    })
    # Most important feature first.
    imp_rf = imp_rf.sort_values("perm_importance", ascending=False)
    imp_rf.to_csv(REPORTS_METRICS / "holdout_permutation_importance.csv", index=False)
    print("Saved RF permutation importance.")
except Exception as e:
    print("RF permutation importance skipped:", e)

# Ridge: same procedure, written to its own file.
try:
    result_rg = permutation_importance(ridge_pipe, X_ho, y_ho, **perm_kwargs)
    imp_rg = pd.DataFrame({
        "feature": X.columns,
        "perm_importance": result_rg.importances_mean,
    })
    imp_rg = imp_rg.sort_values("perm_importance", ascending=False)
    imp_rg.to_csv(REPORTS_METRICS / "holdout_permutation_importance_ridge.csv", index=False)
    print("Saved Ridge permutation importance.")
except Exception as e:
    print("Ridge permutation importance skipped:", e)

## 6) Environment

In [None]:
import platform
import sys

import numpy
import pandas
import sklearn

# Record interpreter, OS and key library versions for reproducibility.
env_report = (
    ("Python:", sys.version.split()[0]),
    ("Platform:", platform.platform()),
    ("NumPy:", numpy.__version__),
    ("Pandas:", pandas.__version__),
    ("scikit-learn:", sklearn.__version__),
)
for env_label, env_value in env_report:
    print(env_label, env_value)