# 07 — Report

**Purpose:** Consolidate metrics, diagnostics, and scenario impacts into a concise report for stakeholders.

**Inputs:**
- `./reports/metrics/cv_summary.csv`, `cv_summary_ridge.csv` (optional)
- `./reports/metrics/holdout_metrics.csv`, `holdout_metrics_ridge.csv` (optional)
- `./reports/predictions/holdout_predictions.csv` (RF), `holdout_predictions_ridge.csv` (Ridge, optional)
- `./reports/metrics/holdout_permutation_importance.csv` (optional)
- `./reports/metrics/holdout_shap_mean_abs.csv` (optional)
- `./reports/scenarios/scenario_*.csv` (local & t+1)

**Outputs:** none

## 1. Load artifacts

In [None]:
from pathlib import Path
import glob, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Default figure geometry applied to every plot in this report.
plt.rcParams.update({"figure.figsize": (8, 5), "figure.dpi": 120})

# Artifact directories, all rooted at ./reports relative to the working directory.
BASE = Path("./reports")
MET = BASE / "metrics"
PRED = BASE / "predictions"
SCN = BASE / "scenarios"

def sread(p):
    """Read the CSV at path ``p``; return ``None`` when that is not possible.

    A path that does not exist yields ``None`` without any message. Any
    exception raised while checking or parsing the file is reported as
    ``Read failed: <path> <error>`` on stdout and also yields ``None``, so
    optional artifacts never abort the report.
    """
    frame = None
    try:
        if p.exists():
            frame = pd.read_csv(p)
    except Exception as err:
        print("Read failed:", p, err)
    return frame

# Cross-validation summaries and hold-out metrics: Random Forest first, then Ridge.
rf_cv, rf_ho, rg_cv, rg_ho = (
    sread(MET / fname)
    for fname in (
        "cv_summary.csv",
        "holdout_metrics.csv",
        "cv_summary_ridge.csv",
        "holdout_metrics_ridge.csv",
    )
)

# Hold-out predictions for the diagnostic plots; the Ridge file may be missing.
rf_ho_p, rg_ho_p = (
    sread(PRED / fname)
    for fname in ("holdout_predictions.csv", "holdout_predictions_ridge.csv")
)

# Importance tables (optional).
permimp, shapabs = (
    sread(MET / fname)
    for fname in ("holdout_permutation_importance.csv", "holdout_shap_mean_abs.csv")
)

# Scenario files, ordered lexicographically by path.
sc_files = glob.glob(str(SCN / "scenario_*.csv"))
sc_files.sort()

## 2. Performance tables

In [None]:
from pandas.api.types import is_numeric_dtype

def cv_mean(df):
    """Average R2, MAE and RMSE over the per-fold rows of a CV summary table.

    Rows whose ``fold`` value cannot be parsed as a number are excluded before
    averaging. Metric values are coerced to numeric, with unparseable entries
    treated as missing. Returns a one-row DataFrame with columns
    ``R2``, ``MAE`` and ``RMSE``.
    """
    metrics = ("R2", "MAE", "RMSE")
    is_fold_row = pd.to_numeric(df["fold"], errors="coerce").notna()
    per_fold = df.loc[is_fold_row]
    averages = {
        metric: pd.to_numeric(per_fold[metric], errors="coerce").mean()
        for metric in metrics
    }
    return pd.DataFrame([averages])

# Show every table that was loaded and is non-empty, in a fixed order.
# CV summaries are reduced to their fold means; hold-out tables are shown as-is.
for heading, table, summarize in (
    ("RF CV mean:", rf_cv, cv_mean),
    ("Ridge CV mean:", rg_cv, cv_mean),
    ("RF Hold-out:", rf_ho, None),
    ("Ridge Hold-out:", rg_ho, None),
):
    if table is None or table.empty:
        continue
    print(heading)
    display(table if summarize is None else summarize(table))

## 3. Hold‑out diagnostics (prefer Ridge, else RF)

In [None]:
# Prefer Ridge predictions if available; otherwise fall back to RF.
pred_df = None
model_name = None

if rg_ho_p is not None and not rg_ho_p.empty:
    pred_df = rg_ho_p.copy()
    model_name = "Ridge"
elif rf_ho_p is not None and not rf_ho_p.empty:
    pred_df = rf_ho_p.copy()
    model_name = "Random Forest"

# Both columns are required by the plots below; report their absence instead of raising KeyError.
missing_cols = [] if pred_df is None else [c for c in ("y_true", "y_pred") if c not in pred_df.columns]

if pred_df is None:
    print("No hold-out predictions found for Ridge or RF.")
elif missing_cols:
    print(f"Hold-out predictions ({model_name}) lack required column(s): {missing_cols}")
else:
    import numpy as np
    import matplotlib.pyplot as plt

    x = pred_df["y_true"].to_numpy(dtype=float)
    y = pred_df["y_pred"].to_numpy(dtype=float)

    # Keep only rows where both values are finite. A single NaN would otherwise
    # turn x.min()/x.max() into NaN (skewing the reference line) and make
    # plt.hist fail on the residuals.
    finite = np.isfinite(x) & np.isfinite(y)
    dropped = int((~finite).sum())
    if dropped:
        print(f"Dropping {dropped} hold-out row(s) with non-finite y_true/y_pred.")
    x, y = x[finite], y[finite]

    if x.size == 0:
        print("No finite hold-out predictions to plot.")
    else:
        # Scatter of truth vs prediction with the y = x reference line.
        plt.figure()
        plt.scatter(x, y, alpha=0.4)
        lim = [min(x.min(), y.min()), max(x.max(), y.max())]
        plt.plot(lim, lim)
        plt.xlabel("y_true (kg/ha)")
        plt.ylabel("y_pred (kg/ha)")
        plt.title(f"Hold-out: y_true vs y_pred ({model_name})")
        plt.tight_layout()
        plt.show()

        # Residual distribution (truth minus prediction).
        resid = x - y
        plt.figure()
        plt.hist(resid, bins=30)
        plt.title(f"Hold-out residuals ({model_name})")
        plt.xlabel("Residual")
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()

## 4. Feature importance — focus on non‑lag drivers

In [None]:
def is_lag_or_roll(feat: str) -> bool:
    """Return True when a feature name denotes a lagged, rolling, or yield-derived feature.

    The name is converted with ``str`` first. Anything starting with
    ``cereal_yield`` counts (yield lags/rolls are dropped explicitly), as does
    any name containing ``_lag`` or ``_roll``.
    """
    name = str(feat)
    return name.startswith("cereal_yield") or any(
        marker in name for marker in ("_lag", "_roll")
    )

def feature_category(feat: str) -> str:
    """Map a feature name to a coarse reporting category.

    Matching is a case-insensitive substring test, and the first matching rule
    wins (temperature, precipitation, fertilizer, GDP, CO2, population, in that
    order). Names matching no rule fall into ``"Other"``.

    The value is converted with ``str`` before matching, as ``is_lag_or_roll``
    does, so a non-string label (e.g. NaN from a blank cell) is categorised as
    ``"Other"`` instead of raising ``AttributeError``.
    """
    name = str(feat).lower()
    # Ordered rules: (category label, substrings that select it).
    rules = (
        ("Climate: temperature", ("temp",)),
        ("Climate: precipitation", ("precip",)),
        ("Inputs: fertilizer", ("fertilizer",)),
        ("Macro: income", ("gdp",)),
        ("Macro: emissions", ("co2",)),
        ("Demography", ("rural_pop", "population")),
    )
    for category, needles in rules:
        if any(needle in name for needle in needles):
            return category
    return "Other"

# (A) Top non-lag features by permutation importance
perm_nonlag = None
if permimp is None or permimp.empty:
    print("No permutation importance table found.")
else:
    perm_keep = ~permimp["feature"].apply(is_lag_or_roll)
    perm_nonlag = permimp[perm_keep].sort_values("perm_importance", ascending=False)
    print("Top non-lag features (Permutation Importance):")
    display(perm_nonlag.head(15))

# (B) Top non-lag features by mean |SHAP|
shap_nonlag = None
if shapabs is None or shapabs.empty:
    print("No SHAP importance table found.")
else:
    shap_keep = ~shapabs["feature"].apply(is_lag_or_roll)
    shap_nonlag = shapabs[shap_keep].sort_values("mean_abs_shap", ascending=False)
    print("Top non-lag features (mean |SHAP|):")
    display(shap_nonlag.head(15))

# (C) Category-level averages (non-lag only)
def cat_table(df, value_col, label):
    """Print and display the per-category mean of ``value_col``.

    Each row's ``feature`` is mapped to a category via ``feature_category``;
    the category means are shown in descending order under a heading built
    from ``label``. Does nothing (returns ``None``) when ``df`` is ``None``
    or empty.
    """
    if df is not None and not df.empty:
        categories = df["feature"].apply(feature_category)
        with_category = df.assign(category=categories)
        per_category = with_category.groupby("category", as_index=False)[value_col].mean()
        ranked = per_category.sort_values(value_col, ascending=False)
        print(f"{label} — category averages:")
        display(ranked)

# Category-level averages for each importance measure, permutation first.
for frame, column, title in (
    (perm_nonlag, "perm_importance", "Permutation importance"),
    (shap_nonlag, "mean_abs_shap", "Mean |SHAP|"),
):
    cat_table(frame, column, title)

> **Interpretation (non‑lag features).**  
> Climate variables show directional but modest signal at annual resolution; temperature often edges precipitation, but effects are region‑ and season‑specific. Fertilizer and macro/demographic variables retain secondary influence, likely proxying input access, infrastructure, and technology. Prioritizing **seasonal climate features** and **input quality metrics** (irrigation share, seed types) should strengthen climate sensitivity in future iterations.

## 5. Scenario summary & map (t+1 preferred)

In [None]:
import glob
from pathlib import Path
import pandas as pd

# Prefer the last t+1 scenario file (lexicographic order); otherwise fall back
# to the last scenario file of any kind.
t1 = sorted(glob.glob(str(SCN / "scenario_*_tplus1.csv")))
chosen = t1[-1] if t1 else (sc_files[-1] if sc_files else None)

if chosen is None:
    print("No scenario files found in ./reports/scenarios/.")
else:
    sc = pd.read_csv(chosen)
    print("Using scenario file:", Path(chosen).name)
    cols = [c for c in ["Country Name","Country Code","year","y_pred_baseline","y_pred_scenario","delta_abs","delta_pct"] if c in sc.columns]

    # `cols` tolerates absent columns, so the ranking must too: without
    # delta_abs there is nothing to sort or colour by.
    has_delta = "delta_abs" in sc.columns
    if has_delta:
        display(sc.sort_values("delta_abs", ascending=False)[cols].head(15))
    else:
        print("Column 'delta_abs' not found; showing rows unsorted and skipping the map.")
        display(sc[cols].head(15))

    # Choropleth (optional)
    if has_delta and "Country Code" in sc.columns:
        try:
            import plotly.express as px
        except ImportError as e:
            print("Plotly not available for map; table shown only. Error:", e)
        else:
            try:
                # Only label the map as t+1 when a t+1 file was actually chosen.
                horizon = " (t+1)" if t1 else ""
                # Centre the diverging scale on zero so that blue always means an
                # increase and red a decrease, whatever the range of deltas.
                fig = px.choropleth(sc, locations="Country Code", color="delta_abs",
                                    color_continuous_scale="RdBu", color_continuous_midpoint=0,
                                    title=f"Scenario impact{horizon}: Δ yield (kg/ha)")
                fig.update_layout(coloraxis_colorbar_title="Δ kg/ha")
                fig.show()
            except Exception as e:
                # Best-effort: a rendering problem must not hide the table above.
                print("Map rendering failed; table shown only. Error:", e)

### How to read the map (t+1 scenario impact)
- **What’s shown:** absolute change in predicted yield (kg/ha) = **ŷ_scenario − ŷ_baseline** for the **same t+1** feature row; only nudged drivers differ.
- **Colors:** blue/positive = increase; red/negative = decrease; units are kg/ha.
- **t+1 construction:** lags/rolling features are propagated from the latest observed year to avoid look-ahead.
- **Gray countries:** no value to plot — either the data are missing or the country code could not be matched to an ISO3 code.
- **Note:** associational sensitivity around the current regime; not a causal estimate.

## 6. Environment

In [None]:
import sys, platform, numpy, pandas, matplotlib
# Record interpreter, OS, and key library versions for reproducibility.
for label, value in (
    ("Python", sys.version.split()[0]),
    ("Platform", platform.platform()),
    ("NumPy", numpy.__version__),
    ("Pandas", pandas.__version__),
    ("Matplotlib", matplotlib.__version__),
):
    print(f"{label}:", value)