# Evaluate Notebook

This notebook loads the saved model and evaluates it on the train and test sets, reporting MSE, RMSE, and R².
It also writes `reports/metrics.json` and, when the model exposes feature importances, `reports/feature_importance.csv`.


## 1) Load configuration & resolve paths

In [None]:

from pathlib import Path
import os, json
import pandas as pd
import yaml

# Locate the repo root by searching upward for params.yaml
def find_repo_root(start: Path | None = None) -> Path:
    cur = (start or Path.cwd()).resolve()
    for _ in range(10):
        if (cur / "params.yaml").exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    raise FileNotFoundError("Could not find 'params.yaml' in parent directories. Start this notebook from your repo.")

# Anchor every path on the repo root so the notebook does not depend on the
# directory it was launched from.
ROOT = find_repo_root(Path.cwd())
print(f'Resolved ROOT: {ROOT}')
PARAMS_PATH = ROOT / "params.yaml"

# Decode explicitly as UTF-8 rather than the platform's locale encoding, so
# non-ASCII values in params.yaml load the same way on every OS.
with open(PARAMS_PATH, "r", encoding="utf-8") as f:
    params = yaml.safe_load(f)

# Project directories used by the later cells.
DATA_DIR = ROOT / "data"
PROCESSED = DATA_DIR / "processed"
MODELS = ROOT / "models"
REPORTS = ROOT / "reports"
# Create any missing directory (and its parents); existing ones are left as-is.
PROCESSED.mkdir(parents=True, exist_ok=True)
MODELS.mkdir(parents=True, exist_ok=True)
REPORTS.mkdir(parents=True, exist_ok=True)

# `display` is not imported in this cell; presumably it is supplied by the
# notebook (IPython) runtime.
display(params)


## 2) Load processed data and trained model

In [None]:

from joblib import load

# Read the train and test feature tables from the processed-data directory.
X_train = pd.read_csv(PROCESSED.joinpath("X_train.csv"))
X_test = pd.read_csv(PROCESSED.joinpath("X_test.csv"))

def read_y(path: Path, n_expected: int) -> pd.Series:
    """Read the first column of a target CSV, with or without a header row.

    The file is parsed with pandas' default header handling first. If the
    resulting length is not ``n_expected``, it is parsed again treating every
    row as data (``header=None``).

    Args:
        path: CSV file to read.
        n_expected: Number of rows the returned series must have.

    Returns:
        The first column of the file, from whichever parse yields
        ``n_expected`` rows (the header-aware parse is preferred).

    Raises:
        ValueError: If neither parse yields ``n_expected`` rows.
    """
    y = pd.read_csv(path).iloc[:, 0]
    if len(y) != n_expected:
        # The first row may be data rather than a header: re-read without one.
        y2 = pd.read_csv(path, header=None).iloc[:, 0]
        if len(y2) != n_expected:
            raise ValueError(f"Inconsistent length when reading {path}: got {len(y)} (header) and {len(y2)} (no header), expected {n_expected}.")
        return y2
    return y

# Targets must line up row-for-row with their feature tables; read_y checks
# the length against the matching X frame.
y_train = read_y(PROCESSED.joinpath("y_train.csv"), n_expected=len(X_train))
y_test = read_y(PROCESSED.joinpath("y_test.csv"), n_expected=len(X_test))

# Load the persisted model, failing with a clear message if it is missing.
model_path = MODELS.joinpath("model.joblib")
if model_path.exists():
    model = load(model_path)
else:
    raise FileNotFoundError(f"Model not found at {model_path}. Run the Train notebook first.")

# Quick sanity report of what was loaded.
print("Loaded model and data.", "Shapes:", sep="\n")
print(" X_train:", X_train.shape, " X_test:", X_test.shape)
print(" y_train:", y_train.shape, " y_test:", y_test.shape)


## 3) Compute metrics (RMSE, R²)
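We compute the Mean Squared Error (MSE), take its square root to get RMSE, and compute R², each for both the train and test sets. In the resulting `metrics` dictionary, the unsuffixed keys (`rmse`, `r2`, `mse`) hold train-set values and the `_test` keys hold test-set values.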

In [None]:

from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score

# Predict once per split; every metric below reuses these predictions.
preds_train = model.predict(X_train)
preds_test = model.predict(X_test)

# Train split: MSE, RMSE (square root of MSE), and R².
mse_train = mean_squared_error(y_train, preds_train)
rmse_train = sqrt(mse_train)
r2_train = r2_score(y_train, preds_train)

# Test split: same three metrics.
mse_test = mean_squared_error(y_test, preds_test)
rmse_test = sqrt(mse_test)
r2_test = r2_score(y_test, preds_test)

# Cast to built-in floats so the dictionary serializes cleanly to JSON.
# Unsuffixed keys are train-set values; "_test" keys are test-set values.
metrics = dict(
    rmse=float(rmse_train),
    rmse_test=float(rmse_test),
    r2=float(r2_train),
    r2_test=float(r2_test),
    mse=float(mse_train),
    mse_test=float(mse_test),
)

print("Metrics:", json.dumps(metrics, indent=2), sep="\n")


## 4) Save reports

In [None]:

# Make sure the output directory exists, then write the metrics as
# pretty-printed JSON.
REPORTS.mkdir(parents=True, exist_ok=True)
metrics_path = REPORTS.joinpath("metrics.json")
metrics_path.write_text(json.dumps(metrics, indent=2))
print(f"Wrote metrics to: {metrics_path}")

# Optional: feature importances are written only when the model exposes them;
# fi_path stays None otherwise.
fi_path = None
if not hasattr(model, "feature_importances_"):
    print("Model has no feature_importances_.")
else:
    feature_names = list(X_train.columns)
    fi = pd.DataFrame({"feature": feature_names, "importance": model.feature_importances_})
    # Largest importance first.
    fi = fi.sort_values(by="importance", ascending=False)
    fi_path = REPORTS.joinpath("feature_importance.csv")
    fi.to_csv(fi_path, index=False)
    print(f"Wrote feature importances to: {fi_path}")


## 5) Visualize feature importances (optional)

In [None]:

import matplotlib.pyplot as plt

if not hasattr(model, "feature_importances_"):
    print("Model has no feature_importances_. Skipping plot.")
else:
    # Keep at most the ten largest importances, labelled by feature column.
    top = pd.Series(model.feature_importances_, index=X_train.columns)
    top = top.sort_values(ascending=False).head(10)
    display(top.to_frame(name="importance"))

    plt.figure()
    # Re-sort ascending so the largest bar is drawn at the top of the
    # horizontal bar chart.
    top.sort_values().plot.barh()
    plt.title("Top 10 Feature Importances")
    plt.tight_layout()
    plt.show()
