### Model Evaluation & Reports

In [1]:
# =========================
# Day 5: Evaluation, Diagnostics, Model Card (Local only)
# - Read-only from SQL or local files
# - No Azure writes
# =========================

import os, re, json, glob, warnings
from datetime import datetime
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    confusion_matrix, roc_auc_score, roc_curve,
    precision_recall_curve, f1_score, accuracy_score
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# ---------- Paths ----------
# Every directory is resolved against the current working directory.
ROOT = os.path.abspath(os.getcwd())
ARTIF_DIR   = os.path.join(ROOT, "models")    # saved model pickles are looked up here
REPORTS_DIR = os.path.join(ROOT, "reports")   # CSV / PNG / JSON reports
DOCS_DIR    = os.path.join(ROOT, "docs")      # MODEL_CARD.md is written here
INPUTS_DIR  = os.path.join(ROOT, "_inputs")   # local data fallback files

for _dir in (ARTIF_DIR, REPORTS_DIR, DOCS_DIR, INPUTS_DIR):
    os.makedirs(_dir, exist_ok=True)

# UTC timestamp embedded in the file names written by this run.
# datetime.utcnow() is deprecated since Python 3.12; a timezone-aware "now"
# in UTC formats to exactly the same string.
from datetime import timezone
STAMP = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

# ---------- 0) Engine import (your working one) ----------
# Make a small db_connect.py at project root that defines `engine`
#   from sqlalchemy import create_engine
#   engine = create_engine("<YOUR WORKING CONNECTION STRING>")
try:
    from db_connect import engine   # <--- uses your proven engine
except Exception as e:
    # Deliberately broad: any failure while importing db_connect (module
    # missing, or an error raised inside it) just disables the SQL path.
    engine = None
    print("No engine found, will use local fallback if needed. Err:", repr(e))

# ---------- 1) Data load (same logic as Day 4) ----------
def read_from_sql_or_fallback():
    """Load the sales data, preferring SQL and falling back to local files.

    Order of attempts:
      1. ``dbo.v_processed_sales`` through the module-level ``engine``
         (only when ``engine`` is not None);
      2. a minimal projection of ``dbo.fact_sales`` when the view cannot be read;
      3. ``_inputs/processed_sales.parquet``;
      4. ``_inputs/fact_sales.csv``.

    A failed or empty SQL read is reported on stdout and the local files are
    tried next.

    Returns:
        pandas.DataFrame with the loaded rows.

    Raises:
        FileNotFoundError: if SQL yielded nothing and neither local file exists.
    """
    if engine is not None:
        try:
            with engine.begin() as conn:
                # Prefer a processed view if you created it; else minimal fact.
                try:
                    pd.read_sql("SELECT TOP (1) * FROM dbo.v_processed_sales", conn)
                    query = "SELECT * FROM dbo.v_processed_sales"
                except Exception as probe_err:
                    # Best-effort probe: say why the view was skipped instead
                    # of swallowing the error silently.
                    print("View dbo.v_processed_sales not readable, using dbo.fact_sales:",
                          repr(probe_err))
                    query = """
                    SELECT f.OrderID, f.CustomerID, f.ProductID, f.Region,
                           f.OrderDate, f.ShipDate, f.Sales
                    FROM dbo.fact_sales f
                    """
                df = pd.read_sql(query, conn)
            if not df.empty:
                return df
            print("SQL returned no rows, fallback to local.")
        except Exception as e:
            print("SQL read failed, fallback to local:", repr(e))

    # Local fallback: first existing file wins, parquet before CSV.
    local_sources = (
        ("processed_sales.parquet", pd.read_parquet),
        ("fact_sales.csv", pd.read_csv),
    )
    for file_name, loader in local_sources:
        path = os.path.join(INPUTS_DIR, file_name)
        if os.path.exists(path):
            return loader(path)

    raise FileNotFoundError("No SQL and no _inputs/processed_sales.parquet or _inputs/fact_sales.csv")

# Load the working DataFrame once; read_from_sql_or_fallback raises
# FileNotFoundError when neither SQL nor a local input file is available.
df = read_from_sql_or_fallback()
print("Loaded:", df.shape)
# Show the first three rows as a quick sanity check of the loaded columns.
print(df.head(3))

# ---------- 2) Rebuild derived columns (safe) ----------
def to_datetime_safe(s):
    """Parse ``s`` as day-first dates, turning unparseable values into NaT.

    ``s`` is handed to ``pandas.to_datetime`` unchanged, so anything that
    function accepts (scalar, list, Series) works here.
    """
    parse_options = {"errors": "coerce", "dayfirst": True}
    return pd.to_datetime(s, **parse_options)

# Parse the date columns unless they already hold datetimes.
# pandas' own dtype check is used because np.issubdtype raises TypeError on
# pandas extension dtypes (for example timezone-aware datetimes).
for dcol in ["OrderDate", "ShipDate"]:
    if dcol in df.columns and not pd.api.types.is_datetime64_any_dtype(df[dcol]):
        df[dcol] = to_datetime_safe(df[dcol])

# Derive (order)
if "OrderDate" in df.columns:
    df["OrderYear"] = df["OrderDate"].dt.year
    df["OrderMonth"] = df["OrderDate"].dt.month
    df["OrderQuarter"] = df["OrderDate"].dt.quarter
    # Nullable Int64 keeps rows whose date failed to parse (NaT) representable.
    df["OrderWeekOfYear"] = df["OrderDate"].dt.isocalendar().week.astype("Int64")
    df["OrderMonthName"] = df["OrderDate"].dt.month_name()
    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    df["OrderIsWeekendOrder"] = df["OrderDate"].dt.dayofweek.isin([5, 6]).astype(int)
    df["OrderYearMonth"] = df["OrderDate"].dt.to_period("M").astype(str)

# Derive (ship)
if "ShipDate" in df.columns:
    df["ShipYear"] = df["ShipDate"].dt.year
    df["ShipMonth"] = df["ShipDate"].dt.month
    df["ShipQuarter"] = df["ShipDate"].dt.quarter
    df["ShipWeekOfYear"] = df["ShipDate"].dt.isocalendar().week.astype("Int64")
    df["ShipMonthName"] = df["ShipDate"].dt.month_name()
    df["ShipIsWeekendShip"] = df["ShipDate"].dt.dayofweek.isin([5, 6]).astype(int)

if {"OrderDate", "ShipDate"}.issubset(df.columns):
    df["DaysToShip"] = (df["ShipDate"] - df["OrderDate"]).dt.days

# Clean strings
for col in ["ShipMode","Segment","Country","State","Region","Category","SubCategory","ProductName","CustomerName","City","PostalCode"]:
    if col in df.columns and df[col].dtype == object:
        # A Series has no .strip(); the element-wise string method lives on the
        # .str accessor. Calling .strip() directly raised AttributeError.
        df[col] = df[col].astype(str).str.strip()

# Drop bad rows: missing keys / dates first, then non-numeric or negative Sales.
need = [c for c in ["OrderID","ProductID","CustomerID","OrderDate","Sales"] if c in df.columns]
df = df.dropna(subset=need)
df["Sales"] = pd.to_numeric(df["Sales"], errors="coerce")
df = df.dropna(subset=["Sales"])
df = df[df["Sales"] >= 0]

print("Post-clean:", df.shape)

# ---------- 3) Build feature matrix same as Day 4 ----------
target_col = "Sales"

# Kept minimal on purpose (matches yesterday's run).
categorical_features = ["Region", "OrderMonthName"]
numeric_features = [
    "OrderYear", "OrderMonth", "OrderQuarter",
    "OrderWeekOfYear", "OrderIsWeekendOrder",
]

# Identifiers, free text, raw dates and every ship-side column are removed
# from the feature frame; only those actually present in df are kept in the list.
_excluded = (
    "CustomerName", "City", "PostalCode", "ProductName",
    "OrderID", "CustomerID", "ProductID",
    "OrderDate", "ShipDate",
    "ShipYear", "ShipMonth", "ShipQuarter", "ShipYearMonth", "ShipWeekOfYear",
    "ShipMonthName", "ShipIsWeekendShip", "DaysToShip",
    "OrderYearMonth",
)
drop_cols = [name for name in _excluded if name in df.columns]

X_all = df.drop(columns=[target_col, *drop_cols], errors="ignore")
y_all = df[target_col].astype(float)

# Time-aware split: rows up to the 80th-percentile order date train the
# models, everything later is held out for testing.
cutoff = df["OrderDate"].quantile(0.80)
train_mask = df["OrderDate"] <= cutoff
_holdout_mask = ~train_mask

X_train = X_all.loc[train_mask].copy()
y_train = y_all.loc[train_mask].copy()
X_test = X_all.loc[_holdout_mask].copy()
y_test = y_all.loc[_holdout_mask].copy()

print("Cutoff:", cutoff.date(), "| Train:", X_train.shape, "| Test:", X_test.shape)

# ---------- 4) Reload latest saved models & metrics ----------
def latest_file(pattern):
    """Return the most recently modified path matching ``pattern``.

    ``pattern`` is a glob expression. Returns None when nothing matches.
    """
    return max(glob.glob(pattern), key=os.path.getmtime, default=None)

# Newest artifact of each kind, or None when no file matches the pattern.
reg_model_path = latest_file(os.path.join(ARTIF_DIR, "regression_*.pkl"))
cls_model_path = latest_file(os.path.join(ARTIF_DIR, "classifier_*.pkl"))
reg_metrics_path = latest_file(os.path.join(REPORTS_DIR, "regression_metrics_*.json"))
cls_metrics_path = latest_file(os.path.join(REPORTS_DIR, "classifier_metrics_*.json"))

print("Loaded model files:")
print(" - regression:", reg_model_path)
print(" - classifier:", cls_model_path)

# The rest of the script calls methods on both models unconditionally, so a
# missing artifact is reported here instead of surfacing later as an
# AttributeError on None.
_missing_models = [
    label
    for label, path in (("regression", reg_model_path), ("classifier", cls_model_path))
    if path is None
]
if _missing_models:
    raise FileNotFoundError(
        f"No saved {' / '.join(_missing_models)} model found in {ARTIF_DIR} "
        "(expected regression_*.pkl and classifier_*.pkl)"
    )

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts you produced yourself.
best_reg = joblib.load(reg_model_path)
best_cls = joblib.load(cls_model_path)


def _load_metrics_json(path):
    """Read a metrics JSON file, or return an empty result set when path is None."""
    if not path:
        return {"results": {}}
    # Context manager so the handle is closed; JSON text is UTF-8 by specification.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


prev_reg_json = _load_metrics_json(reg_metrics_path)
prev_cls_json = _load_metrics_json(cls_metrics_path)

# ---------- 5) Evaluate regression in depth ----------
reg_preds = best_reg.predict(X_test)

# Headline metrics on the held-out (later) part of the data.
reg_mae, reg_r2 = mean_absolute_error(y_test, reg_preds), r2_score(y_test, reg_preds)
reg_rmse = float(np.sqrt(mean_squared_error(y_test, reg_preds)))

reg_eval = pd.DataFrame({
    "MAE": [reg_mae],
    "RMSE": [reg_rmse],
    "R2": [reg_r2],
    "cutoff": [str(cutoff.date())],
})
reg_eval.to_csv(os.path.join(REPORTS_DIR, f"regression_eval_{STAMP}.csv"), index=False)

# Residual plot: actual minus predicted, against the prediction.
residuals = y_test.values - reg_preds
_fig, _ax = plt.subplots(figsize=(6, 4))
_ax.scatter(reg_preds, residuals, alpha=0.3)
_ax.axhline(0, lw=1)
_ax.set_xlabel("Predicted")
_ax.set_ylabel("Residual (Actual - Pred)")
_ax.set_title("Residuals vs Predicted")
_fig.tight_layout()
_fig.savefig(os.path.join(REPORTS_DIR, f"reg_residuals_vs_pred_{STAMP}.png"), dpi=140)
plt.close(_fig)

# Mean absolute error per calendar month of the test rows
# (helps find seasonality problems).
tmp = pd.DataFrame({
    "OrderMonth": df.loc[~train_mask, "OrderMonth"].values,
    "abs_err": np.abs(residuals),
})
err_by_month = tmp.groupby("OrderMonth", as_index=False)["abs_err"].mean()
err_by_month.to_csv(os.path.join(REPORTS_DIR, f"reg_abs_error_by_month_{STAMP}.csv"), index=False)

_fig, _ax = plt.subplots(figsize=(6, 4))
_ax.plot(err_by_month["OrderMonth"], err_by_month["abs_err"], marker="o")
_ax.set_xlabel("OrderMonth")
_ax.set_ylabel("Mean absolute error")
_ax.set_title("Regression error by month")
_fig.tight_layout()
_fig.savefig(os.path.join(REPORTS_DIR, f"reg_error_by_month_{STAMP}.png"), dpi=140)
plt.close(_fig)

# ---------- 6) Evaluate classification in depth ----------
# Baseline at saved model's default threshold (0.5 unless otherwise inside model)
cls_proba = best_cls.predict_proba(X_test)[:, 1] if hasattr(best_cls, "predict_proba") else None
cls_pred = best_cls.predict(X_test)

# Build labels using Day 4 rule: top-20% high-sellers on TRAIN.
# The same train-derived cut-off labels the test rows, so no test statistic
# leaks into the label definition.
thr_train = y_train.quantile(0.80)
y_train_cls = (y_train >= thr_train).astype(int)
y_test_cls = (y_test >= thr_train).astype(int)

acc0 = accuracy_score(y_test_cls, cls_pred)
f10 = f1_score(y_test_cls, cls_pred)
auc0 = roc_auc_score(y_test_cls, cls_proba) if cls_proba is not None else float("nan")

# Threshold tuning (maximize F1 on test set).
# NOTE: the threshold is chosen on the very data it is scored on, so the
# tuned F1 is an optimistic estimate.
best_f1, best_thr = -1, 0.5
if cls_proba is not None:
    prec, rec, ths = precision_recall_curve(y_test_cls, cls_proba)
    # prec[i] / rec[i] describe the predictions `proba >= ths[i]`. prec and rec
    # carry one extra trailing point (precision=1, recall=0) with no threshold,
    # so that point is excluded and the remaining indices line up with `ths`.
    f1s = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-9)
    if f1s.size:
        best_idx = int(np.nanargmax(f1s))
        best_f1 = float(f1s[best_idx])
        best_thr = float(ths[best_idx])

    # apply tuned threshold
    cls_pred_tuned = (cls_proba >= best_thr).astype(int)
    acc1 = accuracy_score(y_test_cls, cls_pred_tuned)
    f11 = f1_score(y_test_cls, cls_pred_tuned)
    auc1 = auc0  # same probs, AUC does not depend on the threshold

    # Save PR and ROC curves
    fpr, tpr, _ = roc_curve(y_test_cls, cls_proba)
    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], ls="--")  # chance diagonal
    plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC")
    plt.tight_layout()
    plt.savefig(os.path.join(REPORTS_DIR, f"cls_roc_{STAMP}.png"), dpi=140)
    plt.close()

    plt.figure(figsize=(5, 5))
    plt.plot(rec, prec)
    plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("PR Curve")
    plt.tight_layout()
    plt.savefig(os.path.join(REPORTS_DIR, f"cls_pr_{STAMP}.png"), dpi=140)
    plt.close()

# Confusion matrices (default and tuned)
def save_conf_mat(y_true, y_pred, fname):
    """Render a 2x2 Low/High confusion matrix and save it under REPORTS_DIR.

    Args:
        y_true: true binary labels (0 = Low, 1 = High).
        y_pred: predicted binary labels, same length as ``y_true``.
        fname: file name of the PNG, relative to REPORTS_DIR.
    """
    # Pin the label order: without it a split that contains a single class
    # yields a 1x1 matrix that no longer matches the fixed Low/High ticks.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(cm, cmap="Blues")
    # Rows are actual classes (y axis), columns predicted classes (x axis).
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center")
    ax.set_xticks([0, 1]); ax.set_yticks([0, 1])
    ax.set_xticklabels(["Low", "High"]); ax.set_yticklabels(["Low", "High"])
    ax.set_xlabel("Predicted"); ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    fig.savefig(os.path.join(REPORTS_DIR, fname), dpi=140)
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

# Confusion matrix at the model's own decision rule, plus one at the tuned
# threshold when probabilities are available.
_has_tuned = cls_proba is not None

save_conf_mat(y_test_cls, cls_pred, f"cls_confusion_default_{STAMP}.png")
if _has_tuned:
    _tuned_labels = (cls_proba >= best_thr).astype(int)
    save_conf_mat(y_test_cls, _tuned_labels, f"cls_confusion_tuned_{STAMP}.png")

# Metrics table: one row per decision rule.
rows = [dict(mode="default", ACC=acc0, F1=f10, AUC=auc0, threshold=0.5)]
if _has_tuned:
    rows.append(dict(mode="tuned", ACC=acc1, F1=f11, AUC=auc1, threshold=best_thr))

cls_eval = pd.DataFrame(rows)
cls_eval.to_csv(os.path.join(REPORTS_DIR, f"classification_eval_{STAMP}.csv"), index=False)

# ---------- 7) Explainability ----------
def get_feature_names_from_preprocessor(prep: ColumnTransformer):
    """Return the output feature names of a fitted ColumnTransformer.

    Uses ``prep.get_feature_names_out()`` when it works. If that call raises,
    the names are rebuilt from ``prep.transformers_`` as
    ``"<transformer name>__<feature>"``.

    Args:
        prep: fitted ColumnTransformer (``transformers_`` must exist for the
            fallback path).

    Returns:
        Array of feature names; the fallback path returns a numpy object array.
    """
    # Works on sklearn >=1.0
    try:
        return prep.get_feature_names_out()
    except Exception:
        names = []
        for name, trans, cols in prep.transformers_:
            # Any dropped column group contributes no output features, not
            # only the dropped remainder.
            if isinstance(trans, str) and trans == "drop":
                continue
            # A single column may be given as a bare string; wrap it so it is
            # not iterated character by character below.
            if isinstance(cols, str):
                cols = [cols]
            sub = cols
            if hasattr(trans, "get_feature_names_out"):
                try:
                    sub = trans.get_feature_names_out(cols)
                except Exception:
                    sub = cols
            # prefix with transformer name to keep uniqueness
            names.extend(f"{name}__{c}" for c in sub)
        return np.array(names, dtype=object)

# 7a) ElasticNet coefficients
# Both saved pipelines are expected to expose "prep" and "model" steps.
reg_prep = best_reg.named_steps["prep"]
reg_model = best_reg.named_steps["model"]

feat_names = get_feature_names_from_preprocessor(reg_prep)
coefs = getattr(reg_model, "coef_", None)
reg_coef_df = None
# Only export when the model has coefficients and they line up with the names.
if coefs is not None and len(np.atleast_1d(coefs)) == len(feat_names):
    reg_coef_df = pd.DataFrame({"feature": feat_names, "coef": np.atleast_1d(coefs)})
    # Largest absolute effect first.
    reg_coef_df = reg_coef_df.sort_values("coef", key=np.abs, ascending=False)
    reg_coef_df.to_csv(os.path.join(REPORTS_DIR, f"reg_elasticnet_coefficients_{STAMP}.csv"), index=False)

# 7b) RF classifier importances (aggregated back to original fields)
cls_prep = best_cls.named_steps["prep"]
cls_model = best_cls.named_steps["model"]
cls_feat_names = get_feature_names_from_preprocessor(cls_prep)
importances = getattr(cls_model, "feature_importances_", None)

# Known source columns, longest first so a longer name wins over one that is
# merely its prefix.
_known_source_cols = sorted(
    set(categorical_features) | set(numeric_features), key=len, reverse=True
)


def base_col(s):
    """Map a transformed feature name back to its source column.

    A feature name typically looks like "cat__Region_East" or
    "num__OrderMonth". Only the leading "<transformer>__" prefix is removed,
    then the remainder is matched against the known source columns. Matching
    first keeps columns whose own name contains an underscore intact. Unknown
    names fall back to the text before the first underscore.
    """
    text = str(s)
    _, sep, rest = text.partition("__")
    if not sep:
        rest = text
    for col in _known_source_cols:
        if rest == col or rest.startswith(col + "_"):
            return col
    return rest.split("_")[0]


agg_imp = None
if importances is not None and len(importances) == len(cls_feat_names):
    imp_df = pd.DataFrame({"feature": cls_feat_names, "importance": importances})
    # collapse one-hot groups back to their source column
    imp_df["base"] = imp_df["feature"].apply(base_col)
    agg_imp = (
        imp_df.groupby("base", as_index=False)["importance"]
        .sum()
        .sort_values("importance", ascending=False)
    )
    agg_imp.to_csv(os.path.join(REPORTS_DIR, f"classifier_importance_aggregated_{STAMP}.csv"), index=False)

    # quick bar plot of top-10 (reversed so the largest bar is drawn on top)
    top = agg_imp.head(10)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.barh(top["base"][::-1], top["importance"][::-1])
    ax.set_title("RF Classifier: Top feature groups")
    fig.tight_layout()
    fig.savefig(os.path.join(REPORTS_DIR, f"classifier_importance_top10_{STAMP}.png"), dpi=140)
    plt.close(fig)

# ---------- 8) Comparison tables from previous JSONs ----------
# Helpful when you've saved all model metrics during Day 4.
def _results_to_table(results):
    """Transpose a results mapping into one row per entry, keyed by a "model" column.

    Presumably ``results`` is {model_name: {metric: value}} -- confirm against
    the Day 4 writer.
    """
    frame = pd.DataFrame(results).T
    frame = frame.reset_index()
    return frame.rename(columns={"index": "model"})


# Each table is written only when the JSON carries a non-empty "results" entry.
if "results" in prev_reg_json and prev_reg_json["results"]:
    reg_table = _results_to_table(prev_reg_json["results"])
    reg_table.to_csv(
        os.path.join(REPORTS_DIR, f"regression_models_comparison_{STAMP}.csv"),
        index=False,
    )

if "results" in prev_cls_json and prev_cls_json["results"]:
    cls_table = _results_to_table(prev_cls_json["results"])
    cls_table.to_csv(
        os.path.join(REPORTS_DIR, f"classification_models_comparison_{STAMP}.csv"),
        index=False,
    )

# ---------- 9) Model Card generator ----------
# The tuned-threshold line is only meaningful when probabilities were
# available; otherwise say so instead of repeating the default F1 under a
# "tuned" label.
if cls_proba is not None:
    _tuned_line = f"- Metrics (test, tuned threshold={best_thr:.3f}): F1={f11:.3f}"
else:
    _tuned_line = "- Metrics (test, tuned threshold): not computed (classifier exposes no predict_proba)"

MODEL_CARD = f"""# Model Card — Retail Sales (Day 5)
Generated: {STAMP}

## Overview
- **Project:** Cloud-Based Retail Sales Analytics & Forecasting
- **Data source:** Kaggle Sales Forecasting (train CSV), ~9.8k rows landed
- **Split policy:** time-aware split by OrderDate (80% train, 20% test)
- **Target(s):**
  - Regression: Sales (float)
  - Classification: High-seller (>= 80th percentile of train Sales)

## Best Models (Day 4 -> Day 5 verified)
### Regression
- Algorithm: {type(best_reg.named_steps['model']).__name__}
- Metrics (test): MAE={reg_mae:.2f}, RMSE={reg_rmse:.2f}, R2={reg_r2:.3f}
- Notes: baseline performance is limited; stronger features needed (lags, product/region aggregates).

### Classification
- Algorithm: {type(best_cls.named_steps['model']).__name__}
- Metrics (test, default): ACC={acc0:.3f}, F1={f10:.3f}, AUC={auc0:.3f}
{_tuned_line}
- Notes: class imbalance; tuned threshold improves F1. Try more features + class weighting and AUC optimization.

## Features used
- Categorical: Region, OrderMonthName
- Numeric: OrderYear, OrderMonth, OrderQuarter, OrderWeekOfYear, OrderIsWeekendOrder
- Leakage control: no Ship* fields used for training

## Explainability
- ElasticNet coefficients: see reports/reg_elasticnet_coefficients_*.csv
- RF classifier importances (aggregated): see reports/classifier_importance_aggregated_*.csv

## Artifacts
- Models: models/regression_*.pkl, models/classifier_*.pkl
- Metrics: reports/regression_metrics_*.json, reports/classifier_metrics_*.json
- Day 5 eval: reports/regression_eval_*.csv, reports/classification_eval_*.csv
- Plots: reports/reg_residuals_vs_pred_*.png, reports/cls_roc_*.png, reports/cls_pr_*.png

## Intended use
- Educational baseline for retail sales analytics and forecasting in a cloud-like stack (ADF + SQL + Python).

## Limitations
- Current features are simple date-based signals; no product/customer history, no external seasonality.
- Dataset variant lacks Quantity/Discount/Profit.

## Next steps
- Add lags and rolling aggregates per product/region.
- Try XGBoost/LightGBM and hyperparameter tuning.
- Register in Azure ML and deploy a small REST endpoint.
"""
# The card contains non-ASCII characters (the em dash in the title), so the
# encoding is fixed to UTF-8 instead of depending on the platform default.
with open(os.path.join(DOCS_DIR, "MODEL_CARD.md"), "w", encoding="utf-8") as f:
    f.write(MODEL_CARD)

print("\nSaved:")
print(" -", os.path.join(REPORTS_DIR, f"regression_eval_{STAMP}.csv"))
print(" -", os.path.join(REPORTS_DIR, f"classification_eval_{STAMP}.csv"))
print(" -", os.path.join(DOCS_DIR, "MODEL_CARD.md"))


No engine found, will use local fallback if needed. Err: ModuleNotFoundError("No module named 'db_connect'")


FileNotFoundError: No SQL and no _inputs/processed_sales.parquet or _inputs/fact_sales.csv