In [5]:
"""
Supervised Learning — Hotel (classification) + US Accidents (regression)

References (used in comments citations):
[R1] Mitchell, T. M., "Machine Learning", McGraw-Hill, 1997. (DT/kNN/ANN overview)
[R2] SL Report spec v6-1 (metrics/plots/splits/compute logs; deliverables)
[R3] scikit-learn docs (APIs; implementation reference)
[R4] Quinlan, "Induction of Decision Trees", Machine Learning, 1986. (DT)
[R5] Cover & Hart, "Nearest Neighbor Pattern Classification", IEEE TIT, 1967. (kNN)
[R6] Cortes & Vapnik, "Support-Vector Networks", Machine Learning, 1995. (SVM)
[R7] Rumelhart et al., "Learning representations by back-propagating errors", Nature, 1986. (NN)
"""

# ===============================
# CHUNK 0 — imports, constants
# ===============================

import os, json, time, platform, warnings
from typing import Dict, Tuple, List

import numpy as np
import pandas as pd
import polars as pl  # fast CSV & memory-friendly transforms
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, KFold, learning_curve
)
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_curve, auc,
    precision_recall_curve, confusion_matrix, classification_report,
    mean_absolute_error, mean_squared_error, r2_score, log_loss
)
from sklearn.metrics import median_absolute_error as medae

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.exceptions import ConvergenceWarning


# psutil is optional: when it cannot be imported the name is bound to None and
# the profiler / hardware helpers below skip their memory and CPU readings.
try:
    import psutil  # [R2] runtime/RAM logging
except Exception:
    psutil = None

# Silence benign warnings; keep real errors visible
# (FutureWarning everywhere; ConvergenceWarning only from sklearn.neural_network).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning, module=r"sklearn\.neural_network")

# Output roots (created on import/run if they do not exist yet)
OUT_HOTEL = "outputs/hotel_cls"   # Hotel = classification (is_canceled) [R2]
OUT_ACC   = "outputs/acc_reg"     # Accidents = regression (duration_minutes) [R2]
os.makedirs(OUT_HOTEL, exist_ok=True)
os.makedirs(OUT_ACC, exist_ok=True)

# Single seed shared by splits, CV shufflers and estimators in this file.
RANDOM_STATE = 42  # [R2] reproducibility

# ========== YOUR FILE PATHS ==========
# Relative paths: resolved against the current working directory.
HOTEL_CSV = "hotel_bookings.csv"
ACC_CSV   = "US_Accidents_March23.csv"
# =====================================


# ================================================
# CHUNK 1 — tiny profiler + hardware descriptor
# ================================================

def hw_info():
    """Describe the host machine for the runtime logs.

    Returns:
        dict with ``platform`` and ``python`` always present. When psutil is
        available, ``cpu_count`` (logical cores) and ``ram_gb`` (total RAM in
        units of 1e9 bytes, rounded to 2 decimals) are added.
    """
    details = {
        "platform": platform.platform(),
        "python": platform.python_version(),
    }
    if not psutil:
        return details
    details["cpu_count"] = psutil.cpu_count(logical=True)
    total_bytes = psutil.virtual_memory().total
    details["ram_gb"] = round(total_bytes / 1e9, 2)
    return details

class Profiler:
    """Context manager logging wall-clock time and peak RSS of a code section. [R2]

    RSS is sampled on entry, on exit and on every ``tick()`` call, so the peak
    is the largest *sampled* value. After the ``with`` block ``seconds_fit``
    holds the elapsed seconds and ``peak_gb`` the peak RSS in units of 1e9
    bytes (3 decimals), or ``None`` if no non-zero sample was recorded
    (e.g. psutil is unavailable).
    """

    def __init__(self, tag="run"):
        self.tag = tag
        self.t0 = None
        self.max_rss = 0

    def __enter__(self):
        self.t0 = time.perf_counter()
        self._sample()
        print(f"[Profiler] START: {self.tag}")
        return self

    def _sample(self):
        """Raise the running RSS maximum to the current process RSS if larger."""
        if not psutil:
            return
        current = psutil.Process().memory_info().rss
        if current > self.max_rss:
            self.max_rss = current

    def tick(self):
        """Take an extra RSS sample from inside the profiled section."""
        self._sample()

    def __exit__(self, *exc):
        # Returns None, so exceptions raised inside the block still propagate.
        self.seconds_fit = time.perf_counter() - self.t0
        self._sample()
        if self.max_rss:
            self.peak_gb = round(self.max_rss / 1e9, 3)
        else:
            self.peak_gb = None
        print(f"[Profiler] END: {self.tag} | sec={self.seconds_fit:.2f} | peakGB={self.peak_gb}")

def save_json_safe(path, obj):
    """Write ``obj`` to ``path`` as indented JSON, converting numpy values first.

    numpy integers, floats and booleans become native Python scalars and
    arrays become lists. The conversion recurses through dicts, lists and
    tuples, so nested payloads (e.g. a metrics dict holding a hardware
    sub-dict) serialize too. Dict keys are passed through unchanged.

    Args:
        path: Destination file path; overwritten if it exists.
        obj: JSON-like object, possibly containing numpy scalars/arrays.

    Raises:
        TypeError: if ``obj`` still contains a value ``json`` cannot encode.
        OSError: if the file cannot be opened for writing.
    """
    def cast(o):
        if isinstance(o, dict):
            return {k: cast(v) for k, v in o.items()}
        if isinstance(o, (list, tuple)):
            return [cast(v) for v in o]
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        return o

    # Context manager guarantees the handle is flushed and closed, even when
    # json.dump raises part-way through.
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(cast(obj), fh, indent=2)

def save_profile(path, **kwargs):
    """Persist a runtime record: every keyword argument becomes a JSON field at ``path``."""
    record = kwargs
    save_json_safe(path, record)


# ========================================
# CHUNK 2 — plotting helpers [R2]
# ========================================

def plot_confusion(cm: np.ndarray, labels: List[str], title: str, outpath: str):
    """Render a confusion matrix as an annotated heat map and save it.

    Args:
        cm: 2-D matrix of counts, indexed ``[true, predicted]``.
        labels: Tick labels, used on both axes.
        title: Figure title.
        outpath: Image path passed to ``plt.savefig``.
    """
    plt.figure()
    plt.imshow(cm, interpolation="nearest", aspect="auto")
    plt.title(title)

    positions = np.arange(len(labels))
    plt.xticks(positions, labels, rotation=45)
    plt.yticks(positions, labels)

    # Cells above half the maximum get white text so the count stays readable.
    cutoff = cm.max() / 2 if cm.max() > 0 else 0.5
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_color = "white" if cm[row, col] > cutoff else "black"
            plt.text(col, row, f"{int(cm[row, col])}", ha="center", color=text_color)

    plt.ylabel("True")
    plt.xlabel("Predicted")
    plt.tight_layout()
    plt.savefig(outpath, bbox_inches="tight")
    plt.close()

def plot_line(x, y, title, xlabel, ylabel, outpath):
    """Plot one ``y``-vs-``x`` curve (legend entry = ``title``) and save it to ``outpath``."""
    plt.figure()
    plt.plot(x, y, label=title)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.tight_layout()
    plt.savefig(outpath, bbox_inches="tight")
    plt.close()

def plot_learning_curve_single(
    estimator, X, y, title, outpath, cv,
    train_sizes=np.linspace(0.2, 1.0, 5),
    scoring=None,
    n_jobs=-1,
    random_state=RANDOM_STATE
):
    """Plot mean train/validation score against training-set size [R1][R2].

    Learning curves diagnose bias/variance. Scores come from sklearn's
    ``learning_curve`` (shuffled, seeded with ``random_state``) and are
    averaged over the CV folds before plotting.

    Args:
        estimator: Unfitted estimator or pipeline.
        X, y: Training features and target.
        title: Figure title (also echoed in the progress message).
        outpath: Image path for the saved figure.
        cv: Cross-validation splitter passed to ``learning_curve``.
        train_sizes: Training-set fractions/sizes to evaluate.
        scoring: sklearn scoring spec; ``None`` uses the estimator default.
        n_jobs: Parallel jobs for ``learning_curve``.
        random_state: Seed for the shuffling inside ``learning_curve``.
    """
    print(f"[Plot] Learning curve: {title}")
    curve_kwargs = dict(
        train_sizes=train_sizes,
        cv=cv,
        shuffle=True,
        random_state=random_state,
        scoring=scoring,
        n_jobs=n_jobs,
        return_times=False,
    )
    n_examples, train_scores, valid_scores = learning_curve(estimator, X, y, **curve_kwargs)

    # One row per training size, one column per fold -> average across folds.
    mean_train = train_scores.mean(axis=1)
    mean_valid = valid_scores.mean(axis=1)

    plt.figure()
    plt.plot(n_examples, mean_train, marker="o", label="Training")
    plt.plot(n_examples, mean_valid, marker="s", label="Validation")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.savefig(outpath, bbox_inches="tight")
    plt.close()

def plot_parity(y_true, y_pred, title, outpath):
    """Regression parity plot: scatter of ŷ against y with the y = ŷ diagonal [R2]."""
    print(f"[Plot] Parity: {title}")
    plt.figure()
    plt.scatter(y_true, y_pred, s=6, alpha=0.35)

    # The diagonal spans the joint range of true and predicted values.
    lower = float(min(np.min(y_true), np.min(y_pred)))
    upper = float(max(np.max(y_true), np.max(y_pred)))
    plt.plot([lower, upper], [lower, upper], linestyle="--")

    plt.title(title)
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.tight_layout()
    plt.savefig(outpath, bbox_inches="tight")
    plt.close()

def plot_residuals(y_true, y_pred, title, outpath):
    """Scatter residuals (ŷ − y) against ŷ to expose bias/heteroscedasticity [R2]."""
    print(f"[Plot] Residuals: {title}")
    residuals = y_pred - y_true
    plt.figure()
    plt.scatter(y_pred, residuals, s=6, alpha=0.35)
    plt.axhline(0.0, linestyle="--")  # zero-error reference line
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Residual (ŷ − y)")
    plt.tight_layout()
    plt.savefig(outpath, bbox_inches="tight")
    plt.close()

def plot_reliability(y_true, y_proba, title, outpath, n_bins=10):
    """Reliability (calibration) curve for a binary classifier [R2].

    Predicted probabilities are grouped into ``n_bins`` uniform-width bins;
    the observed positive fraction is plotted against the mean prediction of
    each bin, next to the perfect-calibration diagonal.
    """
    print(f"[Plot] Reliability: {title}")
    from sklearn.calibration import calibration_curve

    observed, predicted = calibration_curve(
        y_true, y_proba, n_bins=n_bins, strategy="uniform"
    )
    plt.figure()
    plt.plot(predicted, observed, marker="o", label="Reliability")
    plt.plot([0, 1], [0, 1], linestyle="--", label="Perfect")
    plt.title(title)
    plt.xlabel("Mean predicted probability")
    plt.ylabel("Fraction positives")
    plt.legend()
    plt.tight_layout()
    plt.savefig(outpath, bbox_inches="tight")
    plt.close()


# =========================================
# CHUNK 3 — metrics & helpers [R2]
# =========================================

def summarize_classification(y_true, y_proba, y_pred):
    """Compute binary-classification metrics plus the ROC and PR curve points.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Scores/probabilities for the positive class (curve metrics).
        y_pred: Hard label predictions (accuracy, precision, recall, F1).

    Returns:
        ``(metrics, (fpr, tpr), (recall, precision))`` where ``metrics`` maps
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc`` and
        ``pr_auc`` to floats.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_proba)

    metrics = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "roc_auc": float(auc(fpr, tpr)),
        "pr_auc": float(auc(pr_recall, pr_precision)),
    }
    return metrics, (fpr, tpr), (pr_recall, pr_precision)

def summarize_regression(y_true, y_pred):
    """Return MAE, MedAE, RMSE and R2 for a set of regression predictions as floats."""
    mae = mean_absolute_error(y_true, y_pred)
    median_ae = medae(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {"MAE": float(mae), "MedAE": float(median_ae), "RMSE": float(rmse), "R2": float(r2)}

def prevalence_baseline(y_true):
    """Return the positive-class prevalence, used as the PR-AUC baseline [R2].

    Computed as the mean of ``y_true``; for 0/1 labels that is the fraction of
    positives.
    """
    positive_rate = np.mean(y_true)
    return float(positive_rate)

def f1_optimal_threshold(y_true_val, y_proba_val):
    """Pick the probability threshold that maximizes F1 on validation data [R2].

    Scans 19 evenly spaced thresholds from 0.05 to 0.95; a row is predicted
    positive when its probability is >= the threshold. On ties the lowest
    threshold wins. Document the rule in the report.

    Returns:
        ``(best_threshold, best_f1)`` as floats.
    """
    from sklearn.metrics import f1_score

    best_thr, best_f1 = None, None
    for candidate in np.linspace(0.05, 0.95, 19):
        predicted = (y_proba_val >= candidate).astype(int)
        score = f1_score(y_true_val, predicted)
        # Strict ">" keeps the first (lowest) threshold among equal scores.
        if best_f1 is None or score > best_f1:
            best_thr, best_f1 = candidate, score
    return float(best_thr), float(best_f1)

# ==== Extra helpers: permutation importances + NN budget audit ====

def permutation_importance_topk(pipeline, X, y, scoring, k=10, n_repeats=10, random_state=RANDOM_STATE):
    """Return the ``k`` raw input columns with the largest mean permutation importance.

    Columns of ``X`` are permuted before the pipeline's own preprocessing, so
    importances refer to the original features, not the encoded ones.

    Args:
        pipeline: Fitted estimator/pipeline that accepts ``X`` directly.
        X: DataFrame of raw features (its ``columns`` name the features).
        y: Target aligned with ``X``.
        scoring: sklearn scoring spec used to measure the score drop.
        k: Number of rows to return.
        n_repeats: Permutations per feature.
        random_state: Seed for the permutations.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted descending.
    """
    from sklearn.inspection import permutation_importance

    result = permutation_importance(
        pipeline, X, y, scoring=scoring, n_repeats=n_repeats, random_state=random_state
    )
    table = pd.DataFrame({"feature": X.columns, "importance": result.importances_mean})
    ranked = table.sort_values("importance", ascending=False)
    return ranked.head(k)

def _estimate_mlp_params(n_in, hidden, n_out):
    """Count trainable parameters of a fully connected network.

    Each pair of consecutive layers contributes ``fan_in * fan_out`` weights
    plus ``fan_out`` biases.

    Args:
        n_in: Input width.
        hidden: Iterable of hidden-layer widths (may be empty).
        n_out: Output width.
    """
    layer_sizes = [n_in, *hidden, n_out]
    total = sum(
        fan_in * fan_out + fan_out
        for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:])
    )
    return int(total)

def _estimate_input_dim(preproc, X_sample):
    """Infer the transformed feature width by fitting on the first 64 rows.

    Note that ``preproc`` itself is fitted (on that small slice) as a side
    effect; pass a clone if the caller's instance must stay untouched.
    """
    probe = X_sample.head(64)
    transformed = preproc.fit_transform(probe)
    return transformed.shape[1]

def finite_clip(X, clip=1_000.0):
    """Return ``X`` as a float32 array with no NaN/inf and values in ``[-clip, clip]``.

    NaN becomes 0, +inf becomes ``clip`` and -inf becomes ``-clip``; remaining
    outliers are clipped to the same bounds before scaling.
    """
    lower, upper = -clip, clip
    values = np.nan_to_num(
        np.asarray(X, dtype=np.float32, order="C"),
        nan=0.0,
        posinf=upper,
        neginf=lower,
    )
    return np.clip(values, lower, upper)

def to_dense32(X):
    """Return ``X`` as a dense, C-ordered float32 ndarray.

    ColumnTransformer may return a sparse matrix while the MLP needs dense
    input, so anything exposing ``toarray`` is densified first.
    """
    dense = X.toarray() if hasattr(X, "toarray") else X
    return np.asarray(dense, dtype=np.float32, order="C")


# ==========================================================
# CHUNK 4 — encoders & preprocessor (float32, high-card) [R2]
# ==========================================================

def make_ohe():
    """Build a sparse-output OneHotEncoder that ignores unknown categories.

    Compatibility shim: tries the ``sparse_output`` keyword first and falls
    back to ``sparse`` when the constructor rejects it with ``TypeError``.
    """
    common = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=True, **common)
    except TypeError:
        encoder = OneHotEncoder(sparse=True, **common)
    return encoder

def detect_high_cardinality(df, cols, min_unique=100, min_ratio=0.05):
    """Split ``cols`` into high- and low-cardinality lists.

    A column is high-cardinality when its number of distinct non-null values
    is at least ``min_unique`` OR at least ``min_ratio`` of the row count.

    Returns:
        ``(hi, lo)`` lists, each preserving the order of ``cols``.
    """
    n_rows = max(len(df), 1)  # guard the ratio against an empty frame
    hi, lo = [], []
    for name in cols:
        n_unique = df[name].nunique(dropna=True)
        is_high = (n_unique >= min_unique) or (n_unique / n_rows >= min_ratio)
        bucket = hi if is_high else lo
        bucket.append(name)
    return hi, lo

def to_float32(X):
    """Cast ``X`` to float32, keeping its container type when it has ``astype``.

    Arrays, DataFrames and sparse matrices are cast through their own
    ``astype``; anything else (or a failed ``astype``) falls back to
    ``np.asarray(X, dtype=np.float32)``.

    Only conversion errors trigger the fallback. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate instead of being swallowed by a bare ``except``.
    """
    try:
        return X.astype(np.float32)
    except (AttributeError, TypeError, ValueError):
        return np.asarray(X, dtype=np.float32)

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """
    Replace each categorical value by its relative frequency seen during ``fit``.

    Leak-safe when used inside a CV/Pipeline, since frequencies come only from
    the data passed to ``fit``. Columns are tracked by position, so both
    DataFrames and ndarray slices work (ColumnTransformer passes only the
    selected columns). Values with fewer than ``min_samples`` occurrences, and
    values unseen at fit time, are encoded as 0.0.

    ``cols`` is stored verbatim for clone-safety; ``fit``/``transform`` do not
    read it.
    """

    def __init__(self, cols=None, min_samples=5):
        self.cols = cols
        self.min_samples = min_samples
        self._maps_by_pos = None
        self._n_features_in_ = None

    @staticmethod
    def _as_frame(X):
        """Return ``X`` unchanged if it is a DataFrame, else wrap it in one."""
        if isinstance(X, pd.DataFrame):
            return X
        return pd.DataFrame(X)

    def fit(self, X, y=None):
        """Learn one value -> frequency table per column position."""
        frame = self._as_frame(X)
        self._n_features_in_ = frame.shape[1]
        self._maps_by_pos = tables = {}
        prune_rare = (self.min_samples is not None) and (self.min_samples > 1)
        for pos in range(self._n_features_in_):
            counts = frame.iloc[:, pos].value_counts(dropna=True)
            if prune_rare:
                counts = counts[counts >= self.min_samples]
            kept = counts.sum()
            # Frequencies are normalised over the retained values only.
            denom = float(kept) if kept > 0 else 1.0
            table = (counts / denom).astype("float32")
            # prior fallback for unseen
            fallback = pd.Series({"__PRIOR__": np.float32(0.0)}, dtype="float32")
            tables[pos] = pd.concat([table, fallback])
        return self

    def transform(self, X):
        """Map every column through its fitted table; returns a float32 ndarray."""
        frame = self._as_frame(X)
        n_cols = frame.shape[1]
        encoded = np.zeros((len(frame), n_cols), dtype="float32")
        for pos in range(n_cols):
            table = self._maps_by_pos.get(pos)
            if table is None:
                # No table for this position: the column stays all zeros.
                continue
            default = table.get("__PRIOR__", np.float32(0.0))
            column = frame.iloc[:, pos].map(table).fillna(default)
            encoded[:, pos] = column.astype("float32").to_numpy(copy=False)
        return encoded

def make_preprocessor(df, target, task="classification"):
    """
    Build the (unfitted) mixed-encoding ColumnTransformer per [R2]:
      - numerics: median impute + finite clip + StandardScaler + float32
      - low-card categoricals: most-frequent impute + one-hot (cast to float32)
      - high-card categoricals: most-frequent impute + frequency encoding

    ``df`` is only inspected for column dtypes and cardinalities; nothing is
    fitted here. ``task`` is accepted but not used by this function.
    """
    features = df.drop(columns=[target])
    numeric_cols = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = features.select_dtypes(
        include=["object", "category", "bool", "boolean", "string"]
    ).columns.tolist()

    high_card, low_card = detect_high_cardinality(
        features, categorical_cols, min_unique=100, min_ratio=0.05
    )

    def mode_imputer():
        # Fresh imputer per branch so the pipelines share no estimator instance.
        return SimpleImputer(strategy="most_frequent", missing_values=np.nan)

    numeric_steps = [
        ("impute", SimpleImputer(strategy="median")),
        ("finite", FunctionTransformer(finite_clip, accept_sparse=True)),
        ("scale", StandardScaler()),  # requirement: StandardScaler for numerics
        ("to32", FunctionTransformer(to_float32, accept_sparse=True)),
    ]
    transformers = [("num", Pipeline(numeric_steps), numeric_cols)]

    if low_card:
        onehot_steps = [
            ("impute", mode_imputer()),
            ("onehot", make_ohe()),
            ("to32", FunctionTransformer(to_float32, accept_sparse=True)),
        ]
        transformers.append(("ohe", Pipeline(onehot_steps), low_card))

    if high_card:
        freq_steps = [
            ("impute", mode_imputer()),
            ("freq", FrequencyEncoder(cols=high_card, min_samples=5)),
            ("to32", FunctionTransformer(to_float32)),
        ]
        transformers.append(("freq_hi", Pipeline(freq_steps), high_card))

    return ColumnTransformer(transformers=transformers, sparse_threshold=0.3)

# ===================================================
# CHUNK 5 — HOTEL load/clean/split + preprocessor
# ===================================================

# Load the hotel CSV with polars, convert to pandas, drop the leakage column,
# split 70/15/15 (stratified on the target) and build the unfitted preprocessor.
print("[Hotel] Loading with Polars…")

# Treat these strings as nulls; avoids parse errors like 'NA' in integer columns
CSV_NULLS = ["NA", "NaN", "NULL", "null", "", "None"]

# Read directly (file is small enough)
# The three guest-count columns are forced to Float64 via schema_overrides.
hotel_df = pl.read_csv(
    HOTEL_CSV,
    infer_schema_length=20000,
    null_values=CSV_NULLS,
    schema_overrides={
        "children": pl.Float64,
        "babies": pl.Float64,
        "adults": pl.Float64,
    },
)
# Safety cast: re-apply a non-strict Float64 cast to whichever of these columns
# is actually present in this version of the dataset.
for col in ("children", "babies", "adults"):
    if col in hotel_df.columns:
        hotel_df = hotel_df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
print(f"[Hotel] Loaded rows={hotel_df.height:,}, cols={hotel_df.width}")

print("[Hotel] Converting to pandas and casting dtypes…")
hotel = hotel_df.to_pandas(use_pyarrow_extension_array=False)
# Best effort: normalise pd.NA to np.nan; a failure here is deliberately ignored.
try:
    hotel = hotel.replace({pd.NA: np.nan})
except Exception:
    pass

# Drop known leakage column if present
if "reservation_status" in hotel.columns:
    hotel = hotel.drop(columns=["reservation_status"])

# Cast boolean-like strings to category (cleaner OHE)
# A column qualifies when all of its non-null values are in the literal set below.
for c in hotel.columns:
    if str(hotel[c].dtype) in ("string[pyarrow]", "object"):
        vals = set(pd.Series(hotel[c]).dropna().unique())
        if vals.issubset({"True","False","Yes","No","TRUE","FALSE"}):
            hotel[c] = hotel[c].astype("category")

assert "is_canceled" in hotel.columns, "Expected 'is_canceled' in hotel dataset."
y_h = hotel["is_canceled"].astype("int32")
X_h = hotel.drop(columns=["is_canceled"])

# Two-stage split: 30% held out first, then halved into validation and test.
print("[Hotel] Stratified splits (70/15/15)…")
Xh_tr, Xh_tmp, yh_tr, yh_tmp = train_test_split(
    X_h, y_h, test_size=0.30, stratify=y_h, random_state=RANDOM_STATE
)
Xh_va, Xh_te, yh_va, yh_te = train_test_split(
    Xh_tmp, yh_tmp, test_size=0.50, stratify=yh_tmp, random_state=RANDOM_STATE
)

# The preprocessor is built (not fitted) from the full frame: make_preprocessor
# reads only dtypes and per-column unique counts to choose the encoders.
# NOTE(review): those unique counts include validation/test rows; passing the
# training split instead would keep the encoder choice train-only — confirm intent.
pre_hotel = make_preprocessor(
    pd.concat([X_h, y_h.rename("is_canceled")], axis=1),
    target="is_canceled",
    task="classification"
)

# Row counts per split, saved for the report. raw_rows and cleaned_rows are both
# taken after the column drop above, so they are equal here.
hotel_counts = dict(
    tag="Hotel",
    raw_rows=int(len(hotel)), cleaned_rows=int(len(X_h)),
    train_rows=int(len(Xh_tr)), val_rows=int(len(Xh_va)), test_rows=int(len(Xh_te))
)
save_json_safe(os.path.join(OUT_HOTEL, "counts.json"), hotel_counts)
print(f"[Hotel] Counts: {hotel_counts}")


# =================================================
# CHUNK 6 — HOTEL: Decision Tree (DT) [R1][R4][R2]
# =================================================

# Shared 3-fold stratified splitter, reused by every hotel learning curve and
# model-complexity sweep below.
cv_h = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

print("[Hotel/DT] Fitting DecisionTreeClassifier…")
# A clone of the preprocessor goes into the pipeline, so pre_hotel stays unfitted.
dt_h = Pipeline([("pre", clone(pre_hotel)),
                 ("clf", DecisionTreeClassifier(
                     criterion="gini",
                     max_depth=14,              # guardrails {6,10,14,18} [R2]
                     min_samples_leaf=50,       # {50,100,200}
                     min_samples_split=200,     # {100,200,400}
                     max_features="sqrt",       # {"sqrt","log2",0.5}
                     random_state=RANDOM_STATE
                 ))])

# Only the fit is timed/profiled here; prediction is timed separately below.
with Profiler("Hotel_DT_fit") as prof:
    dt_h.fit(Xh_tr, yh_tr)

print("[Hotel/DT] Predicting & scoring…")
t0 = time.perf_counter()
yh_prob_dt = dt_h.predict_proba(Xh_te)[:,1]
yh_hat_dt  = dt_h.predict(Xh_te)
pred_s = time.perf_counter()-t0

# Test-set metrics and diagnostic plots at the default 0.5 decision rule.
m_dt, (fpr_dt, tpr_dt), (rec_dt, prec_dt) = summarize_classification(yh_te, yh_prob_dt, yh_hat_dt)
pd.DataFrame([m_dt]).to_csv(os.path.join(OUT_HOTEL, "dt_metrics.csv"), index=False)
plot_confusion(confusion_matrix(yh_te, yh_hat_dt), ["NotCanceled","Canceled"],
               "Hotel: DT Confusion @0.5", os.path.join(OUT_HOTEL, "dt_confusion.png"))
plot_line(fpr_dt, tpr_dt, "Hotel: DT ROC", "FPR", "TPR", os.path.join(OUT_HOTEL, "dt_roc.png"))
plot_line(rec_dt, prec_dt, "Hotel: DT PR", "Recall", "Precision", os.path.join(OUT_HOTEL, "dt_pr.png"))
# The learning curve is computed on the training split only (CV inside it).
plot_learning_curve_single(dt_h, Xh_tr, yh_tr, "Hotel: DT Learning Curve",
                           os.path.join(OUT_HOTEL, "dt_learning_curve.png"), cv=cv_h)
plot_reliability(yh_te, yh_prob_dt, "Hotel: DT Reliability", os.path.join(OUT_HOTEL, "dt_reliability.png"))

# Threshold via validation F1
# The threshold is chosen on the validation split, then applied to test probabilities.
print("[Hotel/DT] Selecting threshold by val F1…")
yh_prob_val = dt_h.predict_proba(Xh_va)[:,1]
thr_dt, bestf1_dt = f1_optimal_threshold(yh_va, yh_prob_val)
save_json_safe(os.path.join(OUT_HOTEL, "dt_val_threshold.json"),
               {"best_val_threshold": thr_dt, "val_f1_at_thr": bestf1_dt})
yh_hat_thr = (yh_prob_dt >= thr_dt).astype(int)
cm_thr = confusion_matrix(yh_te, yh_hat_thr)
plot_confusion(cm_thr, ["NotCanceled","Canceled"], "Hotel: DT Confusion @F1-threshold",
               os.path.join(OUT_HOTEL, "dt_confusion_f1thr.png"))

# Realized tree stats [R2]
# Read from the fitted tree, i.e. what was actually grown under the limits above.
tree = dt_h.named_steps["clf"]
realized = {
    "depth": int(tree.get_depth()),
    "leaves": int(tree.get_n_leaves()),
    "nodes": int(tree.tree_.node_count),
}
save_json_safe(os.path.join(OUT_HOTEL, "dt_realized_stats.json"), realized)

# PR-AUC prevalence baseline
# Computed on the test labels.
save_json_safe(os.path.join(OUT_HOTEL, "baseline.json"),
               {"prevalence_baseline_pr_auc": prevalence_baseline(yh_te)})

# Full classification report (threshold 0.5)
save_json_safe(os.path.join(OUT_HOTEL, "dt_classification_report.json"),
               classification_report(yh_te, yh_hat_dt, output_dict=True, zero_division=0))

# Runtime record (fit + predict + hardware)
save_profile(os.path.join(OUT_HOTEL, "dt_profile.json"),
             tag="Hotel_DT_fit+predict", seconds_fit=prof.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof.peak_gb, hardware=hw_info())

# Model-Complexity curve (vary max_depth) [R2]
print("[Hotel/DT] Model complexity sweep (max_depth)…")
def model_complexity_curve(pipe_maker, param_name, param_grid, X, y, cv, scorer, out_csv, title, out_png):
    """Sweep one hyper-parameter and record/plot mean CV train vs validation score.

    Args:
        pipe_maker: Callable ``value -> unfitted estimator/pipeline``.
        param_name: Label for the x-axis.
        param_grid: Iterable of parameter values to try.
        X, y: pandas objects (indexed positionally with ``.iloc``).
        cv: Splitter exposing ``split(X, y)``.
        scorer: Callable ``(y_true, y_pred) -> float``.
        out_csv: Path of the CSV with columns ``param``, ``val_score``, ``tr_score``.
        title: Figure title.
        out_png: Path of the saved figure (its y-axis label is fixed to "Accuracy").
    """
    records = []
    for value in param_grid:
        model = pipe_maker(value)
        fold_val, fold_tr = [], []
        for tr_idx, va_idx in cv.split(X, y):
            X_fit, y_fit = X.iloc[tr_idx], y.iloc[tr_idx]
            X_hold, y_hold = X.iloc[va_idx], y.iloc[va_idx]
            # The same estimator object is refitted for every fold.
            model.fit(X_fit, y_fit)
            fold_val.append(scorer(y_hold, model.predict(X_hold)))
            fold_tr.append(scorer(y_fit, model.predict(X_fit)))
        records.append({
            "param": value,
            "val_score": float(np.mean(fold_val)),
            "tr_score": float(np.mean(fold_tr)),
        })

    summary = pd.DataFrame(records)
    summary.to_csv(out_csv, index=False)

    # robust plotting: index on x, labels as strings (grid may mix str and float)
    positions = np.arange(len(summary))
    tick_labels = summary["param"].astype(str).tolist()
    plt.figure()
    plt.plot(positions, summary["val_score"], marker="o", label="Validation")
    plt.plot(positions, summary["tr_score"], marker="s", label="Training")
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Accuracy")
    plt.legend()
    plt.xticks(positions, tick_labels, rotation=0)
    plt.tight_layout()
    plt.savefig(out_png, bbox_inches="tight")
    plt.close()

def dt_maker(max_depth):
    """Build an unfitted hotel pipeline: cloned preprocessor + depth-limited tree.

    Leaf/split minimums match the main DT model; ``max_features`` and
    ``criterion`` are left at the estimator defaults here.
    """
    tree_clf = DecisionTreeClassifier(
        max_depth=int(max_depth),
        min_samples_leaf=50,
        min_samples_split=200,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", clone(pre_hotel)), ("clf", tree_clf)]
    return Pipeline(steps)
# Run the max_depth sweep with 3-fold CV on the training split; writes
# mc_dt.csv and mc_dt.png under OUT_HOTEL.
model_complexity_curve(dt_maker, "max_depth", [6,10,14,18], Xh_tr, yh_tr, cv_h,
                       scorer=accuracy_score,
                       out_csv=os.path.join(OUT_HOTEL,"mc_dt.csv"),
                       title="Hotel: DT Model-Complexity (max_depth)",
                       out_png=os.path.join(OUT_HOTEL,"mc_dt.png"))

# Permutation importances (top-10 by F1)
# Computed for the fitted dt_h pipeline on the test split, over raw input columns.
imp_dt = permutation_importance_topk(dt_h, Xh_te, yh_te, scoring="f1", k=10, n_repeats=10)
imp_dt.to_csv(os.path.join(OUT_HOTEL, "dt_perm_importance_top10.csv"), index=False)

[Hotel] Loading with Polars…
[Hotel] Loaded rows=119,390, cols=32
[Hotel] Converting to pandas and casting dtypes…
[Hotel] Stratified splits (70/15/15)…
[Hotel] Counts: {'tag': 'Hotel', 'raw_rows': 119390, 'cleaned_rows': 119390, 'train_rows': 83573, 'val_rows': 17908, 'test_rows': 17909}
[Hotel/DT] Fitting DecisionTreeClassifier…
[Profiler] START: Hotel_DT_fit
[Profiler] END: Hotel_DT_fit | sec=0.33 | peakGB=2.149
[Hotel/DT] Predicting & scoring…
[Plot] Learning curve: Hotel: DT Learning Curve
[Plot] Reliability: Hotel: DT Reliability
[Hotel/DT] Selecting threshold by val F1…
[Hotel/DT] Model complexity sweep (max_depth)…


In [6]:
# =========================================================
# CHUNK 7 — HOTEL: kNN + Linear/RBF SVM [R5][R6][R2]
# =========================================================

# kNN (exact, brute) + learning curve + MC(k)
# Same evaluation recipe as the DT section: fit on train, score on test at 0.5,
# save plots/metrics, then re-threshold using validation F1.
print("[Hotel/kNN] Fitting exact kNN (brute)…")
knn_h = Pipeline([("pre", clone(pre_hotel)),
                  ("clf", KNeighborsClassifier(n_neighbors=11, algorithm="brute", n_jobs=-1))])

with Profiler("Hotel_kNN_fit") as prof_knn:
    knn_h.fit(Xh_tr, yh_tr)
# Prediction is timed separately from the fit.
t0 = time.perf_counter()
yh_prob_knn = knn_h.predict_proba(Xh_te)[:,1]
yh_hat_knn  = knn_h.predict(Xh_te)
pred_s = time.perf_counter()-t0
m_knn, (fpr_knn, tpr_knn), (rec_knn, prec_knn) = summarize_classification(yh_te, yh_prob_knn, yh_hat_knn)
pd.DataFrame([m_knn]).to_csv(os.path.join(OUT_HOTEL, "knn_metrics.csv"), index=False)
plot_confusion(confusion_matrix(yh_te, yh_hat_knn), ["NotCanceled","Canceled"],
               "Hotel: kNN Confusion @0.5", os.path.join(OUT_HOTEL, "knn_confusion.png"))
plot_line(fpr_knn, tpr_knn, "Hotel: kNN ROC", "FPR", "TPR", os.path.join(OUT_HOTEL, "knn_roc.png"))
plot_line(rec_knn, prec_knn, "Hotel: kNN PR", "Recall", "Precision", os.path.join(OUT_HOTEL, "knn_pr.png"))
plot_learning_curve_single(knn_h, Xh_tr, yh_tr, "Hotel: kNN Learning Curve",
                           os.path.join(OUT_HOTEL, "knn_learning_curve.png"), cv=cv_h)
plot_reliability(yh_te, yh_prob_knn, "Hotel: kNN Reliability", os.path.join(OUT_HOTEL, "knn_reliability.png"))
save_profile(os.path.join(OUT_HOTEL, "knn_profile.json"),
             tag="Hotel_kNN_fit+predict", seconds_fit=prof_knn.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_knn.peak_gb, hardware=hw_info())

# F1-thresholding for kNN
# Threshold picked on validation probabilities, applied to the test probabilities.
yh_prob_knn_val = knn_h.predict_proba(Xh_va)[:, 1]
thr_knn, bestf1_knn = f1_optimal_threshold(yh_va, yh_prob_knn_val)
save_json_safe(os.path.join(OUT_HOTEL, "knn_val_threshold.json"),
               {"best_val_threshold": thr_knn, "val_f1_at_thr": bestf1_knn})
yh_hat_knn_thr = (yh_prob_knn >= thr_knn).astype(int)
plot_confusion(confusion_matrix(yh_te, yh_hat_knn_thr),
               ["NotCanceled","Canceled"],
               "Hotel: kNN Confusion @F1-threshold",
               os.path.join(OUT_HOTEL, "knn_confusion_f1thr.png"))

# MC for kNN: vary k
print("[Hotel/kNN] Model complexity sweep (k)…")
def knn_maker(k):
    """Build an unfitted hotel pipeline: cloned preprocessor + brute-force kNN with ``k`` neighbours."""
    neighbours = KNeighborsClassifier(n_neighbors=int(k), algorithm="brute", n_jobs=-1)
    steps = [("pre", clone(pre_hotel)), ("clf", neighbours)]
    return Pipeline(steps)
# Run the k sweep with 3-fold CV on the training split.
model_complexity_curve(knn_maker, "k", [3,5,11,21], Xh_tr, yh_tr, cv_h,
                       scorer=accuracy_score,
                       out_csv=os.path.join(OUT_HOTEL,"mc_knn.csv"),
                       title="Hotel: kNN Model-Complexity (k)",
                       out_png=os.path.join(OUT_HOTEL,"mc_knn.png"))

# Linear SVM (+ calibrated probabilities)
# LinearSVC is wrapped in CalibratedClassifierCV (3-fold) so the pipeline
# exposes predict_proba, which the ROC/PR/reliability code below relies on.
print("[Hotel/SVM-Lin] Fitting LinearSVC (+calibration)…")
lin_base = LinearSVC(C=1.0, dual=False, max_iter=20000, tol=1e-3, random_state=RANDOM_STATE)
lin_svm  = Pipeline([("pre", clone(pre_hotel)),
                     ("cal", CalibratedClassifierCV(lin_base, cv=3, n_jobs=-1))])

with Profiler("Hotel_LinSVM_fit") as prof_lsvm:
    lin_svm.fit(Xh_tr, yh_tr)
# Prediction is timed separately from the fit.
t0 = time.perf_counter()
yh_prob_lsvm = lin_svm.predict_proba(Xh_te)[:,1]
yh_hat_lsvm  = lin_svm.predict(Xh_te)
pred_s = time.perf_counter()-t0
m_lsvm, (fpr_lsvm, tpr_lsvm), (rec_lsvm, prec_lsvm) = summarize_classification(yh_te, yh_prob_lsvm, yh_hat_lsvm)
pd.DataFrame([m_lsvm]).to_csv(os.path.join(OUT_HOTEL, "linsvm_metrics.csv"), index=False)
plot_confusion(confusion_matrix(yh_te, yh_hat_lsvm), ["NotCanceled","Canceled"],
               "Hotel: Linear SVM Confusion @0.5", os.path.join(OUT_HOTEL, "linsvm_confusion.png"))
plot_line(fpr_lsvm, tpr_lsvm, "Hotel: Linear SVM ROC", "FPR", "TPR", os.path.join(OUT_HOTEL, "linsvm_roc.png"))
plot_line(rec_lsvm, prec_lsvm, "Hotel: Linear SVM PR", "Recall", "Precision", os.path.join(OUT_HOTEL, "linsvm_pr.png"))
plot_learning_curve_single(lin_svm, Xh_tr, yh_tr, "Hotel: Linear SVM Learning Curve",
                           os.path.join(OUT_HOTEL, "linsvm_learning_curve.png"), cv=cv_h)
plot_reliability(yh_te, yh_prob_lsvm, "Hotel: Linear SVM Reliability", os.path.join(OUT_HOTEL, "linsvm_reliability.png"))
save_profile(os.path.join(OUT_HOTEL, "linsvm_profile.json"),
             tag="Hotel_LinSVM_fit+predict", seconds_fit=prof_lsvm.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_lsvm.peak_gb, hardware=hw_info())

# Threshold by validation F1
# Threshold picked on validation probabilities, applied to the test probabilities.
print("[Hotel/SVM-Lin] Selecting threshold by val F1 (calibrated probs)…")
yh_prob_lsvm_val = lin_svm.predict_proba(Xh_va)[:, 1]
thr_lsvm, bestf1_lsvm = f1_optimal_threshold(yh_va, yh_prob_lsvm_val)
save_json_safe(os.path.join(OUT_HOTEL, "linsvm_val_threshold.json"),
               {"best_val_threshold": thr_lsvm, "val_f1_at_thr": bestf1_lsvm})
yh_hat_lsvm_thr = (yh_prob_lsvm >= thr_lsvm).astype(int)
cm_lsvm_thr = confusion_matrix(yh_te, yh_hat_lsvm_thr)
plot_confusion(cm_lsvm_thr, ["NotCanceled","Canceled"],
               "Hotel: Linear SVM Confusion @F1-threshold",
               os.path.join(OUT_HOTEL, "linsvm_confusion_f1thr.png"))

# Model-complexity sweep over C
print("[Hotel/SVM-Lin] Model complexity sweep (C)…")
def linsvm_maker(Cval):
    """Build an unfitted hotel pipeline: cloned preprocessor + uncalibrated LinearSVC with ``C=Cval``."""
    svc = LinearSVC(
        C=float(Cval),
        dual=False,
        max_iter=20000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", clone(pre_hotel)), ("lin", svc)]
    return Pipeline(steps)

# Run the C sweep with 3-fold CV on the training split (uncalibrated LinearSVC,
# since the sweep only needs hard predictions).
model_complexity_curve(
    pipe_maker=linsvm_maker,
    param_name="C",
    param_grid=[0.1, 1.0, 10.0],
    X=Xh_tr, y=yh_tr, cv=cv_h, scorer=accuracy_score,
    out_csv=os.path.join(OUT_HOTEL, "mc_linsvm.csv"),
    title="Hotel: Linear SVM Model-Complexity (C)",
    out_png=os.path.join(OUT_HOTEL, "mc_linsvm.png")
)

# Support-vector margin diagnostic
# A separate, uncalibrated LinearSVC is fitted on the training split; its
# signed decision_function values on the test split are saved as a .npy array.
# Note: this model uses dual="auto" and max_iter=50000, unlike lin_base above.
print("[Hotel/SVM-Lin] Margin diagnostic (LinearSVC on full train)…")
lin_svm_raw = Pipeline([("pre", clone(pre_hotel)), ("lin", LinearSVC(C=1.0, dual="auto", max_iter=50000, random_state=RANDOM_STATE))])
lin_svm_raw.fit(Xh_tr, yh_tr)
margins = lin_svm_raw.decision_function(Xh_te)
np.save(os.path.join(OUT_HOTEL,"linsvm_margins.npy"), margins)

# RBF SVM (cap ≤25k)
# Seeded random subsample without replacement; not stratified.
print("[Hotel/SVM-RBF] Subsampling train to ≤25k for RBF (nonlinear check)…")
cap = min(25000, len(Xh_tr))
sub_idx = np.random.RandomState(RANDOM_STATE).choice(len(Xh_tr), size=cap, replace=False)
Xh_tr_rbf, yh_tr_rbf = Xh_tr.iloc[sub_idx], yh_tr.iloc[sub_idx]

# probability=False: this model is scored via decision_function, not predict_proba.
rbf_svm = Pipeline([
    ("pre", clone(pre_hotel)),
    ("clf", SVC(kernel="rbf", C=2.0, gamma="scale", probability=False,
                cache_size=1000, random_state=RANDOM_STATE))
])

with Profiler("Hotel_RBFSVM_fit") as prof_rsvm:
    rbf_svm.fit(Xh_tr_rbf, yh_tr_rbf)

print("[Hotel/SVM-RBF] Scoring using decision_function (no probability calibration)…")
t0 = time.perf_counter()
scores_rbf = rbf_svm.decision_function(Xh_te)     # continuous scores
yh_hat_rbf = (scores_rbf >= 0).astype(int)        # 0-threshold for labels
pred_s = time.perf_counter() - t0

# Metrics are assembled by hand (instead of summarize_classification) because
# the ranking metrics use raw decision scores rather than probabilities.
fpr_rbf, tpr_rbf, _ = roc_curve(yh_te, scores_rbf)
roc_auc_rbf = auc(fpr_rbf, tpr_rbf)
prec_rbf, reca_rbf, _ = precision_recall_curve(yh_te, scores_rbf)
pr_auc_rbf = auc(reca_rbf, prec_rbf)
acc_rbf = accuracy_score(yh_te, yh_hat_rbf)
prc_rbf, rec_rbf, f1_rbf, _ = precision_recall_fscore_support(yh_te, yh_hat_rbf, average="binary", zero_division=0)

m_rbf = {"accuracy": float(acc_rbf), "precision": float(prc_rbf), "recall": float(rec_rbf), "f1": float(f1_rbf),
         "roc_auc": float(roc_auc_rbf), "pr_auc": float(pr_auc_rbf)}
pd.DataFrame([m_rbf]).to_csv(os.path.join(OUT_HOTEL, "rbfsvm_metrics.csv"), index=False)

plot_confusion(confusion_matrix(yh_te, yh_hat_rbf), ["NotCanceled","Canceled"],
               "Hotel: RBF SVM Confusion @0 threshold", os.path.join(OUT_HOTEL, "rbfsvm_confusion.png"))
plot_line(fpr_rbf, tpr_rbf, "Hotel: RBF SVM ROC", "FPR", "TPR", os.path.join(OUT_HOTEL, "rbfsvm_roc.png"))
plot_line(reca_rbf, prec_rbf, "Hotel: RBF SVM PR", "Recall", "Precision", os.path.join(OUT_HOTEL, "rbfsvm_pr.png"))
save_profile(os.path.join(OUT_HOTEL, "rbfsvm_profile.json"),
             tag="Hotel_RBFSVM_fit+predict", seconds_fit=prof_rsvm.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_rsvm.peak_gb, hardware=hw_info())

# F1-thresholding on decision_function
# Decision scores are not confined to [0, 1], so candidate thresholds are the
# 5%..95% quantiles of the validation scores rather than a fixed probability grid.
scores_rbf_val = rbf_svm.decision_function(Xh_va)
qs = np.linspace(0.05, 0.95, 19)
thr_grid = np.quantile(scores_rbf_val, qs)
from sklearn.metrics import f1_score
thr_rbf, bestf1_rbf = max(((thr, f1_score(yh_va, (scores_rbf_val >= thr).astype(int)))
                            for thr in thr_grid), key=lambda t: t[1])
thr_rbf, bestf1_rbf = float(thr_rbf), float(bestf1_rbf)
save_json_safe(os.path.join(OUT_HOTEL, "rbfsvm_val_threshold.json"),
               {"best_val_threshold_on_scores": thr_rbf, "val_f1_at_thr": bestf1_rbf})
yh_hat_rbf_thr = (scores_rbf >= thr_rbf).astype(int)
plot_confusion(confusion_matrix(yh_te, yh_hat_rbf_thr),
               ["NotCanceled","Canceled"],
               "Hotel: RBF SVM Confusion @F1-threshold",
               os.path.join(OUT_HOTEL, "rbfsvm_confusion_f1thr.png"))

# Learning curve on 25k subset
print("[Hotel/SVM-RBF] Learning curve on 25k subset…")
plot_learning_curve_single(rbf_svm, Xh_tr_rbf, yh_tr_rbf,
                           "Hotel: RBF SVM Learning Curve (25k subset)",
                           os.path.join(OUT_HOTEL, "rbfsvm_learning_curve.png"),
                           cv=cv_h)

# RBF MC sweep (gamma)
print("[Hotel/SVM-RBF] Model complexity sweep (gamma)…")
def rbf_maker(gamma):
    """Return an unfitted hotel pipeline whose RBF-kernel SVC uses ``gamma``.

    The pipeline gets its own clone of ``pre_hotel`` so repeated calls never
    share fitted preprocessing state.
    """
    preprocessor = clone(pre_hotel)
    classifier = SVC(
        kernel="rbf",
        C=2.0,
        gamma=gamma,
        probability=True,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", preprocessor), ("clf", classifier)]
    return Pipeline(steps)
# d = number of columns of the raw (pre-preprocessing) training frame, floored at 1
# so the divisions below are always defined.
d = max(1, Xh_tr.shape[1])
# Compare sklearn's "scale" heuristic with two fixed values derived from d.
gammas = ["scale", 1/d, 2/d]
# Sweep runs on the same training subset used to fit the RBF SVM above.
model_complexity_curve(rbf_maker, "gamma", gammas, Xh_tr_rbf, yh_tr_rbf, cv=cv_h,
                       scorer=accuracy_score,
                       out_csv=os.path.join(OUT_HOTEL,"mc_rbf.csv"),
                       title="Hotel: RBF SVM MC (gamma)",
                       out_png=os.path.join(OUT_HOTEL,"mc_rbf.png"))


# ==========================================================
# CHUNK 8a — HOTEL: MLP (SGD-only) + epoch curve [R7][R2]
# ==========================================================

def train_mlp_sgd_with_epoch_curve(X_tr, y_tr, X_va, y_va, out_csv, out_png, patience=3,
                                   hidden=(512, 512), max_epochs=15):
    """Train the hotel MLPClassifier with plain SGD, one epoch per ``fit`` call.

    The classifier is built with ``warm_start=True`` and ``max_iter=1``, so each
    ``fit`` call continues from the previous weights for one more pass. After
    every epoch the train/validation log-loss is recorded. Training stops once
    the validation log-loss has failed to improve by more than 1e-6 for
    ``patience`` consecutive epochs, or after ``max_epochs`` epochs.

    Side effects: writes the per-epoch table to ``out_csv``, the curve figure to
    ``out_png``, and ``mlp_param_budget.json`` into ``OUT_HOTEL``.

    Args:
        X_tr, y_tr: training features / labels (log-loss is computed with labels [0, 1]).
        X_va, y_va: validation features / labels used for the early-stopping check.
        out_csv: destination of the per-epoch log-loss table.
        out_png: destination of the epoch-curve figure.
        patience: consecutive non-improving epochs tolerated before stopping.
        hidden: hidden layer sizes. The same value feeds the parameter-budget
            audit, so the logged count always matches the trained architecture.
        max_epochs: upper bound on the number of epochs (default 15).

    Returns:
        The fitted Pipeline in its state after the LAST epoch that ran. Weights
        are not rolled back to the best epoch; the best epoch is only marked
        on the plot.

    Raises:
        ValueError: if ``max_epochs`` is smaller than 1.
    """
    if max_epochs < 1:
        raise ValueError(f"max_epochs must be >= 1, got {max_epochs!r}")
    hidden = tuple(int(h) for h in hidden)

    print(f"[Hotel/MLP] Epoch-by-epoch (SGD-only, ≤{max_epochs} epochs, early stopping)…")
    clf = Pipeline([("pre", clone(pre_hotel)),
                    ("clf", MLPClassifier(hidden_layer_sizes=hidden,  # shallow–wide by default
                                          solver="sgd", learning_rate_init=0.01,
                                          momentum=0.0, nesterovs_momentum=False,
                                          batch_size=1024, max_iter=1, random_state=RANDOM_STATE,
                                          warm_start=True, alpha=1e-4))])
    rows, best = [], {"epoch": 0, "val_logloss": np.inf}
    no_improve = 0
    for epoch in range(1, max_epochs + 1):
        clf.fit(X_tr, y_tr)  # one epoch (warm start keeps the previous weights)
        y_tr_proba = clf.predict_proba(X_tr)[:, 1]
        y_va_proba = clf.predict_proba(X_va)[:, 1]
        tr_ll = log_loss(y_tr, y_tr_proba, labels=[0, 1])
        va_ll = log_loss(y_va, y_va_proba, labels=[0, 1])
        rows.append({"epoch": epoch, "train_logloss": float(tr_ll), "val_logloss": float(va_ll)})
        # The 1e-6 margin keeps floating-point noise from counting as an improvement.
        if va_ll + 1e-6 < best["val_logloss"]:
            best.update({"epoch": epoch, "val_logloss": float(va_ll)})
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"[Hotel/MLP] Early stopping at epoch {epoch} (best={best['epoch']})")
                break

    df = pd.DataFrame(rows)
    df.to_csv(out_csv, index=False)

    plt.figure()
    plt.plot(df["epoch"], df["train_logloss"], marker="o", label="Train")
    plt.plot(df["epoch"], df["val_logloss"], marker="s", label="Validation")
    plt.axvline(best["epoch"], linestyle="--", label=f"best@{best['epoch']}")
    plt.title("Hotel: MLP (SGD) Epoch Curve")
    plt.xlabel("Epoch")
    plt.ylabel("LogLoss")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_png, bbox_inches="tight")
    plt.close()

    # param budget audit (should be in 0.2M–1.0M); uses the same `hidden` as the model above
    n_in = _estimate_input_dim(pre_hotel, X_tr)
    n_params = _estimate_mlp_params(n_in, hidden, 1)
    save_json_safe(os.path.join(OUT_HOTEL, "mlp_param_budget.json"),
                   {"estimated_input_dim": n_in, "param_count": n_params})
    return clf

# Fit (epoch-by-epoch training plus its curve/audit artifacts) is profiled as one unit.
with Profiler("Hotel_MLP_fit") as prof_mlp:
    mlp_h = train_mlp_sgd_with_epoch_curve(Xh_tr, yh_tr, Xh_va, yh_va,
                                           os.path.join(OUT_HOTEL,"mlp_epochs.csv"),
                                           os.path.join(OUT_HOTEL,"mlp_epochs.png"))

# Time ONLY test-set inference. The stop-watch is read before the validation
# scoring, threshold search, JSON write and plotting below, so seconds_predict
# in the saved profile reflects prediction cost alone.
t0 = time.perf_counter()
yh_prob_mlp = mlp_h.predict_proba(Xh_te)[:,1]
yh_hat_mlp  = mlp_h.predict(Xh_te)
pred_s = time.perf_counter() - t0

# Threshold is chosen on the validation split and then applied to the test probabilities.
print("[Hotel/MLP] Selecting threshold by val F1…")
yh_prob_mlp_val = mlp_h.predict_proba(Xh_va)[:, 1]
thr_mlp, bestf1_mlp = f1_optimal_threshold(yh_va, yh_prob_mlp_val)
save_json_safe(os.path.join(OUT_HOTEL, "mlp_val_threshold.json"),
               {"best_val_threshold": thr_mlp, "val_f1_at_thr": bestf1_mlp})
yh_hat_mlp_thr = (yh_prob_mlp >= thr_mlp).astype(int)
cm_mlp_thr = confusion_matrix(yh_te, yh_hat_mlp_thr)
plot_confusion(cm_mlp_thr, ["NotCanceled","Canceled"],
               "Hotel: MLP Confusion @F1-threshold",
               os.path.join(OUT_HOTEL, "mlp_confusion_f1thr.png"))

# Headline metrics and curves use the model's own predict() labels and the test probabilities.
m_mlp, (fpr_mlp, tpr_mlp), (rec_mlp, prec_mlp) = summarize_classification(yh_te, yh_prob_mlp, yh_hat_mlp)
pd.DataFrame([m_mlp]).to_csv(os.path.join(OUT_HOTEL, "mlp_metrics.csv"), index=False)
plot_confusion(confusion_matrix(yh_te, yh_hat_mlp), ["NotCanceled","Canceled"],
               "Hotel: MLP Confusion @0.5", os.path.join(OUT_HOTEL, "mlp_confusion.png"))
plot_line(fpr_mlp, tpr_mlp, "Hotel: MLP ROC", "FPR", "TPR", os.path.join(OUT_HOTEL, "mlp_roc.png"))
plot_line(rec_mlp, prec_mlp, "Hotel: MLP PR", "Recall", "Precision", os.path.join(OUT_HOTEL, "mlp_pr.png"))
plot_learning_curve_single(mlp_h, Xh_tr, yh_tr, "Hotel: MLP Learning Curve",
                           os.path.join(OUT_HOTEL, "mlp_learning_curve.png"), cv=cv_h)
plot_reliability(yh_te, yh_prob_mlp, "Hotel: MLP Reliability", os.path.join(OUT_HOTEL, "mlp_reliability.png"))
save_profile(os.path.join(OUT_HOTEL, "mlp_profile.json"),
             tag="Hotel_MLP_fit+predict", seconds_fit=prof_mlp.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_mlp.peak_gb, hardware=hw_info())


# =========================================================
# CHUNK 8b — NN Activation Study (Hotel; SGD only)
# =========================================================

# Output folder for the activation study; created up front so the functions
# below can write into it unconditionally.
ACT_DIR = os.path.join(OUT_HOTEL, "nn_activation_study")
os.makedirs(ACT_DIR, exist_ok=True)

def train_mlp_sgd_epoch_curve_activation(
    activation: str,
    X_tr, y_tr, X_va, y_va,
    hidden=(256, 256),        # fixed architecture across activations (shallow–wide)
    alpha=1e-4,               # fixed regularization
    lr=0.01,                  # fixed learning rate
    batch_size=1024,          # fixed batch size
    max_epochs=15,            # ≤15
    random_state=RANDOM_STATE
):
    """Train one SGD MLP with the given activation and record per-epoch metrics.

    The classifier uses ``warm_start=True`` with ``max_iter=1``, so each
    ``fit`` call adds one epoch on top of the previous weights. There is no
    early stopping: exactly ``max_epochs`` epochs are run.

    Side effects: writes ``epochs_<activation>.csv`` and
    ``epochs_<activation>.png`` into ``ACT_DIR``.

    Args:
        activation: MLPClassifier activation name (e.g. "relu", "tanh").
        X_tr, y_tr: training features / labels (log-loss uses labels [0, 1]).
        X_va, y_va: validation features / labels.
        hidden, alpha, lr, batch_size, random_state: forwarded to MLPClassifier.
        max_epochs: number of epochs to run; must be >= 1.

    Returns:
        (pipe, df): the fitted Pipeline and a DataFrame with one row per epoch
        (train/val log-loss and accuracy at a 0.5 probability threshold).

    Raises:
        ValueError: if ``max_epochs`` is smaller than 1.
    """
    if max_epochs < 1:
        raise ValueError(f"max_epochs must be >= 1, got {max_epochs!r}")

    print(f"[NN-Act] Training activation='{activation}' with SGD for {max_epochs} epochs…")
    pipe = Pipeline([
        ("pre", clone(pre_hotel)),
        ("clf", MLPClassifier(
            hidden_layer_sizes=hidden,
            activation=activation,
            solver="sgd",
            learning_rate_init=lr,
            learning_rate="constant",
            momentum=0.0, nesterovs_momentum=False,
            alpha=alpha,
            batch_size=batch_size,
            max_iter=1,
            warm_start=True,
            random_state=random_state,
            early_stopping=False
        ))
    ])

    rows = []
    for epoch in range(1, max_epochs + 1):
        pipe.fit(X_tr, y_tr)  # one epoch
        p_tr = pipe.predict_proba(X_tr)[:, 1]
        p_va = pipe.predict_proba(X_va)[:, 1]
        yhat_tr = (p_tr >= 0.5).astype(int)
        yhat_va = (p_va >= 0.5).astype(int)
        tr_ll = log_loss(y_tr, p_tr, labels=[0, 1])
        va_ll = log_loss(y_va, p_va, labels=[0, 1])
        tr_acc = accuracy_score(y_tr, yhat_tr)
        va_acc = accuracy_score(y_va, yhat_va)
        rows.append({"epoch": epoch, "activation": activation,
                     "train_logloss": float(tr_ll), "val_logloss": float(va_ll),
                     "train_acc": float(tr_acc), "val_acc": float(va_acc)})
        # Report at the usual checkpoints and always at the final epoch, so the
        # last state is logged even when max_epochs is not 15.
        if epoch in (1, 5, 10, 15) or epoch == max_epochs:
            print(f"[NN-Act] {activation}: epoch {epoch:>2} | val_logloss={va_ll:.4f} | val_acc={va_acc:.4f}")

    df = pd.DataFrame(rows)
    csv_path = os.path.join(ACT_DIR, f"epochs_{activation}.csv")
    png_path = os.path.join(ACT_DIR, f"epochs_{activation}.png")
    df.to_csv(csv_path, index=False)

    plt.figure()
    plt.plot(df["epoch"], df["val_logloss"], marker="o", label=f"{activation} (val logloss)")
    plt.plot(df["epoch"], df["train_logloss"], marker="s", label=f"{activation} (train logloss)")
    plt.title(f"Hotel NN (SGD) — Epoch Curve — {activation}")
    plt.xlabel("Epoch"); plt.ylabel("LogLoss"); plt.legend()
    plt.tight_layout(); plt.savefig(png_path, bbox_inches="tight"); plt.close()
    print(f"[NN-Act] Saved curves for {activation} → {csv_path} / {png_path}")
    return pipe, df

def _save_activation_overlay(curves, column, marker, title, ylabel, filename):
    """Plot one per-epoch column for every activation on a shared axis and save it in ACT_DIR."""
    plt.figure()
    for act, epoch_df in curves.items():
        plt.plot(epoch_df["epoch"], epoch_df[column], marker=marker, label=f"{act}")
    plt.title(title)
    plt.xlabel("Epoch"); plt.ylabel(ylabel); plt.legend()
    plt.tight_layout(); plt.savefig(os.path.join(ACT_DIR, filename), bbox_inches="tight"); plt.close()


def activation_study_hotel(X_tr, y_tr, X_va, y_va, X_te, y_te):
    """Compare MLP activations on the hotel data under an otherwise fixed SGD setup.

    For each of relu / tanh / logistic / identity this trains a model via
    ``train_mlp_sgd_epoch_curve_activation``, scores it on the test split at a
    0.5 probability threshold, and saves per-activation ROC and PR plots.
    It then writes ``activation_comparison.csv`` (sorted by test ROC-AUC,
    best first) and two overlay figures (validation log-loss and validation
    accuracy per epoch). All artifacts go to ``ACT_DIR``.

    The per-epoch frames returned by the trainer are kept in memory and reused
    for the overlays, so the plots do not depend on re-reading the CSV files.

    Args:
        X_tr, y_tr: training split.
        X_va, y_va: validation split (per-epoch curves).
        X_te, y_te: test split (final comparison table and ROC/PR plots).

    Returns:
        None.
    """
    activations = ["relu", "tanh", "logistic", "identity"]
    results = []
    epoch_curves = {}  # activation -> per-epoch DataFrame, in training order
    for act in activations:
        model, epoch_df = train_mlp_sgd_epoch_curve_activation(
            activation=act, X_tr=X_tr, y_tr=y_tr, X_va=X_va, y_va=y_va,
            hidden=(256, 256), alpha=1e-4, lr=0.01, batch_size=1024, max_epochs=15
        )
        epoch_curves[act] = epoch_df

        p_te = model.predict_proba(X_te)[:, 1]
        yhat_te = (p_te >= 0.5).astype(int)
        acc = accuracy_score(y_te, yhat_te)
        prec, rec, f1, _ = precision_recall_fscore_support(y_te, yhat_te, average="binary", zero_division=0)
        fpr, tpr, _ = roc_curve(y_te, p_te); roc_auc = auc(fpr, tpr)
        # precision_recall_curve returns (precision, recall, thresholds)
        prec_curve, reca_curve, _ = precision_recall_curve(y_te, p_te); pr_auc = auc(reca_curve, prec_curve)
        results.append({"activation": act, "test_accuracy": float(acc), "test_precision": float(prec),
                        "test_recall": float(rec), "test_f1": float(f1),
                        "test_roc_auc": float(roc_auc), "test_pr_auc": float(pr_auc)})

        plt.figure(); plt.plot(fpr, tpr, label=f"{act} (AUC={roc_auc:.3f})")
        plt.title(f"Hotel NN — ROC ({act})"); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.legend()
        plt.tight_layout(); plt.savefig(os.path.join(ACT_DIR, f"roc_{act}.png"), bbox_inches="tight"); plt.close()
        plt.figure(); plt.plot(reca_curve, prec_curve, label=f"{act} (PR AUC={pr_auc:.3f})")
        plt.title(f"Hotel NN — PR ({act})"); plt.xlabel("Recall"); plt.ylabel("Precision"); plt.legend()
        plt.tight_layout(); plt.savefig(os.path.join(ACT_DIR, f"pr_{act}.png"), bbox_inches="tight"); plt.close()

    comp = pd.DataFrame(results).sort_values("test_roc_auc", ascending=False)
    comp_path = os.path.join(ACT_DIR, "activation_comparison.csv")
    comp.to_csv(comp_path, index=False)
    print(f"[NN-Act] Wrote comparison table → {comp_path}")

    # overlays
    _save_activation_overlay(epoch_curves, "val_logloss", "o",
                             "Hotel NN (SGD) — Validation LogLoss by Activation",
                             "Val LogLoss", "epochs_overlay_val_logloss.png")
    _save_activation_overlay(epoch_curves, "val_acc", "s",
                             "Hotel NN (SGD) — Validation Accuracy by Activation",
                             "Val Accuracy", "epochs_overlay_val_acc.png")

# Run the activation comparison on the hotel train/validation/test splits;
# every artifact is written under ACT_DIR.
print("[NN-Act] Starting activation study on Hotel (SGD only)…")
activation_study_hotel(Xh_tr, yh_tr, Xh_va, yh_va, Xh_te, yh_te)
print("[NN-Act] Activation study complete.")


[Hotel/kNN] Fitting exact kNN (brute)…
[Profiler] START: Hotel_kNN_fit
[Profiler] END: Hotel_kNN_fit | sec=0.27 | peakGB=1.031
[Plot] Learning curve: Hotel: kNN Learning Curve
[Plot] Reliability: Hotel: kNN Reliability
[Hotel/kNN] Model complexity sweep (k)…
[Hotel/SVM-Lin] Fitting LinearSVC (+calibration)…
[Profiler] START: Hotel_LinSVM_fit
[Profiler] END: Hotel_LinSVM_fit | sec=1.83 | peakGB=1.144
[Plot] Learning curve: Hotel: Linear SVM Learning Curve
[Plot] Reliability: Hotel: Linear SVM Reliability
[Hotel/SVM-Lin] Selecting threshold by val F1 (calibrated probs)…
[Hotel/SVM-Lin] Model complexity sweep (C)…
[Hotel/SVM-Lin] Margin diagnostic (LinearSVC on full train)…
[Hotel/SVM-RBF] Subsampling train to ≤25k for RBF (nonlinear check)…
[Profiler] START: Hotel_RBFSVM_fit
[Profiler] END: Hotel_RBFSVM_fit | sec=12.50 | peakGB=2.223
[Hotel/SVM-RBF] Scoring using decision_function (no probability calibration)…
[Hotel/SVM-RBF] Learning curve on 25k subset…
[Plot] Learning curve: Hotel: RB

In [7]:
# %%
# ==========================================================
# CHUNK 8c — HOTEL: NN (SGD) model‑complexity curve — width sweep
# Keeps ≤15 epochs; annotates param count; uses same preprocessing.
# ==========================================================

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
from sklearn.metrics import accuracy_score

# All width-sweep artifacts go to a dedicated sub-folder of the hotel outputs.
WIDTH_DIR = os.path.join(OUT_HOTEL, "nn_width_sweep")
os.makedirs(WIDTH_DIR, exist_ok=True)

# 1) Candidate widths for a two-layer MLP [w, w]; keep only those whose
#    estimated parameter count lies inside the budget.
n_in = _estimate_input_dim(pre_hotel, Xh_tr)
PARAM_MIN, PARAM_MAX = 200_000, 1_000_000  # spec budget (0.2M–1.0M)

candidates = [128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024]
param_map = {}
for w in candidates:
    p = _estimate_mlp_params(n_in, (w, w), 1)  # binary head
    if not (PARAM_MIN <= p <= PARAM_MAX):
        continue  # outside the parameter budget
    param_map[w] = p
# dict preserves insertion order, so this matches the order of `candidates`
width_grid = list(param_map)

print("[NN-MC] Candidate widths within 0.2M–1.0M params:")
for w, p in param_map.items():
    print(f"  w={w:>4} → ~{p:,} params (n_in≈{n_in})")

# 2) Factory for the pipeline at a given width.
#    Activation: choose one and hold it fixed for the sweep.
#    (Use the best from your activation study; 'tanh' is a good default for tabular.)
def mlp_width_maker(w, activation="tanh"):
    """Return an unfitted hotel pipeline with a two-hidden-layer MLP of width ``w``.

    Steps: cloned ``pre_hotel`` preprocessor → ``to_dense32`` conversion →
    SGD-trained MLPClassifier with ``(w, w)`` hidden units and the given
    ``activation``. All other hyper-parameters are fixed for the sweep.
    """
    width = int(w)
    mlp_settings = dict(
        hidden_layer_sizes=(width, width),
        activation=activation,
        solver="sgd",
        learning_rate="constant",
        learning_rate_init=0.01,
        momentum=0.0,
        nesterovs_momentum=False,
        alpha=1e-4,
        batch_size=1024,
        max_iter=15,                # ≤ 15 epochs per spec
        early_stopping=False,       # CV is done by the caller
        tol=1e-4,
        random_state=RANDOM_STATE,
        verbose=False,
    )
    steps = [
        ("pre", clone(pre_hotel)),
        ("dense32", FunctionTransformer(to_dense32, accept_sparse=True)),
        ("clf", MLPClassifier(**mlp_settings)),
    ]
    return Pipeline(steps)

# 3) Evaluate each width with 3‑fold StratifiedKFold (cv_h defined earlier).
# Fail early with a clear message: with no widths in budget the table below
# would have no "width" column and sort_values would raise an opaque KeyError.
if not width_grid:
    raise RuntimeError(
        "[NN-MC] No candidate width falls inside the parameter budget "
        f"[{PARAM_MIN:,}, {PARAM_MAX:,}] for n_in={n_in}; adjust `candidates` or the budget."
    )

rows = []
for w in width_grid:
    pipe = mlp_width_maker(w, activation="tanh")  # or "identity"/"relu"/"logistic" if you prefer
    tr_scores, va_scores = [], []
    for tr_idx, va_idx in cv_h.split(Xh_tr, yh_tr):
        Xtr, Xva = Xh_tr.iloc[tr_idx], Xh_tr.iloc[va_idx]
        ytr, yva = yh_tr.iloc[tr_idx], yh_tr.iloc[va_idx]
        # No warm_start here, so each fit starts from freshly initialized weights.
        pipe.fit(Xtr, ytr)
        yhat_tr = pipe.predict(Xtr)
        yhat_va = pipe.predict(Xva)
        tr_scores.append(accuracy_score(ytr, yhat_tr))
        va_scores.append(accuracy_score(yva, yhat_va))
    # One row per width: fold-averaged accuracies plus the estimated parameter count.
    rows.append({
        "width": int(w),
        "train_accuracy": float(np.mean(tr_scores)),
        "val_accuracy": float(np.mean(va_scores)),
        "params": int(param_map[w])
    })

df = pd.DataFrame(rows).sort_values("width")
csv_path = os.path.join(WIDTH_DIR, "mc_mlp_width.csv")
df.to_csv(csv_path, index=False)
print(f"[NN-MC] Wrote: {csv_path}")

# 4) Plot (training + validation). Annotate each point with param count (in M).
plt.figure()
plt.plot(df["width"], df["val_accuracy"], marker="o", label="Validation")
plt.plot(df["width"], df["train_accuracy"], marker="s", label="Training")
for w, va, p in zip(df["width"], df["val_accuracy"], df["params"]):
    plt.text(w, va, f"{p/1e6:.2f}M", ha="center", va="bottom", fontsize=8)

plt.title("Hotel: NN (SGD) Model‑Complexity — width (two‑layer [w,w])")
plt.xlabel("Hidden width w")
plt.ylabel("Accuracy")
plt.legend()
out_png = os.path.join(WIDTH_DIR, "mc_mlp_width.png")
plt.tight_layout(); plt.savefig(out_png, bbox_inches="tight"); plt.close()
print(f"[NN-MC] Saved plot → {out_png}")

# Optional: to plot F1 on the y‑axis instead, replace accuracy_score with f1_score on the
# predicted labels (or on predict_proba[:, 1] ≥ 0.5); for ROC‑AUC/PR‑AUC, score predict_proba[:, 1] instead.


[NN-MC] Candidate widths within 0.2M–1.0M params:
  w= 448 → ~232,065 params (n_in≈67)
  w= 512 → ~297,985 params (n_in≈67)
  w= 640 → ~454,401 params (n_in≈67)
  w= 768 → ~643,585 params (n_in≈67)
  w= 896 → ~865,537 params (n_in≈67)
[NN-MC] Wrote: outputs/hotel_cls/nn_width_sweep/mc_mlp_width.csv
[NN-MC] Saved plot → outputs/hotel_cls/nn_width_sweep/mc_mlp_width.png


In [8]:
# =======================================================
# CHUNK 9 — ACCIDENTS load + time holdout + preprocessor
# =======================================================

print("[Accidents] Streaming columns with Polars…")
# Columns we would like to use; any that are absent from the CSV header are skipped below.
PREFER_ACC = [
    "Start_Time","End_Time",
    "Distance(mi)","Temperature(F)","Humidity(%)","Pressure(in)",
    "Visibility(mi)","Wind_Speed(mph)",
    "Crossing","Junction","Traffic_Signal",
    "Sunrise_Sunset","Civil_Twilight","Amenity","Bump","No_Exit","Side",
    "State"
]

# Read only a few rows to discover which preferred columns exist in this file.
head_cols = pl.read_csv(ACC_CSV, n_rows=5).columns
use_cols = [c for c in PREFER_ACC if c in head_cols]

# Row cap: the plan below keeps the first `sample_rows` rows in file order.
sample_rows = 1000000
print(f"[Accidents] Selecting columns={len(use_cols)}; streaming up to {sample_rows:,} rows…")

# Build a lazy plan:
# NOTE: the plan references "Start_Time" and "End_Time" unconditionally, so both
# must be present in the CSV even though use_cols is filtered.
acc_pl = (
    pl.scan_csv(ACC_CSV, infer_schema_length=5000, null_values=CSV_NULLS)
      .select([pl.col(c) for c in use_cols])
      # Parse datetimes for time-based splitting & duration target
      .with_columns(
          pl.col("Start_Time").str.strptime(pl.Datetime, strict=False).alias("_start_dt"),
          pl.col("End_Time").str.strptime(pl.Datetime, strict=False).alias("_end_dt"),
      )
      # Cast known numeric columns to floats (robust to parse errors)
      .with_columns([
          pl.col(c).cast(pl.Float64, strict=False)
          for c in ["Distance(mi)","Temperature(F)","Humidity(%)","Pressure(in)",
                    "Visibility(mi)","Wind_Speed(mph)"]
          if c in use_cols
      ])
      # Create duration target (minutes), clipped to [1, 1440], and sortable timestamp
      # (values below 1 minute, including negative ones, become 1.0; values above 24h become 1440.0)
      .with_columns(
          ((pl.col("_end_dt") - pl.col("_start_dt")).dt.total_seconds() / 60.0)
            .pipe(lambda s: pl.when(s < 1.0).then(1.0)
                   .when(s > 24.0*60.0).then(24.0*60.0)
                   .otherwise(s))
            .alias("duration_minutes"),
          pl.col("_start_dt").dt.epoch("s").alias("_start_ts")
      )
      .head(sample_rows)  # first N rows efficiently
)

# Materialize
acc_df = acc_pl.collect()
print(f"[Accidents] Loaded rows={acc_df.height:,}, cols={acc_df.width}")

# Convert to pandas + clean NA
acc = acc_df.to_pandas(use_pyarrow_extension_array=False).replace({pd.NA: np.nan})

# Keep rows with valid duration
acc = acc.loc[acc["duration_minutes"].notna()].reset_index(drop=True)

# Sort by time and split 70/15/15 (time-aware holdout)
# _start_ts is recomputed here in nanoseconds, replacing the seconds-based
# column created in the Polars plan.
acc["_start_ts"] = pd.to_datetime(acc["_start_dt"]).astype("int64")  # ns since epoch
acc_sorted = acc.sort_values("_start_ts").reset_index(drop=True)
n = len(acc_sorted)
n_test = int(0.15 * n); n_val = int(0.15 * n)

# Oldest rows → train, next block → validation, most recent block → test.
acc_tr = acc_sorted.iloc[: n - (n_val + n_test)]
acc_va = acc_sorted.iloc[n - (n_val + n_test) : n - n_test]
acc_te = acc_sorted.iloc[n - n_test : ]

# Features/targets; drop time columns after splitting (avoid leakage)
drop_time_cols = [c for c in ["Start_Time","End_Time","_start_dt","_end_dt","_start_ts"] if c in acc_tr.columns]

X_acc_tr = acc_tr.drop(columns=drop_time_cols + ["duration_minutes"])
y_acc_tr = acc_tr["duration_minutes"].astype("float32")
X_acc_va = acc_va.drop(columns=drop_time_cols + ["duration_minutes"])
y_acc_va = acc_va["duration_minutes"].astype("float32")
X_acc_te = acc_te.drop(columns=drop_time_cols + ["duration_minutes"])
y_acc_te = acc_te["duration_minutes"].astype("float32")

# Small evaluation subset for heavy predictors (≤25k) within the test time block
# (the FIRST eval_cap rows of the time-sorted test block, not a random sample)
eval_cap = min(25000, len(X_acc_te))
X_acc_te_eval = X_acc_te.iloc[:eval_cap]
y_acc_te_eval = y_acc_te.iloc[:eval_cap]
save_json_safe(os.path.join(OUT_ACC, "eval_subset.json"),
               {"eval_cap": int(eval_cap), "test_total": int(len(X_acc_te))})

# Keep full X/y for schema
X_acc = acc_sorted.drop(columns=drop_time_cols + ["duration_minutes"])
y_acc = acc_sorted["duration_minutes"].astype("float32")

# Preprocessor for regression
# NOTE(review): make_preprocessor receives all rows (train+val+test); presumably it
# only inspects column names/dtypes — confirm it does not fit statistics here.
pre_acc = make_preprocessor(
    pd.concat([X_acc, y_acc.rename("duration_minutes")], axis=1),
    target="duration_minutes",
    task="regression"
)

# Row-count audit: rows loaded, rows kept after the duration filter, and split sizes.
acc_counts = dict(
    tag="Accidents",
    raw_rows=int(len(acc_df)),
    cleaned_rows=int(len(acc_sorted)),
    train_rows=int(len(X_acc_tr)),
    val_rows=int(len(X_acc_va)),
    test_rows=int(len(X_acc_te)),
)
save_json_safe(os.path.join(OUT_ACC, "counts.json"), acc_counts)
print(f"[Accidents] Counts: {acc_counts}")


# ========================================================
# CHUNK 10 — ACCIDENTS: Decision Tree Regressor [R4][R2]
# ========================================================

# Shared CV splitter for the accidents regression diagnostics.
# NOTE(review): folds are shuffled, i.e. not time-ordered like the holdout split — confirm this is intended.
cv_reg = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

print("[Accidents/DTR] Fitting DecisionTreeRegressor…")
dtr = Pipeline([("pre", clone(pre_acc)),
                ("reg", DecisionTreeRegressor(
                    max_depth=18,            # cap for large set [R2]
                    min_samples_leaf=100,
                    min_samples_split=200,
                    random_state=RANDOM_STATE
                ))])

# Profile the fit on the full training block; time prediction on the full test block separately.
with Profiler("Acc_DTR_fit") as prof_dtr:
    dtr.fit(X_acc_tr, y_acc_tr)
t0 = time.perf_counter(); yhat_dtr = dtr.predict(X_acc_te); pred_s = time.perf_counter()-t0
m_dtr = summarize_regression(y_acc_te, yhat_dtr)
pd.DataFrame([m_dtr]).to_csv(os.path.join(OUT_ACC, "dtr_metrics.csv"), index=False)
plot_parity(y_acc_te.values, yhat_dtr, "Accidents: DTR Parity", os.path.join(OUT_ACC, "dtr_parity.png"))
plot_residuals(y_acc_te.values, yhat_dtr, "Accidents: DTR Residuals", os.path.join(OUT_ACC, "dtr_residuals.png"))
plot_learning_curve_single(dtr, X_acc_tr, y_acc_tr, "Accidents: DTR Learning Curve",
                           os.path.join(OUT_ACC, "dtr_learning_curve.png"), cv=cv_reg)

# Realized stats
# (actual size of the fitted tree, to compare against the max_depth / leaf-size caps above)
tree_r = dtr.named_steps["reg"]
realized_r = {
    "depth": int(tree_r.get_depth()),
    "leaves": int(tree_r.get_n_leaves()),
    "nodes": int(tree_r.tree_.node_count),
}
save_json_safe(os.path.join(OUT_ACC, "dtr_realized_stats.json"), realized_r)
save_profile(os.path.join(OUT_ACC, "dtr_profile.json"),
             tag="Acc_DTR_fit+predict", seconds_fit=prof_dtr.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_dtr.peak_gb, hardware=hw_info())

# Permutation importances (top-10 by neg-MAE)
# computed on the test block with the already-fitted pipeline
imp_dtr = permutation_importance_topk(dtr, X_acc_te, y_acc_te, scoring="neg_mean_absolute_error",
                                      k=10, n_repeats=10)
imp_dtr.to_csv(os.path.join(OUT_ACC, "dtr_perm_importance_top10.csv"), index=False)



[Accidents] Streaming columns with Polars…
[Accidents] Selecting columns=17; streaming up to 1,000,000 rows…
[Accidents] Loaded rows=1,000,000, cols=21
[Accidents] Counts: {'tag': 'Accidents', 'raw_rows': 1000000, 'cleaned_rows': 1000000, 'train_rows': 700000, 'val_rows': 150000, 'test_rows': 150000}
[Accidents/DTR] Fitting DecisionTreeRegressor…
[Profiler] START: Acc_DTR_fit
[Profiler] END: Acc_DTR_fit | sec=11.41 | peakGB=3.488
[Plot] Parity: Accidents: DTR Parity
[Plot] Residuals: Accidents: DTR Residuals
[Plot] Learning curve: Accidents: DTR Learning Curve


In [9]:
# =======================================================
# CHUNK 11 — ACCIDENTS: kNN Regressor (caps) [R5][R2]
# =======================================================

# Cap the kNN training set: draw up to 250k training rows without replacement,
# using a RandomState seeded with RANDOM_STATE.
cap_train = min(250000, len(X_acc_tr))
print(f"[Accidents/kNN] Subsample train to {cap_train:,} (exact brute)…")
idx_sub = np.random.RandomState(RANDOM_STATE).choice(len(X_acc_tr), size=cap_train, replace=False)
Xa_knn, ya_knn = X_acc_tr.iloc[idx_sub], y_acc_tr.iloc[idx_sub]

# Exact (brute-force) 21-nearest-neighbour regressor behind a fresh copy of the accidents preprocessor.
knr = Pipeline([("pre", clone(pre_acc)),
                ("reg", KNeighborsRegressor(n_neighbors=21, algorithm="brute", n_jobs=-1))])

with Profiler("Acc_kNNReg_fit") as prof_knr:
    knr.fit(Xa_knn, ya_knn)

def predict_in_chunks(model, X, chunk=5000):
    """Predict on ``X`` in consecutive row slices and concatenate the results.

    Slicing keeps the working set of each ``model.predict`` call bounded to
    ``chunk`` rows.

    Args:
        model: any object exposing ``predict(X_slice)``.
        X: row-indexable input supporting ``len`` and ``.iloc`` slicing.
        chunk: maximum number of rows per ``predict`` call; must be >= 1.

    Returns:
        One NumPy array with the predictions in row order. For empty ``X`` an
        empty array is returned and ``model.predict`` is never called
        (``np.concatenate`` would otherwise raise on an empty list).

    Raises:
        ValueError: if ``chunk`` is smaller than 1.
    """
    if chunk < 1:
        raise ValueError(f"chunk must be a positive integer, got {chunk!r}")
    n_rows = len(X)
    if n_rows == 0:
        return np.empty(0)
    parts = [model.predict(X.iloc[start:start + chunk]) for start in range(0, n_rows, chunk)]
    return np.concatenate(parts)

# Score only the capped test subset (X_acc_te_eval), predicting 5,000 rows at a time.
t0 = time.perf_counter(); yhat_knr = predict_in_chunks(knr, X_acc_te_eval, chunk=5000); pred_s = time.perf_counter()-t0
m_knr = summarize_regression(y_acc_te_eval, yhat_knr)
pd.DataFrame([m_knr]).to_csv(os.path.join(OUT_ACC, "knn_metrics.csv"), index=False)
plot_parity(y_acc_te_eval.values, yhat_knr, "Accidents: kNN Parity (≤25k eval)", os.path.join(OUT_ACC, "knn_parity.png"))
plot_residuals(y_acc_te_eval.values, yhat_knr, "Accidents: kNN Residuals (≤25k eval)", os.path.join(OUT_ACC, "knn_residuals.png"))
print("[Accidents/kNN] Learning curve (on subsample)…")

# Smaller LC-only slice of the TRAIN set (diagnostic, not the final model)
# Drawn independently of idx_sub above (fresh RandomState with the same seed, different size).
lc_cap = min(40000, len(X_acc_tr))  # if still slow, drop to 20000
rng = np.random.RandomState(RANDOM_STATE)
lc_idx = rng.choice(len(X_acc_tr), size=lc_cap, replace=False)
X_lc, y_lc = X_acc_tr.iloc[lc_idx], y_acc_tr.iloc[lc_idx]

# Coarser grid & fewer folds
cv_lc = KFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

plot_learning_curve_single(
    knr, X_lc, y_lc,
    title="Accidents: kNNReg Learning Curve (subsample)",
    outpath=os.path.join(OUT_ACC, "knn_learning_curve.png"),
    cv=cv_lc,
    train_sizes=np.linspace(0.2, 1.0, 4),    # 4 points
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

# Fit time / peak memory come from the profiler; predict time is the chunked scoring above.
save_profile(os.path.join(OUT_ACC, "knn_profile.json"),
             tag="Acc_kNNReg_fit+predict", seconds_fit=prof_knr.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_knr.peak_gb, hardware=hw_info())




[Accidents/kNN] Subsample train to 250,000 (exact brute)…
[Profiler] START: Acc_kNNReg_fit
[Profiler] END: Acc_kNNReg_fit | sec=0.65 | peakGB=3.078
[Plot] Parity: Accidents: kNN Parity (≤25k eval)
[Plot] Residuals: Accidents: kNN Residuals (≤25k eval)
[Accidents/kNN] Learning curve (on subsample)…
[Plot] Learning curve: Accidents: kNNReg Learning Curve (subsample)


In [10]:
# ====================================================
# CHUNK 12 — ACCIDENTS: SVR (RBF) + linear baseline
# ====================================================

# Training cap for the kernel SVR: a seeded random subsample of at most 50k training rows.
cap_svr = min(50000, len(X_acc_tr))  # tightened to meet ≤100k kernel SVM rule
print(f"[Accidents/SVR] Subsample train to {cap_svr:,} for RBF SVR…")
idx_svr = np.random.RandomState(RANDOM_STATE).choice(len(X_acc_tr), size=cap_svr, replace=False)
Xa_svr, ya_svr = X_acc_tr.iloc[idx_svr], y_acc_tr.iloc[idx_svr]

# RBF-kernel SVR (C=1.0, remaining hyper-parameters left at sklearn defaults).
svr = Pipeline([("pre", clone(pre_acc)),
                ("reg", SVR(C=1.0, kernel="rbf"))])

with Profiler("Acc_SVR_fit") as prof_svr:
    svr.fit(Xa_svr, ya_svr)
# Scored on the capped test subset only.
t0 = time.perf_counter(); yhat_svr = svr.predict(X_acc_te_eval); pred_s = time.perf_counter()-t0
m_svr = summarize_regression(y_acc_te_eval, yhat_svr)
pd.DataFrame([m_svr]).to_csv(os.path.join(OUT_ACC, "svr_metrics.csv"), index=False)
plot_parity(y_acc_te_eval.values, yhat_svr, "Accidents: SVR Parity (≤25k eval)", os.path.join(OUT_ACC, "svr_parity.png"))
plot_residuals(y_acc_te_eval.values, yhat_svr, "Accidents: SVR Residuals (≤25k eval)", os.path.join(OUT_ACC, "svr_residuals.png"))

print("[Accidents/SVR] Learning curve (on subsample)…")
# The learning curve uses its own, smaller (≤25k) seeded subsample of the training block.
svr_lc_cap = min(25000, len(X_acc_tr))
svr_lc_idx = np.random.RandomState(RANDOM_STATE).choice(len(X_acc_tr), size=svr_lc_cap, replace=False)
X_svr_lc, y_svr_lc = X_acc_tr.iloc[svr_lc_idx], y_acc_tr.iloc[svr_lc_idx]

cv_svr_lc = KFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

plot_learning_curve_single(
    svr, X_svr_lc, y_svr_lc,
    title="Accidents: SVR(RBF) Learning Curve (subsample)",
    outpath=os.path.join(OUT_ACC, "svr_learning_curve.png"),
    cv=cv_svr_lc,
    train_sizes=np.linspace(0.25, 1.0, 3),   # 3 points
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)
save_profile(os.path.join(OUT_ACC, "svr_profile.json"),
             tag="Acc_SVR_fit+predict", seconds_fit=prof_svr.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_svr.peak_gb, hardware=hw_info())
# Optional linear baseline (fast)
# print("[Accidents/SVR-Linear] Linear SVR baseline…")
# svr_lin = Pipeline([("pre", clone(pre_acc)),
#                     ("reg", SVR(C=1.0, kernel="linear"))])
# with Profiler("Acc_SVRLinear_fit") as prof_svrL:
#     svr_lin.fit(X_acc_tr, y_acc_tr)
# t0 = time.perf_counter(); yhat_svrL = svr_lin.predict(X_acc_te); pred_s = time.perf_counter()-t0
# m_svrL = summarize_regression(y_acc_te, yhat_svrL)
# pd.DataFrame([m_svrL]).to_csv(os.path.join(OUT_ACC, "svr_linear_metrics.csv"), index=False)
# plot_parity(y_acc_te.values, yhat_svrL, "Accidents: Linear SVR Parity", os.path.join(OUT_ACC, "svr_linear_parity.png"))
# plot_residuals(y_acc_te.values, yhat_svrL, "Accidents: Linear SVR Residuals", os.path.join(OUT_ACC, "svr_linear_residuals.png"))
# save_profile(os.path.join(OUT_ACC, "svr_linear_profile.json"),
#              tag="Acc_SVRLinear_fit+predict", seconds_fit=prof_svrL.seconds_fit,
#              seconds_predict=pred_s, peak_GB=prof_svrL.peak_gb, hardware=hw_info())

# ==========================================================
# CHUNK 12b — ACCIDENTS: Linear ε-SVR via SGDRegressor [spec]
# ==========================================================

print("[Accidents/SGDR] Linear ε-SVR analogue (SGDRegressor)…")
# Linear model trained by SGD on the ε-insensitive loss with an L2 penalty
# and a constant step size (eta0).
sgdr = Pipeline([
    ("pre", clone(pre_acc)),
    ("reg", SGDRegressor(
        loss="epsilon_insensitive",      # linear ε-SVR analogue
        alpha=1e-4,                      # L2 regularization
        penalty="l2",
        learning_rate="constant",
        eta0=0.01,
        max_iter=20000,
        tol=1e-3,
        random_state=RANDOM_STATE
    ))
])

# Unlike the kNN / kernel-SVR sections, this model is fit on the full training
# block and scored on the full test block.
with Profiler("Acc_SGDR_fit") as prof_sgdr:
    sgdr.fit(X_acc_tr, y_acc_tr)

t0 = time.perf_counter()
yhat_sgdr = sgdr.predict(X_acc_te)
pred_s = time.perf_counter() - t0

m_sgdr = summarize_regression(y_acc_te, yhat_sgdr)
pd.DataFrame([m_sgdr]).to_csv(os.path.join(OUT_ACC, "sgdr_metrics.csv"), index=False)
plot_parity(y_acc_te.values, yhat_sgdr, "Accidents: SGDR(ε) Parity", os.path.join(OUT_ACC, "sgdr_parity.png"))
plot_residuals(y_acc_te.values, yhat_sgdr, "Accidents: SGDR(ε) Residuals", os.path.join(OUT_ACC, "sgdr_residuals.png"))

save_profile(os.path.join(OUT_ACC, "sgdr_profile.json"),
             tag="Acc_SGDR_fit+predict", seconds_fit=prof_sgdr.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_sgdr.peak_gb, hardware=hw_info())

# Learning curve for SGDR
# (full training block, shared 3-fold cv_reg splitter, scored by negative MAE)
plot_learning_curve_single(
    sgdr, X_acc_tr, y_acc_tr,
    title="Accidents: SGDR(ε) Learning Curve",
    outpath=os.path.join(OUT_ACC, "sgdr_learning_curve.png"),
    cv=cv_reg,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

# Model-complexity curve (vary alpha)
print("[Accidents/SGDR] Model complexity sweep (alpha)…")
def sgdr_maker(alpha):
    """Return a fresh, unfitted preprocessing + SGDRegressor pipeline.

    Every setting except ``alpha`` mirrors the standalone SGDR model fitted
    above, so the sweep isolates the effect of the regularization strength.

    Args:
        alpha: Regularization strength; coerced with ``float()``.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (clone of ``pre_acc``) and
        ``"reg"`` (epsilon-insensitive ``SGDRegressor``).
    """
    preprocessor = clone(pre_acc)
    regressor = SGDRegressor(
        loss="epsilon_insensitive",
        alpha=float(alpha),
        penalty="l2",
        learning_rate="constant",
        eta0=0.01,
        max_iter=20000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", preprocessor), ("reg", regressor)]
    return Pipeline(steps)

# Sweep three alpha values on the full training split using the cv_reg
# splitter; results go to mc_sgdr.csv / mc_sgdr.png under OUT_ACC.
# NOTE(review): this sweep is scored with r2_score, whereas the DTR / kNN /
# RBF-SVR / MLP / LinearSVR sweeps in this file pass a negated-MAE scorer —
# confirm the difference is intentional before comparing the curves.
model_complexity_curve(
    pipe_maker=sgdr_maker,
    param_name="alpha", param_grid=[1e-5, 1e-4, 1e-3],
    X=X_acc_tr, y=y_acc_tr, cv=cv_reg, scorer=r2_score,
    out_csv=os.path.join(OUT_ACC, "mc_sgdr.csv"),
    title="Accidents: SGDR(ε) Model-Complexity (alpha)",
    out_png=os.path.join(OUT_ACC, "mc_sgdr.png")
)

[Accidents/SVR] Subsample train to 50,000 for RBF SVR…
[Profiler] START: Acc_SVR_fit
[Profiler] END: Acc_SVR_fit | sec=85.72 | peakGB=0.753
[Plot] Parity: Accidents: SVR Parity (≤25k eval)
[Plot] Residuals: Accidents: SVR Residuals (≤25k eval)
[Accidents/SVR] Learning curve (on subsample)…
[Plot] Learning curve: Accidents: SVR(RBF) Learning Curve (subsample)
[Accidents/SGDR] Linear ε-SVR analogue (SGDRegressor)…
[Profiler] START: Acc_SGDR_fit
[Profiler] END: Acc_SGDR_fit | sec=2.97 | peakGB=1.309
[Plot] Parity: Accidents: SGDR(ε) Parity
[Plot] Residuals: Accidents: SGDR(ε) Residuals
[Plot] Learning curve: Accidents: SGDR(ε) Learning Curve




[Accidents/SGDR] Model complexity sweep (alpha)…


In [11]:
# =======================================================
# CHUNK 13 — ACCIDENTS: MLPRegressor (SGD-only) [R2]
# =======================================================

def train_mlpr_sgd_epoch_curve(X_tr, y_tr, X_te, y_te, out_csv, out_png, patience=3):
    """Train an SGD-only MLPRegressor one epoch at a time and log the MAE curve.

    The pipeline (clone of ``pre_acc`` -> ``to_dense32`` -> ``MLPRegressor``)
    uses ``max_iter=1`` with ``warm_start=True``, so each ``fit`` call adds one
    epoch on top of the previous weights. After every epoch the MAE on
    ``(X_tr, y_tr)`` and on ``(X_te, y_te)`` is recorded. Training runs for at
    most 15 epochs and stops earlier once the held-out MAE has not improved by
    more than 1e-6 for ``patience`` consecutive epochs.

    Side effects: writes the per-epoch table to ``out_csv``, the epoch plot to
    ``out_png`` and ``mlpreg_param_budget.json`` into ``OUT_ACC``.

    Args:
        X_tr, y_tr: Training features / targets.
        X_te, y_te: Held-out features / targets used for the "val" curve and
            for the early-stopping decision.
        out_csv: Destination of the per-epoch MAE table.
        out_png: Destination of the epoch-curve figure.
        patience: Number of consecutive non-improving epochs tolerated.

    Returns:
        The fitted pipeline as it stands after the LAST epoch that was run.

    NOTE(review): the best epoch is only marked on the plot; weights are not
    rolled back to it, so the returned model may be worse than the best one.
    NOTE(review): whatever is passed as ``X_te``/``y_te`` drives early
    stopping — if callers pass the final test split, test metrics computed
    from the returned model are optimistically biased; confirm with callers.
    """
    print("[Accidents/MLPReg] Epoch-by-epoch (SGD-only, ≤15 epochs, early stopping)…")

    # Single source of truth for the architecture, so the parameter-budget
    # audit at the end always describes the network that was actually trained.
    hidden_layers = (256, 256, 128, 128)
    max_epochs = 15

    reg = Pipeline([
        ("pre", clone(pre_acc)),
        ("dense32", FunctionTransformer(to_dense32, accept_sparse=True)),
        ("reg", MLPRegressor(
            hidden_layer_sizes=hidden_layers,
            activation="tanh",
            solver="sgd",
            learning_rate_init=0.001,
            learning_rate="constant",
            alpha=1e-3,
            momentum=0.0,
            batch_size=2048,
            max_iter=1,       # one epoch per .fit()
            warm_start=True,  # keep weights between successive .fit() calls
            random_state=RANDOM_STATE,
        )),
    ])

    rows = []
    best = {"epoch": 0, "val_MAE": np.inf}
    no_improve = 0
    for epoch in range(1, max_epochs + 1):
        # Pipeline.fit re-fits the preprocessing steps as well; only the MLP
        # step carries state over thanks to warm_start.
        reg.fit(X_tr, y_tr)
        ytr = reg.predict(X_tr)
        yte = reg.predict(X_te)
        mae_tr = mean_absolute_error(y_tr, ytr)
        mae_te = mean_absolute_error(y_te, yte)
        rows.append({"epoch": epoch, "train_MAE": float(mae_tr), "val_MAE": float(mae_te)})

        # The 1e-6 margin keeps floating-point noise from counting as progress.
        if mae_te + 1e-6 < best["val_MAE"]:
            best.update({"epoch": epoch, "val_MAE": float(mae_te)})
            no_improve = 0
            continue

        no_improve += 1
        if no_improve >= patience:
            print(f"[Accidents/MLPReg] Early stopping at epoch {epoch} (best={best['epoch']})")
            break

    df = pd.DataFrame(rows)
    df.to_csv(out_csv, index=False)

    plt.figure()
    plt.plot(df["epoch"], df["train_MAE"], marker="o", label="Train MAE")
    plt.plot(df["epoch"], df["val_MAE"], marker="s", label="Val MAE")
    plt.axvline(best["epoch"], linestyle="--", label=f"best@{best['epoch']}")
    plt.title("Accidents: MLPReg (SGD) Epoch Curve")
    plt.xlabel("Epoch")
    plt.ylabel("MAE")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_png, bbox_inches="tight")
    plt.close()

    # param budget audit
    n_in = _estimate_input_dim(pre_acc, X_tr)
    n_params = _estimate_mlp_params(n_in, hidden_layers, 1)
    save_json_safe(os.path.join(OUT_ACC, "mlpreg_param_budget.json"),
                   {"estimated_input_dim": n_in, "param_count": n_params})
    return reg

# Train the MLP regressor epoch by epoch. The Profiler wraps the whole helper
# call, so seconds_fit also includes the per-epoch predict() calls made inside it.
with Profiler("Acc_MLPReg_fit") as prof_mlpr:
    mlpr = train_mlpr_sgd_epoch_curve(X_acc_tr, y_acc_tr, X_acc_te, y_acc_te,
                                      os.path.join(OUT_ACC,"mlp_epochs.csv"),
                                      os.path.join(OUT_ACC,"mlp_epochs.png"))
# Time one predict() pass over the test split.
t0 = time.perf_counter(); yhat_mlpr = mlpr.predict(X_acc_te); pred_s = time.perf_counter()-t0
# Re-run the fitted "pre" and "dense32" steps purely to report the transformed
# feature shapes; Xt_tr / Xt_te are not used for anything else in this cell.
Xt_tr = mlpr.named_steps["dense32"].transform( mlpr.named_steps["pre"].transform(X_acc_tr) )
Xt_te = mlpr.named_steps["dense32"].transform( mlpr.named_steps["pre"].transform(X_acc_te) )
print("Train feats:", Xt_tr.shape, " Test feats:", Xt_te.shape)

# Test-set metrics and diagnostic plots for the model returned by the epoch loop.
m_mlpr = summarize_regression(y_acc_te, yhat_mlpr)
pd.DataFrame([m_mlpr]).to_csv(os.path.join(OUT_ACC, "mlp_metrics.csv"), index=False)
plot_parity(y_acc_te.values, yhat_mlpr, "Accidents: MLPReg Parity", os.path.join(OUT_ACC, "mlp_parity.png"))
plot_residuals(y_acc_te.values, yhat_mlpr, "Accidents: MLPReg Residuals", os.path.join(OUT_ACC, "mlp_residuals.png"))
print("[Accidents/MLPReg] Learning curve (on subsample)…")

# Separate, unfitted pipeline for the learning curve: same architecture and
# optimizer settings as the epoch-loop model, but up to 50 iterations per fit
# and no warm_start.
# NOTE(review): batch_size is not set here (it is 2048 in the epoch-loop
# model), so the library default applies — confirm this is intended.
mlpr_lc = Pipeline([
    ("pre", clone(pre_acc)),
    ("dense32", FunctionTransformer(to_dense32, accept_sparse=True)),
    ("reg", MLPRegressor(
        hidden_layer_sizes=(256, 256, 128, 128),
        activation="tanh",
        solver="sgd",
        learning_rate_init=0.001,
        learning_rate="constant",
        alpha=1e-3,
        momentum=0.0,
        max_iter=50,
        random_state=RANDOM_STATE
    ))
])



# Seeded random subsample (at most 30,000 training rows, without replacement)
# to keep the learning curve affordable.
mlp_lc_cap = min(30000, len(X_acc_tr))
mlp_lc_idx = np.random.RandomState(RANDOM_STATE).choice(len(X_acc_tr), size=mlp_lc_cap, replace=False)
X_mlp_lc, y_mlp_lc = X_acc_tr.iloc[mlp_lc_idx], y_acc_tr.iloc[mlp_lc_idx]

# Two shuffled folds only, again to bound runtime.
cv_mlp_lc = KFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

# Four training-set sizes from 20% to 100% of each fold's training portion.
plot_learning_curve_single(
    mlpr_lc, X_mlp_lc, y_mlp_lc,
    title="Accidents: MLPReg Learning Curve (subsample)",
    outpath=os.path.join(OUT_ACC, "mlp_learning_curve.png"),
    cv=cv_mlp_lc,
    train_sizes=np.linspace(0.2, 1.0, 4),
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

# Persist timing / memory for the epoch-loop fit and the single predict pass
# (the learning-curve run above is not part of these numbers).
save_profile(os.path.join(OUT_ACC, "mlp_profile.json"),
             tag="Acc_MLPReg_fit+predict", seconds_fit=prof_mlpr.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_mlpr.peak_gb, hardware=hw_info())


[Profiler] START: Acc_MLPReg_fit
[Accidents/MLPReg] Epoch-by-epoch (SGD-only, ≤15 epochs, early stopping)…
[Accidents/MLPReg] Early stopping at epoch 4 (best=1)
[Profiler] END: Acc_MLPReg_fit | sec=27.65 | peakGB=2.336
Train feats: (700000, 71)  Test feats: (150000, 71)
[Plot] Parity: Accidents: MLPReg Parity
[Plot] Residuals: Accidents: MLPReg Residuals
[Accidents/MLPReg] Learning curve (on subsample)…
[Plot] Learning curve: Accidents: MLPReg Learning Curve (subsample)


In [12]:
# =====================================================
# CHUNK 14 — Summaries & runtime tables for the report
# =====================================================

# Per-model metrics live in "<model>_metrics.csv". The summary label is the
# FULL "<model>" stem upper-cased; taking only the text before the first "_"
# would label both "svr_metrics.csv" and "svr_linear_metrics.csv" as "SVR".
_METRICS_SUFFIX = "_metrics.csv"


def _read_metrics_tables(root, filenames):
    """Load the metrics CSVs that exist under *root*, tagged with a model label.

    Files that are missing (e.g. a model cell that was skipped) are ignored.
    Returns a list of DataFrames, each with a leading "model" column.
    """
    tables = []
    for fname in filenames:
        path = os.path.join(root, fname)
        if not os.path.exists(path):
            continue
        if fname.endswith(_METRICS_SUFFIX):
            stem = fname[:-len(_METRICS_SUFFIX)]
        else:
            stem = os.path.splitext(fname)[0]
        table = pd.read_csv(path)
        table.insert(0, "model", stem.upper())
        tables.append(table)
    return tables


# HOTEL summary table (sorted by ROC-AUC, best first)
print("[Hotel] Building metrics summary…")
hotel_rows = _read_metrics_tables(
    OUT_HOTEL,
    ["dt_metrics.csv", "knn_metrics.csv", "linsvm_metrics.csv",
     "rbfsvm_metrics.csv", "mlp_metrics.csv"],
)
if hotel_rows:
    hotel_summary = pd.concat(hotel_rows, ignore_index=True).sort_values("roc_auc", ascending=False)
    hotel_summary.to_csv(os.path.join(OUT_HOTEL, "metrics_summary.csv"), index=False)
    print(hotel_summary)

# ACC summary table (sorted by RMSE ascending, best first)
print("[Accidents] Building metrics summary…")
acc_rows = _read_metrics_tables(
    OUT_ACC,
    ["dtr_metrics.csv", "knn_metrics.csv", "svr_metrics.csv",
     "mlp_metrics.csv", "svr_linear_metrics.csv", "sgdr_metrics.csv",
     # written by the LinearSVR cell; skipped automatically when absent
     "linearsvr_metrics.csv"],
)
if acc_rows:
    acc_summary = pd.concat(acc_rows, ignore_index=True).sort_values("RMSE", ascending=True)
    acc_summary.to_csv(os.path.join(OUT_ACC, "metrics_summary.csv"), index=False)
    print(acc_summary)

# Aggregate runtime tables (fit+predict+hardware)
def build_runtime_table(root, out_csv):
    """Collect every ``*_profile.json`` under *root* into one CSV.

    Each JSON file is expected to hold a single object; it becomes one row,
    with an extra ``file`` column recording the source filename. Nested
    values are stored as-is in their cell.

    Args:
        root: Directory to scan (non-recursive).
        out_csv: Path of the CSV to write.

    Returns:
        None. Nothing is written when *root* contains no profile files.
    """
    rows = []
    # os.listdir order is arbitrary; sorting keeps the table reproducible.
    for fname in sorted(os.listdir(root)):
        if not fname.endswith("_profile.json"):
            continue
        # Context manager so the handle is closed even if parsing fails.
        with open(os.path.join(root, fname), encoding="utf-8") as fh:
            record = json.load(fh)
        record["file"] = fname
        rows.append(record)
    if rows:
        pd.DataFrame(rows).to_csv(out_csv, index=False)

# Aggregate the per-model "*_profile.json" files of each dataset into a
# runtime_table.csv stored next to them.
print("[All] Building runtime tables…")
build_runtime_table(OUT_HOTEL, os.path.join(OUT_HOTEL, "runtime_table.csv"))
build_runtime_table(OUT_ACC,   os.path.join(OUT_ACC,   "runtime_table.csv"))

# Final marker for the main pipeline cells.
print("✅ Pipeline finished. See outputs/ for figures, metrics, and logs.")


[Hotel] Building metrics summary…
    model  accuracy  precision    recall        f1   roc_auc    pr_auc
3  RBFSVM  0.844659   0.860945  0.692493  0.767586  0.909404  0.880933
0      DT  0.812999   0.826476  0.626771  0.712902  0.889125  0.850334
2  LINSVM  0.808532   0.800713  0.643202  0.713366  0.886355  0.843167
1     KNN  0.819030   0.787884  0.699879  0.741279  0.883948  0.855353
4     MLP  0.806913   0.825277  0.607326  0.699722  0.863167  0.831103
[Accidents] Building metrics summary…
  model        MAE      MedAE       RMSE        R2
4  SGDR  17.890766  14.396326  32.938952 -0.074464
3   MLP  20.384327  17.570444  33.042102 -0.081204
2   SVR  16.523454  12.159758  33.573486 -0.045412
0   DTR  21.588574  17.669019  34.244566 -0.161330
1   KNN  20.180107  15.571829  35.162493 -0.146711
[All] Building runtime tables…
✅ Pipeline finished. See outputs/ for figures, metrics, and logs.


In [None]:
# Prep cell: run this once before the patch cells
import os, numpy as np, matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.model_selection import cross_val_score, KFold

# Simple helper for model-complexity plots (MAE for regression)
def run_mc(pipe_maker, grid, X, y, cv, xlabel, title, out_png, scoring="neg_mean_absolute_error"):
    """Cross-validate one estimator per grid value and save an error-bar plot.

    Args:
        pipe_maker: Callable mapping a grid value to an unfitted estimator.
        grid: Iterable of hyper-parameter values (plotted on the x axis).
        X, y: Data passed to ``cross_val_score``.
        cv: Cross-validation splitter or fold count.
        xlabel: X-axis label.
        title: Figure title.
        out_png: Output image path; its parent directory is created if needed.
        scoring: scikit-learn scoring name. ``neg_*`` scores are sign-flipped
            before plotting so the curve shows the underlying error.

    Raises:
        ValueError: If *grid* is empty.
    """
    grid = list(grid)
    if not grid:
        raise ValueError("run_mc: grid must contain at least one value")

    means, stds = [], []
    for val in grid:
        scores = cross_val_score(pipe_maker(val), X, y, cv=cv, scoring=scoring, n_jobs=-1)
        means.append(scores.mean())
        stds.append(scores.std())

    negated = scoring.startswith("neg_")
    if scoring == "neg_mean_absolute_error":
        ylabel = "MAE"
    elif negated:
        # Label with the real metric instead of calling every neg_* score "MAE".
        ylabel = scoring[len("neg_"):].upper()
    else:
        ylabel = scoring.upper()

    plt.figure(figsize=(6.2, 4.6))
    # Error bars are the standard deviation of the per-fold scores.
    plt.errorbar(grid, [-m for m in means] if negated else means, yerr=stds, marker='o')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(alpha=.3)

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    out_dir = os.path.dirname(out_png)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_png, dpi=160)
    plt.close()


In [None]:
# Model-Complexity (max_depth) for Accidents DTR
from sklearn.metrics import mean_absolute_error as _mae


def neg_mae(yt, yp):
    """Return the negated mean absolute error of predictions *yp* against *yt*.

    Negating turns the error into a "higher is better" score, the convention
    also used by scikit-learn's ``neg_mean_absolute_error`` scoring string.
    Defined with ``def`` rather than a lambda so it has a real name in
    tracebacks.
    """
    return -_mae(yt, yp)

def dtr_maker(max_depth):
    """Return a fresh, unfitted preprocessing + DecisionTreeRegressor pipeline.

    Args:
        max_depth: Maximum tree depth; coerced with ``int()``.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (clone of ``pre_acc``) and
        ``"reg"`` (tree with fixed leaf/split minimums and a fixed seed).
    """
    preprocessor = clone(pre_acc)
    tree = DecisionTreeRegressor(
        max_depth=int(max_depth),
        min_samples_leaf=100,
        min_samples_split=200,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", preprocessor), ("reg", tree)]
    return Pipeline(steps)

# Sweep tree depth on the full Accidents training split, scored with negated
# MAE; results go to mc_dtr.csv / mc_dtr.png under OUT_ACC.
model_complexity_curve(
    pipe_maker=dtr_maker,
    param_name="max_depth",
    param_grid=[6, 10, 14, 18],
    X=X_acc_tr, y=y_acc_tr, cv=cv_reg, scorer=neg_mae,
    out_csv=os.path.join(OUT_ACC, "mc_dtr.csv"),
    title="Accidents: DTR Model-Complexity (max_depth)",
    out_png=os.path.join(OUT_ACC, "mc_dtr.png")
)

# Model-Complexity (k) for Accidents kNN on the 250k subset
def knr_maker(k):
    """Return a fresh, unfitted preprocessing + KNeighborsRegressor pipeline.

    Args:
        k: Number of neighbours; coerced with ``int()``.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (clone of ``pre_acc``) and
        ``"reg"`` (brute-force Euclidean kNN using all available cores).
    """
    preprocessor = clone(pre_acc)
    knn = KNeighborsRegressor(
        n_neighbors=int(k),
        algorithm="brute",
        metric="euclidean",
        n_jobs=-1,
    )
    steps = [("pre", preprocessor), ("reg", knn)]
    return Pipeline(steps)

# Sweep k on the kNN subsample (Xa_knn / ya_knn, built elsewhere in the file),
# scored with negated MAE; results go to mc_knn.csv / mc_knn.png under OUT_ACC.
model_complexity_curve(
    pipe_maker=knr_maker,
    param_name="n_neighbors",
    param_grid=[3, 5, 11, 21],
    X=Xa_knn, y=ya_knn, cv=cv_reg, scorer=neg_mae,
    out_csv=os.path.join(OUT_ACC, "mc_knn.csv"),
    title="Accidents: kNN Model-Complexity (k)",
    out_png=os.path.join(OUT_ACC, "mc_knn.png")
)

# Model-Complexity (C) for Accidents RBF-SVR on the ≤100k subset
def rbf_svr_maker(C):
    """Return a fresh, unfitted preprocessing + RBF-kernel SVR pipeline.

    Args:
        C: SVR regularization parameter; coerced with ``float()``.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (clone of ``pre_acc``) and
        ``"reg"`` (``SVR`` with ``kernel="rbf"``, other settings at defaults).
    """
    preprocessor = clone(pre_acc)
    svr = SVR(kernel="rbf", C=float(C))
    steps = [("pre", preprocessor), ("reg", svr)]
    return Pipeline(steps)

# Sweep C on the SVR subsample (Xa_svr / ya_svr, built elsewhere in the file),
# scored with negated MAE; results go to mc_svr_rbf.csv / mc_svr_rbf.png.
# NOTE(review): the subsample size is decided where Xa_svr is created — verify
# it against the size quoted in the section comment above rbf_svr_maker.
model_complexity_curve(
    pipe_maker=rbf_svr_maker,
    param_name="C",
    param_grid=[0.5, 2.0, 8.0],
    X=Xa_svr, y=ya_svr, cv=cv_reg, scorer=neg_mae,
    out_csv=os.path.join(OUT_ACC, "mc_svr_rbf.csv"),
    title="Accidents: RBF SVR Model-Complexity (C)",
    out_png=os.path.join(OUT_ACC, "mc_svr_rbf.png")
)
# NN Model-Complexity (width sweep within 0.2M–1.0M params)
# Keep only the candidate widths whose two-hidden-layer (w, w) network, as
# counted by _estimate_mlp_params for the estimated input dimension and a
# single output, falls inside [PARAM_MIN, PARAM_MAX].
PARAM_MIN, PARAM_MAX = 2e5, 1e6
n_in = _estimate_input_dim(pre_acc, X_acc_tr)
candidates = [128, 256, 512]
# NOTE(review): depending on n_in this filter can leave a single width, or
# none at all, in which case the sweep below has nothing meaningful to plot —
# check len(widths) before relying on the resulting curve.
widths = [w for w in candidates if PARAM_MIN <= _estimate_mlp_params(n_in, (w, w), 1) <= PARAM_MAX]

def mlpr_maker(w):
    """Return a fresh, unfitted two-hidden-layer MLP regression pipeline.

    Args:
        w: Width of both hidden layers; coerced with ``int()``.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (clone of ``pre_acc``),
        ``"dense32"`` (``to_dense32`` wrapped in a ``FunctionTransformer``)
        and ``"reg"`` (SGD-trained ``MLPRegressor``, 15 iterations, no
        built-in early stopping).
    """
    width = int(w)
    preprocessor = clone(pre_acc)
    densify = FunctionTransformer(to_dense32, accept_sparse=True)
    network = MLPRegressor(
        hidden_layer_sizes=(width, width),
        solver="sgd",
        learning_rate="constant",
        learning_rate_init=0.001,
        alpha=1e-3,
        momentum=0.0,
        batch_size=2048,
        max_iter=15,
        early_stopping=False,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", preprocessor), ("dense32", densify), ("reg", network)]
    return Pipeline(steps)

# Use a manageable slice for MC (keeps runtime reasonable):
# a seeded random sample of at most 300,000 training rows, without replacement.
idx_w = np.random.RandomState(RANDOM_STATE).choice(len(X_acc_tr), size=min(300_000, len(X_acc_tr)), replace=False)
# Sweep hidden-layer width on that slice with a 2-fold shuffled CV, scored
# with negated MAE; results go to mc_mlpr_width.csv / mc_mlpr_width.png.
model_complexity_curve(
    pipe_maker=mlpr_maker,
    param_name="width",
    param_grid=widths,
    X=X_acc_tr.iloc[idx_w], y=y_acc_tr.iloc[idx_w],
    cv=KFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE),
    scorer=neg_mae,
    out_csv=os.path.join(OUT_ACC, "mc_mlpr_width.csv"),
    title="Accidents: NN Model-Complexity (width)",
    out_png=os.path.join(OUT_ACC, "mc_mlpr_width.png")
)

# Linear SVR (literal linear SVM regressor) — LC + MC
# Fits LinearSVR on the full Accidents training split, times fit and predict,
# writes metrics and the resource profile, then draws a learning curve.
from sklearn.svm import LinearSVR

print("[Accidents/LinearSVR] Fitting LinearSVR on full train…")
lsvr = Pipeline([("pre", clone(pre_acc)),
                 ("reg", LinearSVR(C=1.0, epsilon=0.1, max_iter=20000, random_state=RANDOM_STATE))])

# Only the fit is profiled; prediction is timed separately below.
with Profiler("Acc_LinearSVR_fit") as prof_lsvr:
    lsvr.fit(X_acc_tr, y_acc_tr)

t0 = time.perf_counter(); yhat_lsvr = lsvr.predict(X_acc_te); pred_s = time.perf_counter()-t0
m_lsvr = summarize_regression(y_acc_te, yhat_lsvr)
# The summary cell only picks up files it lists by name, so this filename has
# to appear in its Accidents list for LinearSVR to show up in metrics_summary.csv.
pd.DataFrame([m_lsvr]).to_csv(os.path.join(OUT_ACC, "linearsvr_metrics.csv"), index=False)
# The "*_profile.json" name is what build_runtime_table() globs for.
save_profile(os.path.join(OUT_ACC, "linearsvr_profile.json"),
             tag="Acc_LinearSVR_fit+predict", seconds_fit=prof_lsvr.seconds_fit,
             seconds_predict=pred_s, peak_GB=prof_lsvr.peak_gb, hardware=hw_info())

# Learning curve (same CV splitter and MAE scoring as the SGDR learning curve)
plot_learning_curve_single(
    lsvr, X_acc_tr, y_acc_tr,
    title="Accidents: Linear SVR Learning Curve",
    outpath=os.path.join(OUT_ACC, "linearsvr_learning_curve.png"),
    cv=cv_reg, scoring="neg_mean_absolute_error", n_jobs=-1
)

# Model-Complexity (C)
def lsvr_maker(C):
    """Return a fresh, unfitted preprocessing + LinearSVR pipeline.

    Args:
        C: LinearSVR regularization parameter; coerced with ``float()``.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (clone of ``pre_acc``) and
        ``"reg"`` (``LinearSVR`` with epsilon 0.1, 20000 max iterations and
        a fixed seed).
    """
    preprocessor = clone(pre_acc)
    linear_svr = LinearSVR(
        C=float(C),
        epsilon=0.1,
        max_iter=20000,
        random_state=RANDOM_STATE,
    )
    steps = [("pre", preprocessor), ("reg", linear_svr)]
    return Pipeline(steps)

# Sweep C on the full Accidents training split, scored with negated MAE;
# results go to mc_linearsvr.csv / mc_linearsvr.png under OUT_ACC.
model_complexity_curve(
    pipe_maker=lsvr_maker,
    param_name="C", param_grid=[0.1, 1.0, 10.0],
    X=X_acc_tr, y=y_acc_tr, cv=cv_reg, scorer=neg_mae,
    out_csv=os.path.join(OUT_ACC, "mc_linearsvr.csv"),
    title="Accidents: Linear SVR Model-Complexity (C)",
    out_png=os.path.join(OUT_ACC, "mc_linearsvr.png")
)
