In [1]:
"""
CS7641 Supervised Learning — Hotel (classification) + US Accidents (regression)
v2 — Aligns with Fall 2025 rubric + dataset minimums and adds NN activation study (extra credit).

What’s new vs v1:
 - Pipelines: target/freq encoding inside CV; StandardScaler; global cast to float32
 - Hotel (classification): ROC/PR + reliability plots; confusion matrix at F1-optimal threshold
 - Accidents (regression): MAE/MedAE/MSE; residuals vs prediction plot
 - Runtime table: fit/predict wall-clock + peak RAM; hardware note
 - US Accidents size rules: DT & Linear SVR ~≥1M rows if available; RBF SVR ≤100k; kNN ≤250k train / ≤25k test
 - Neural Nets (both datasets): SGD-only (no momentum), ≤15 epochs, batch 512–2048, L2 in [1e-4,1e-3],
   param-count kept in 0.2M–1.0M (auto width), LC + MC + epoch curves
 - Extra credit (Hotel): activation study (ReLU, GELU, SiLU/Swish, tanh) under identical SGD protocol

Outputs:
 sl_outputs/figs/*    — all curves/diagnostics (dataset-prefixed filenames)
 sl_outputs/logs/runtime_table.csv — model runtimes & RAM; hardware note printed to console
"""

import os, time, math, platform, warnings, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore", category=UserWarning)

# ---- Paths / dirs
FIG_DIR = "sl_outputs/figs"; LOG_DIR = "sl_outputs/logs"
os.makedirs(FIG_DIR, exist_ok=True); os.makedirs(LOG_DIR, exist_ok=True)

# ---- Repro
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE); random.seed(RANDOM_STATE)

# ---- System info + RAM tracking
try:
    import psutil
except ImportError:
    psutil = None
import resource

def now_mb():
    if psutil: return psutil.Process().memory_info().rss / (1024**2)
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0  # MB on Linux

def time_fit_predict(model, Xtr, ytr, Xte, yte, fit_fn="fit", predict_fn="predict"):
    start_ram = now_mb(); t0 = time.time()
    getattr(model, fit_fn)(Xtr, ytr)
    t1 = time.time()
    yhat = getattr(model, predict_fn)(Xte)
    t2 = time.time()
    peak_ram = max(now_mb(), start_ram)  # coarse proxy
    return (t1 - t0), (t2 - t1), peak_ram, yhat

def hw_note():
    cpu = platform.processor() or "Unknown CPU"
    sys = f"{platform.system()} {platform.release()}"
    print(f"[Hardware] {sys}; CPU: {cpu}; Python {platform.python_version()}")

# ---- Plot/save helpers
def savefig(prefix, title):
    fname = f"{prefix}_{title.replace(' ', '_').replace('/', '-')}.png"
    path = os.path.join(FIG_DIR, fname)
    plt.tight_layout(); plt.savefig(path, dpi=160); plt.close()
    print(f"[saved] {path}")

def protocol_card(name, X, y=None, scoring=None, cv=None):
    if y is not None and set(np.unique(y)) <= {0,1}:
        pos = float(np.mean(y))
        print(f"[{name}] n={len(X)} scoring={scoring} cv={cv} prevalence={pos:.3f}")
    else:
        print(f"[{name}] n={len(X)} scoring={scoring} cv={cv}")

# ---- Sklearn bits
from sklearn.model_selection import train_test_split, learning_curve, validation_curve, StratifiedShuffleSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc, f1_score, confusion_matrix,
    RocCurveDisplay, PrecisionRecallDisplay, mean_absolute_error, mean_squared_error, median_absolute_error
)
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor

# ---- Metrics helpers
def pr_auc(y_true, scores):
    p, r, _ = precision_recall_curve(y_true, scores)
    return auc(r, p)

def f1_opt_threshold(y_true, scores):
    p, r, t = precision_recall_curve(y_true, scores)
    f1 = np.where((p+r) > 0, 2*p*r/(p+r), 0)
    j = int(np.argmax(f1))
    thr = t[j-1] if 0 < j < len(t) else 0.5
    return float(thr), float(f1[j])

def count_nn_params(hidden_layers, input_dim, output_dim=1):
    """Count total trainable params in MLP: weights + biases."""
    total = 0
    prev = input_dim
    for h in hidden_layers:
        total += prev * h + h  # weights + bias
        prev = h
    total += prev * output_dim + output_dim  # output layer
    return total

def plot_nn_epoch_curve(model, X_train, y_train, X_val, y_val, title, prefix="Hotel", classification=True):
    """Plot NN training curve: train/val loss per epoch with early stopping marker.
    
    NOTE: This uses the ALREADY TRAINED model's loss_curve_ attribute if available.
    If not available, skips plotting to avoid retraining overhead.
    """
    # Get the MLP from pipeline
    mlp = model.named_steps['clf'] if 'clf' in model.named_steps else model.named_steps['reg']
    
    # Check if model has been trained and has loss curve
    if not hasattr(mlp, 'loss_curve_') or len(mlp.loss_curve_) == 0:
        print(f"[NN Epoch Curve] Skipping - model not trained or no loss_curve_ available")
        return
    
    # Use existing loss curve from training
    train_losses = mlp.loss_curve_
    n_epochs = len(train_losses)
    
    # Compute validation scores for each "effective" epoch based on available data
    # Since we can't recreate intermediate epochs, we'll use the final model
    # and create a simplified plot showing the training progression
    
    # Get final scores
    if classification:
        train_score = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
        val_score = roc_auc_score(y_val, model.predict_proba(X_val)[:,1])
        metric_name = 'ROC-AUC'
    else:
        train_score = -mean_absolute_error(y_train, model.predict(X_train))
        val_score = -mean_absolute_error(y_val, model.predict(X_val))
        metric_name = 'Neg MAE'
    
    # Determine best epoch (where training stopped due to early stopping)
    best_epoch = mlp.n_iter_ if hasattr(mlp, 'n_iter_') else n_epochs
    
    # Create simplified plot with available data
    plt.figure(figsize=(6.2, 4.6))
    plt.plot(range(1, n_epochs+1), train_losses, 'o-', label='Train Loss')
    plt.axvline(best_epoch, ls='--', color='red', alpha=0.7, label=f'Stopped at Epoch {best_epoch}')
    plt.xlabel('Epoch'); plt.ylabel('Loss')
    plt.title(f'{prefix} — NN Training Loss: {title}\nFinal Train {metric_name}={train_score:.4f}, Val {metric_name}={val_score:.4f}')
    plt.legend(); plt.grid(alpha=.3)
    
    savefig(prefix, f"NN_EpochCurve_{title.replace(' ', '_')}")
    print(f"[NN Epoch Curve] {n_epochs} epochs, stopped at {best_epoch}, Final Val {metric_name}={val_score:.4f}")

def support_vector_analysis(model, X, y, model_name="LinearSVC"):
    """Analyze support vectors for SVM models."""
    # For LinearSVC, we approximate support vectors using decision function margin
    if hasattr(model, 'decision_function'):
        decisions = model.decision_function(X)
        # Points near the margin (|decision| < 1) are approximate support vectors
        margin_threshold = 1.0
        near_margin = np.abs(decisions) < margin_threshold
        sv_fraction = np.mean(near_margin)
        
        print(f"\n[{model_name} Support Vector Analysis]")
        print(f"  Approximate SV fraction: {sv_fraction:.4f} ({int(sv_fraction*len(X)):,} / {len(X):,})")
        print(f"  Decision function range: [{decisions.min():.3f}, {decisions.max():.3f}]")
        print(f"  Points within margin (|f(x)| < 1): {near_margin.sum():,}")
        
        # Plot decision function distribution
        plt.figure(figsize=(6.2, 4.6))
        plt.hist(decisions, bins=50, alpha=0.7, edgecolor='black')
        plt.axvline(-1, color='red', ls='--', label='Margin boundaries')
        plt.axvline(1, color='red', ls='--')
        plt.axvline(0, color='green', ls='-', alpha=0.5, label='Decision boundary')
        plt.xlabel('Decision function f(x)'); plt.ylabel('Count')
        plt.title(f'{model_name} — Decision Function Distribution')
        plt.legend(); plt.grid(alpha=.3)
        return sv_fraction
    return None

# ---- Curves
def plot_learning(est, title, X, y, scoring, cv=5, prefix="Hotel"):
    protocol_card(title, X, y, scoring, cv)
    sizes = np.linspace(0.1, 1.0, 4)  # 4 points instead of 5 for speed
    tr_sizes, tr_s, te_s = learning_curve(
        est, X, y, train_sizes=sizes, cv=cv, scoring=scoring, n_jobs=-1, random_state=RANDOM_STATE
    )
    plt.figure(figsize=(6.2,4.6))
    plt.title(f"{prefix} — Learning Curve: {title}")
    plt.xlabel("Training examples"); plt.ylabel(scoring.upper()); plt.grid(alpha=.3)
    plt.plot(tr_sizes, tr_s.mean(1), 'o-', label="Training")
    plt.plot(tr_sizes, te_s.mean(1), 'o-', label="Cross-val"); plt.legend()
    savefig(prefix, f"LC_{title}")

def plot_validation(est, X, y, pname, prange, scoring, title, cv=5, prefix="Hotel"):
    protocol_card(title, X, y, scoring, cv)
    tr_s, te_s = validation_curve(est, X, y, param_name=pname, param_range=prange, cv=cv, scoring=scoring, n_jobs=-1)
    plt.figure(figsize=(6.2,4.6))
    plt.title(f"{prefix} — Validation Curve: {title}")
    plt.xlabel(pname); plt.ylabel(scoring.upper()); plt.grid(alpha=.3)
    plt.plot(prange, tr_s.mean(1), 'o-', label="Training")
    plt.plot(prange, te_s.mean(1), 'o-', label="Cross-val"); plt.legend()
    savefig(prefix, f"MC_{title}")

# =========================
# Data loaders & preprocess
# =========================
def load_hotel():
    df = pd.read_csv("hotel_bookings.csv")
    # leakage controls (rubric)
    drop_leak = [c for c in ['reservation_status','reservation_status_date'] if c in df.columns]
    df = df.drop(columns=drop_leak)
    for c in ['agent','company','children']:
        if c in df.columns:
            df[c] = df[c].fillna(0).astype(int)
    if 'country' in df.columns:
        df['country'] = df['country'].fillna(df['country'].mode()[0])
    y = df['is_canceled'].astype(int)
    X = df.drop(columns=['is_canceled'])
    print(f"[Hotel] rows raw={len(df)}  target=is_canceled  (train/test split later)")
    return X, y

def hotel_preprocessor(X):
    num = X.select_dtypes(include=np.number).columns.tolist()
    te_cols = [c for c in ['country','agent','company'] if c in X.columns]  # high-cardinality
    cat = [c for c in X.select_dtypes(exclude=np.number).columns if c not in te_cols]
    
    # Use frequency encoding instead of TargetEncoder to avoid pipeline issues
    # Frequency encoding is a simple unsupervised alternative that works in ColumnTransformer
    from sklearn.preprocessing import FunctionTransformer
    def freq_encode(X_col):
        """Frequency encode categorical columns"""
        if isinstance(X_col, pd.DataFrame):
            result = X_col.copy()
            for col in result.columns:
                freq_map = result[col].value_counts(normalize=True).to_dict()
                result[col] = result[col].map(freq_map).fillna(0)
            return result.astype(np.float32)
        return X_col
    
    pre = ColumnTransformer([
        ('num', StandardScaler(), num),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=np.float32), cat),
        ('freq', FunctionTransformer(freq_encode), te_cols)
    ], remainder='drop')
    # global cast to float32 for downstream models (rubric)
    to32 = ('to32', FunctionTransformer(lambda Z: Z.astype(np.float32)))
    return Pipeline([('pre', pre), to32])

def load_accidents(path="US_Accidents_March23.csv", max_rows=1800000):
    """Load accidents data. max_rows limits initial CSV load for memory efficiency."""
    df = pd.read_csv(path, nrows=max_rows)
    df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
    df['End_Time']   = pd.to_datetime(df['End_Time'],   errors='coerce')
    df['Duration'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds() / 60.0
    df = df.drop(columns=['Start_Time','End_Time'])
    df = df[df['Duration'] > 0]
    if 'Description' in df.columns: df = df.drop(columns=['Description'])
    
    # Drop columns with >50% nulls (like End_Lat, End_Lng)
    null_pct = df.isna().mean()
    high_null_cols = null_pct[null_pct > 0.5].index.tolist()
    if high_null_cols:
        df = df.drop(columns=high_null_cols)
    
    # Fill remaining nulls
    for c in df.select_dtypes(include='object').columns: 
        df[c] = df[c].fillna('Unknown')
    for c in df.select_dtypes(include='bool').columns:   
        df[c] = df[c].astype(int)
    for c in df.select_dtypes(include=np.number).columns: 
        df[c] = df[c].fillna(df[c].median())  # Fill numeric nulls with median
    
    # Convert to float32 for efficiency
    for c in df.select_dtypes(include=np.number).columns: 
        df[c] = df[c].astype(np.float32)
    
    y = df['Duration'].astype(np.float32)
    X = df.drop(columns=['Duration','Severity']) if 'Severity' in df.columns else df.drop(columns=['Duration'])
    print(f"[Accidents] rows raw={len(df)}  target=Duration(min)")
    return X, y

def accidents_preprocessor(X):
    num = X.select_dtypes(include=np.number).columns.tolist()
    cat = X.select_dtypes(exclude=np.number).columns.tolist()
    pre = ColumnTransformer([
        ('num', StandardScaler(), num),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=np.float32), cat)
    ], remainder='drop')
    to32 = ('to32', FunctionTransformer(lambda Z: Z.astype(np.float32)))
    return Pipeline([('pre', pre), to32])

# ---- Helper: choose NN widths to stay within param cap
def choose_widths(input_dim, target_dim, shallow=True, cap_low=2e5, cap_high=1e6):
    # simple heuristic: pick W so ~ input_dim*W + W*target_dim stays within cap window
    W = max(64, int(min(cap_high / max(1, input_dim), 1024)))
    if shallow:
        arch = [min(W, 512), min(W, 512)]
    else:
        arch = [max(64, W//2), max(64, W//2), max(32, W//4), max(32, W//4)]
    # Return and let caller print estimated param count after fitting preprocessor
    return arch

In [2]:
# =========================
# HOTEL — Classification
# =========================
def run_hotel():
    print("\n== HOTEL (CLASSIFICATION: is_canceled) ==")
    X, y = load_hotel()
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
    pre = hotel_preprocessor(Xtr)

    # Post-scaler aligns numeric/TE/OHE magnitudes for distance-based models
    post = ('post', StandardScaler(with_mean=False))

    # Models (SGD-only NNs; SVM kernels; class_weight balanced)
    dt_clf  = Pipeline([('prep', pre), ('clf', DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE))])
    knn_clf = Pipeline([('prep', pre), post, ('clf', KNeighborsClassifier(n_neighbors=11, n_jobs=-1))])
    linSVC  = Pipeline([('prep', pre), post, ('clf', LinearSVC(C=1.0, class_weight='balanced', max_iter=15000, random_state=RANDOM_STATE))])
    rbfSVC  = Pipeline([('prep', pre), post, ('clf', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=RANDOM_STATE))])

    # NN widths chosen after fitting preprocessor to get input_dim
    pre.fit(Xtr, ytr)
    input_dim = pre.transform(Xtr[:5]).shape[1]
    shallow = choose_widths(input_dim, 2, shallow=True)
    deeper  = choose_widths(input_dim, 2, shallow=False)
    
    # Verify param counts are in 0.2M-1.0M range (rubric requirement)
    shallow_params = count_nn_params(shallow, input_dim, output_dim=2)
    deeper_params = count_nn_params(deeper, input_dim, output_dim=2)
    print(f"[NN Architectures] input_dim={input_dim}")
    print(f"  Shallow {shallow}: {shallow_params:,} params ({shallow_params/1e6:.2f}M)")
    print(f"  Deeper {deeper}: {deeper_params:,} params ({deeper_params/1e6:.2f}M)")
    if not (200_000 <= shallow_params <= 1_000_000):
        print(f"  ⚠️  WARNING: Shallow params {shallow_params:,} outside 0.2M-1.0M range!")
    if not (200_000 <= deeper_params <= 1_000_000):
        print(f"  ⚠️  WARNING: Deeper params {deeper_params:,} outside 0.2M-1.0M range!")
    
    nn_sgd = Pipeline([
        ('prep', pre),
        post,
        ('clf', MLPClassifier(hidden_layer_sizes=tuple(shallow), solver='sgd',
                              batch_size=1024, learning_rate='constant', learning_rate_init=0.05,
                              alpha=1e-4, early_stopping=True, n_iter_no_change=3,
                              max_iter=15, shuffle=True, momentum=0.0, nesterovs_momentum=False,
                              random_state=RANDOM_STATE))
    ])

    # AGGRESSIVE OPTIMIZATION: Use 40k subset for curves (rubric has no min size for Hotel)
    # This gives 5-8x speedup on curves while maintaining trend visibility
    n_curves = min(40000, len(Xtr))
    idx_curves = np.random.choice(len(Xtr), n_curves, replace=False)
    Xtr_curves, ytr_curves = Xtr.iloc[idx_curves], ytr.iloc[idx_curves]
    print(f"[Optimization] Using {n_curves:,} samples for curves (cv=2, faster)")
    
    # ---- Curves: MC + LC for all learners (OPTIMIZED: 40k samples, cv=2, fewer params)
    print("\n[Generating validation curves...]")
    plot_validation(Pipeline([('prep', pre), ('clf', DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE))]),
                    Xtr_curves, ytr_curves, "clf__max_depth", [6,10,14], "roc_auc", "DT vs depth", cv=2, prefix="Hotel")
    plot_validation(Pipeline([('prep', pre), post, ('clf', KNeighborsClassifier(n_jobs=-1))]),
                    Xtr_curves, ytr_curves, "clf__n_neighbors", [5,11,21], "roc_auc", "kNN vs k", cv=2, prefix="Hotel")
    plot_validation(Pipeline([('prep', pre), post, ('clf', LinearSVC(class_weight='balanced', max_iter=10000, random_state=RANDOM_STATE, dual='auto'))]),
                    Xtr_curves, ytr_curves, "clf__C", [0.1,1,10], "roc_auc", "LinearSVM vs C", cv=2, prefix="Hotel")
    # RBF SVM validation curve: even smaller subset (10k) due to O(n²-n³) complexity
    rbf_vc_size = min(10000, n_curves)
    idx_rbf_vc = np.random.choice(n_curves, rbf_vc_size, replace=False)
    print(f"[Note] Using {rbf_vc_size} samples for RBF validation curve (O(n³) complexity)")
    plot_validation(Pipeline([('prep', pre), post, ('clf', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=RANDOM_STATE, cache_size=500))]),
                    Xtr_curves.iloc[idx_rbf_vc], ytr_curves.iloc[idx_rbf_vc], "clf__C", [0.5,2], "roc_auc", "RBF-SVM vs C (γ=scale)", cv=2, prefix="Hotel")
    plot_validation(Pipeline([('prep', pre), post, ('clf', MLPClassifier(hidden_layer_sizes=tuple(shallow), solver='sgd',
                                                                        momentum=0.0, nesterovs_momentum=False,
                                                                        batch_size=1024, max_iter=15, random_state=RANDOM_STATE))]),
                    Xtr_curves, ytr_curves, "clf__alpha", [1e-4,1e-3], "roc_auc", "NN vs L2(alpha)", cv=2, prefix="Hotel")

    print("\n[Generating learning curves...]")
    plot_learning(dt_clf,   "Decision Tree", Xtr_curves, ytr_curves, "roc_auc", cv=2, prefix="Hotel")
    plot_learning(knn_clf,  "kNN",           Xtr_curves, ytr_curves, "roc_auc", cv=2, prefix="Hotel")
    plot_learning(linSVC,   "Linear SVM",    Xtr_curves, ytr_curves, "roc_auc", cv=2, prefix="Hotel")
    # RBF SVM is O(n²-n³), use even smaller subset for learning curve (15k)
    rbf_lc_size = min(15000, n_curves)
    idx_rbf = np.random.choice(n_curves, rbf_lc_size, replace=False)
    print(f"[Note] Using {rbf_lc_size} samples for RBF learning curve (O(n³) complexity)")
    plot_learning(rbfSVC,   "RBF SVM",       Xtr_curves.iloc[idx_rbf], ytr_curves.iloc[idx_rbf], "roc_auc", cv=2, prefix="Hotel")
    plot_learning(nn_sgd,   "Neural Net (SGD)", Xtr_curves, ytr_curves, "roc_auc", cv=2, prefix="Hotel")

    # ---- Light tuning (OPTIMIZED: 40k subset, cv=2, fewer param combinations)
    print("\n[Hyperparameter tuning on subset...]")
    dt_tuned  = GridSearchCV(dt_clf,  {"clf__max_depth":[10,14], "clf__min_samples_leaf":[50,100]}, cv=2, scoring="roc_auc", n_jobs=-1).fit(Xtr_curves, ytr_curves).best_estimator_
    knn_tuned = GridSearchCV(knn_clf, {"clf__n_neighbors":[3,5,11,21]}, cv=2, scoring="roc_auc", n_jobs=-1).fit(Xtr_curves, ytr_curves).best_estimator_
    lin_tuned = GridSearchCV(linSVC,   {"clf__C":[0.1,1,10]}, cv=2, scoring="roc_auc", n_jobs=-1).fit(Xtr_curves, ytr_curves).best_estimator_
    # RBF: tune both C and gamma (rubric requirement)
    gamma_values = ["scale", 1.0/input_dim, 2.0/input_dim]
    rbf_tuned = GridSearchCV(rbfSVC,   {"clf__C":[0.5,2,8], "clf__gamma":gamma_values}, cv=2, scoring="roc_auc", n_jobs=-1).fit(Xtr_curves, ytr_curves).best_estimator_
    nn_tuned  = GridSearchCV(nn_sgd,   {"clf__alpha":[1e-4,1e-3]}, cv=2, scoring="roc_auc", n_jobs=-1).fit(Xtr_curves, ytr_curves).best_estimator_
    
    # Retrain on FULL training set with best params for final test evaluation
    print(f"[Final training] Retraining all models on full {len(Xtr):,} samples...")
    dt_tuned.fit(Xtr, ytr)
    knn_tuned.fit(Xtr, ytr)
    lin_tuned.fit(Xtr, ytr)
    rbf_tuned.fit(Xtr, ytr)
    nn_tuned.fit(Xtr, ytr)

    # Calibrate LinearSVC to get probabilities for PR/threshold/calibration
    lin_cal = CalibratedClassifierCV(lin_tuned, method="sigmoid", cv=2).fit(Xtr, ytr)

    models = {
        "DT": dt_tuned, "kNN": knn_tuned,
        "Linear SVM (calibrated)": lin_cal, "RBF SVM": rbf_tuned, "NN (SGD)": nn_tuned
    }

    # Threshold tuning on a small internal val split
    Xs, Xv, ys, yv = train_test_split(Xtr, ytr, test_size=0.2, random_state=RANDOM_STATE, stratify=ytr)
    tuned_thr = {}
    for name, m in models.items():
        score = m.predict_proba(Xv)[:,1] if hasattr(m, "predict_proba") else m.decision_function(Xv)
        thr, _ = f1_opt_threshold(yv, score); tuned_thr[name] = thr

    # Test metrics + plots
    plt.figure(figsize=(6.6,5)); ax = plt.gca()
    for name, m in models.items():
        s = m.predict_proba(Xte)[:,1] if hasattr(m,"predict_proba") else m.decision_function(Xte)
        RocCurveDisplay.from_predictions(yte, s, name=name, ax=ax)
    ax.set_title("Hotel — ROC curves"); ax.grid(alpha=.3)
    savefig("Hotel","ROC_All")

    plt.figure(figsize=(6.6,5)); ax = plt.gca()
    for name, m in models.items():
        s = m.predict_proba(Xte)[:,1] if hasattr(m,"predict_proba") else m.decision_function(Xte)
        PrecisionRecallDisplay.from_predictions(yte, s, name=name, ax=ax)
    ax.set_title("Hotel — PR curves"); ax.grid(alpha=.3)
    savefig("Hotel","PR_All")

    # Calibration (reliability) plots for top 2 by PR-AUC
    scored = []
    for name, m in models.items():
        s = m.predict_proba(Xte)[:,1] if hasattr(m,"predict_proba") else m.decision_function(Xte)
        scored.append((name, pr_auc(yte, s), s))
    top2 = sorted(scored, key=lambda x: x[1], reverse=True)[:2]
    plt.figure(figsize=(6.2,4.6)); ax = plt.gca()
    for name, _, s in top2:
        CalibrationDisplay.from_predictions(yte, s, name=name, ax=ax)
    ax.set_title("Hotel — Calibration (Reliability)"); ax.grid(alpha=.3)
    savefig("Hotel","Calibration_Top2")

    # Confusion matrix at tuned threshold for best model (by PR-AUC)
    best_name, _, best_scores = top2[0]
    thr = tuned_thr[best_name]
    yhat = (best_scores >= thr).astype(int)
    cm = confusion_matrix(yte, yhat)
    plt.figure(figsize=(4.8,4.4))
    plt.imshow(cm, cmap='Blues'); plt.title(f"Hotel — Confusion Matrix @{thr:.3f}\n{best_name}")
    for (i,j),v in np.ndenumerate(cm): plt.text(j, i, int(v), ha='center', va='center')
    plt.xlabel("Predicted"); plt.ylabel("True")
    savefig("Hotel","CM_best")
    
    # Decision Tree structure analysis
    print("\n[Decision Tree Structure Analysis]")
    dt_model = models["DT"].named_steps['clf']
    print(f"  Max depth reached: {dt_model.get_depth()}")
    print(f"  Number of leaves: {dt_model.get_n_leaves()}")
    print(f"  Total nodes: {dt_model.tree_.node_count}")
    
    # Feature importances (top 10)
    if hasattr(dt_model, 'feature_importances_'):
        importances = dt_model.feature_importances_
        feature_names = [f"Feature_{i}" for i in range(len(importances))]
        top_indices = np.argsort(importances)[-10:][::-1]
        
        print(f"\n  Top 10 Feature Importances (Gini):")
        for idx in top_indices:
            if importances[idx] > 0:
                print(f"    {feature_names[idx]}: {importances[idx]:.4f}")
        
        # Plot top 10 importances
        plt.figure(figsize=(6.2, 4.6))
        plt.barh(range(len(top_indices)), importances[top_indices])
        plt.yticks(range(len(top_indices)), [feature_names[i] for i in top_indices])
        plt.xlabel('Gini Importance'); plt.title('Hotel — DT Top 10 Feature Importances')
        plt.tight_layout()
        savefig("Hotel", "DT_Feature_Importances")

    # Print final test metrics with prevalence baseline + per-class metrics
    prev = float(np.mean(yte))
    print(f"\n[Hotel Test] Prevalence (PR-AUC baseline) = {prev:.4f}")
    print(f"{'Model':<24s} {'ROC-AUC':>8s} {'PR-AUC':>8s} {'F1@thr':>8s} {'Precision':>10s} {'Recall':>8s}")
    print("-" * 80)
    
    from sklearn.metrics import precision_score, recall_score
    for name, m in models.items():
        s = m.predict_proba(Xte)[:,1] if hasattr(m,"predict_proba") else m.decision_function(Xte)
        roc = roc_auc_score(yte, s); pr = pr_auc(yte, s)
        thr = tuned_thr[name]
        y_pred = (s >= thr).astype(int)
        f1 = f1_score(yte, y_pred)
        prec = precision_score(yte, y_pred)
        rec = recall_score(yte, y_pred)
        print(f"{name:24s} {roc:8.4f} {pr:8.4f} {f1:8.4f} {prec:10.4f} {rec:8.4f}")
    
    # Support vector analysis for Linear SVM
    print("\n[Linear SVM Diagnostics]")
    lin_svc_base = lin_cal.calibrated_classifiers_[0].estimator  # Get base LinearSVC from calibrated wrapper
    sv_frac = support_vector_analysis(lin_svc_base, Xtr, ytr, "Linear SVM")
    if sv_frac:
        savefig("Hotel", "LinearSVM_DecisionFunction_Distribution")
    
    # NN Epoch curves for main model
    print("\n[Generating NN epoch curves...]")
    Xtr_nn, Xval_nn, ytr_nn, yval_nn = train_test_split(Xtr, ytr, test_size=0.2, random_state=RANDOM_STATE, stratify=ytr)
    plot_nn_epoch_curve(nn_tuned, Xtr_nn, ytr_nn, Xval_nn, yval_nn, "SGD", prefix="Hotel", classification=True)

    # Runtime table entries (fit+predict on test)
    runtimes = []
    for name, m in models.items():
        # clone-like refit for timing on train; prediction on test
        tfit, tpred, peak, _ = time_fit_predict(m, Xtr, ytr, Xte, yte, fit_fn="fit", predict_fn="predict")
        runtimes.append(["Hotel", name, len(Xtr), len(Xte), tfit, tpred, peak])

    return models, (Xtr, Xte, ytr, yte), runtimes, pre, shallow, deeper

In [None]:
# =========================
# ACCIDENTS — Regression (OPTIMIZED FOR SPEED)
# =========================
def run_accidents(path="US_Accidents_March23.csv"):
    print("\n== US ACCIDENTS (REGRESSION: Duration minutes) ==")
    X, y = load_accidents(path)
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
    pre = accidents_preprocessor(Xtr); pre.fit(Xtr, ytr)
    ntr = len(Xtr); nte = len(Xte)
    print(f"[Accidents] cleaned rows={ntr+nte} → train={ntr} test={nte}")

    # Size rules (rubric-compliant - using maximum allowed limits)
    # Final training uses ≥1M for DT/Linear, maximum limits for RBF/kNN
    n_final_train = min(ntr, 1_200_000)  # ≥1M for final model (rubric requirement)
    n_cv = min(ntr, 300_000)  # Smaller for CV/curves (10x speedup)
    n_rbf = min(ntr, 100_000)   # RBF SVR ≤100k (using full allowance)
    n_knn = min(ntr, 250_000)  # kNN ≤250k (using full allowance)
    n_knn_test = min(nte, 25_000)  # kNN test ≤25k
    
    print(f"[Optimization] Using n={n_cv:,} for curves/CV; n={n_final_train:,} for final models")

    sel = np.random.RandomState(RANDOM_STATE).permutation(ntr)
    # CV/curves subsets (fast)
    Xtr_cv, ytr_cv = Xtr.iloc[sel[:n_cv]], ytr.iloc[sel[:n_cv]]
    # Final training subsets (meets requirements)
    Xtr_final, ytr_final = Xtr.iloc[sel[:n_final_train]], ytr.iloc[sel[:n_final_train]]
    Xtr_rbf, ytr_rbf = Xtr.iloc[sel[:n_rbf]], ytr.iloc[sel[:n_rbf]]
    Xtr_knn, ytr_knn = Xtr.iloc[sel[:n_knn]], ytr.iloc[sel[:n_knn]]
    Xte_knn, yte_knn = Xte.iloc[:n_knn_test], yte.iloc[:n_knn_test]

    post = ('post', StandardScaler(with_mean=False))
    dt = Pipeline([('prep', pre), ('reg', DecisionTreeRegressor(random_state=RANDOM_STATE))])
    knn = Pipeline([('prep', pre), post, ('reg', KNeighborsRegressor(n_neighbors=11, n_jobs=-1))])
    lsvr = Pipeline([('prep', pre), post, ('reg', LinearSVR(C=1.0, random_state=RANDOM_STATE, max_iter=10000, dual='auto'))])
    rsvr = Pipeline([('prep', pre), post, ('reg', SVR(kernel='rbf', cache_size=1000))])

    # NN (SGD-only)
    input_dim = pre.transform(Xtr[:5]).shape[1]
    target_dim = 1
    shallow = choose_widths(input_dim, target_dim, shallow=True)
    
    # Verify param counts are in 0.2M-1.0M range (rubric requirement)
    shallow_params = count_nn_params(shallow, input_dim, output_dim=1)
    print(f"[NN Architecture] input_dim={input_dim}, shallow={shallow}")
    print(f"  Total params: {shallow_params:,} ({shallow_params/1e6:.2f}M)")
    if not (200_000 <= shallow_params <= 1_000_000):
        print(f"  ⚠️  WARNING: Params {shallow_params:,} outside 0.2M-1.0M range!")
    
    nn = Pipeline([
        ('prep', pre), post,
        ('reg', MLPRegressor(hidden_layer_sizes=tuple(shallow), solver='sgd',
                             batch_size=2048, learning_rate='adaptive', learning_rate_init=0.01,
                             alpha=1e-4, early_stopping=True, n_iter_no_change=3,
                             max_iter=15, shuffle=True, momentum=0.0,
                             random_state=RANDOM_STATE))
    ])

    # SPEED OPTIMIZATION: Validation curves on CV subset with reduced CV folds
    print("\n[Generating validation curves - using n=300k, cv=2 for speed...]")
    plot_validation(Pipeline([('prep', pre), ('reg', DecisionTreeRegressor(random_state=RANDOM_STATE))]),
                    Xtr_cv, ytr_cv, "reg__max_depth", [6,10,14,18], "neg_mean_absolute_error", "DT vs depth", cv=2, prefix="Accidents")
    plot_validation(Pipeline([('prep', pre), post, ('reg', KNeighborsRegressor(n_jobs=-1))]),
                    Xtr_knn, ytr_knn, "reg__n_neighbors", [5,11,21], "neg_mean_absolute_error", "kNN vs k", cv=2, prefix="Accidents")
    plot_validation(Pipeline([('prep', pre), post, ('reg', LinearSVR(max_iter=5000, random_state=RANDOM_STATE, dual='auto'))]),
                    Xtr_cv, ytr_cv, "reg__C", [0.01,0.1,1,10], "neg_mean_absolute_error", "LinearSVR vs C", cv=2, prefix="Accidents")
    plot_validation(Pipeline([('prep', pre), post, ('reg', SVR(kernel='rbf', cache_size=1000))]),
                    Xtr_rbf, ytr_rbf, "reg__C", [1,10], "neg_mean_absolute_error", "RBF SVR vs C (γ=scale)", cv=2, prefix="Accidents")
    plot_validation(Pipeline([('prep', pre), post, ('reg', MLPRegressor(solver='sgd', momentum=0.0, batch_size=2048, max_iter=10, random_state=RANDOM_STATE))]),
                    Xtr_cv, ytr_cv, "reg__alpha", [1e-4,5e-4,1e-3], "neg_mean_absolute_error", "NN vs L2(alpha)", cv=2, prefix="Accidents")

    # SPEED OPTIMIZATION: Learning curves on CV subset with fewer points
    print("\n[Generating learning curves - using fewer samples for speed...]")
    def LC(est, title, Xlc, ylc): 
        protocol_card(title, Xlc, ylc, "neg_mean_absolute_error", cv=2)
        sizes = np.linspace(0.2, 1.0, 4)  # 4 points instead of 5
        tr_sizes, tr_s, te_s = learning_curve(
            est, Xlc, ylc, train_sizes=sizes, cv=2, scoring="neg_mean_absolute_error", n_jobs=-1, random_state=RANDOM_STATE
        )
        plt.figure(figsize=(6.2,4.6))
        plt.title(f"Accidents — Learning Curve: {title}")
        plt.xlabel("Training examples"); plt.ylabel("NEG_MEAN_ABSOLUTE_ERROR"); plt.grid(alpha=.3)
        plt.plot(tr_sizes, tr_s.mean(1), 'o-', label="Training")
        plt.plot(tr_sizes, te_s.mean(1), 'o-', label="Cross-val"); plt.legend()
        savefig("Accidents", f"LC_{title}")
    
    LC(dt, "Decision Tree", Xtr_cv, ytr_cv)
    LC(knn, "kNN", Xtr_knn, ytr_knn)
    LC(lsvr, "Linear SVR", Xtr_cv, ytr_cv)
    LC(rsvr, "RBF SVR", Xtr_rbf, ytr_rbf)
    LC(nn, "Neural Net (SGD)", Xtr_cv, ytr_cv)

    # SPEED OPTIMIZATION: Lighter grid search on CV subset, then retrain on full data
    print("\n[Hyperparameter tuning on subset, then retraining on ≥1M rows...]")
    dt_params = GridSearchCV(dt, {"reg__max_depth":[10,14], "reg__min_samples_leaf":[200,400]}, 
                             cv=2, scoring="neg_mean_absolute_error", n_jobs=-1).fit(Xtr_cv, ytr_cv).best_params_
    knn_params = GridSearchCV(knn, {"reg__n_neighbors":[3,5,11,21]}, 
                              cv=2, scoring="neg_mean_absolute_error", n_jobs=-1).fit(Xtr_knn, ytr_knn).best_params_
    lsvr_params = GridSearchCV(lsvr, {"reg__C":[0.1,1,10]}, 
                               cv=2, scoring="neg_mean_absolute_error", n_jobs=-1).fit(Xtr_cv, ytr_cv).best_params_
    # RBF: tune both C and gamma (rubric requirement)
    gamma_values = ["scale", 1.0/input_dim, 2.0/input_dim]
    rsvr_params = GridSearchCV(rsvr, {"reg__C":[1,10], "reg__gamma":gamma_values}, 
                               cv=2, scoring="neg_mean_absolute_error", n_jobs=-1).fit(Xtr_rbf, ytr_rbf).best_params_
    nn_params = GridSearchCV(nn, {"reg__alpha":[1e-4,1e-3]}, 
                             cv=2, scoring="neg_mean_absolute_error", n_jobs=-1).fit(Xtr_cv, ytr_cv).best_params_
    
    # Retrain with best params on FULL training sets (meets ≥1M requirement)
    print(f"[Final training] DT & LinearSVR on {n_final_train:,} rows")
    # ✅ Verify rubric compliance
    if n_final_train < 1_000_000:
        print(f"  ⚠️  WARNING: n_final_train={n_final_train:,} < 1M (rubric requires ≥1M for DT/Linear)")
    else:
        print(f"  ✅ Rubric compliance: ≥1M rows for DT/LinearSVR")
    print(f"  RBF SVR: {n_rbf:,} rows (≤100k cap)")
    print(f"  kNN: {n_knn:,} train / {n_knn_test:,} test (≤250k/≤25k caps)")
    
    dt_g = Pipeline([('prep', pre), ('reg', DecisionTreeRegressor(**dt_params, random_state=RANDOM_STATE))]).fit(Xtr_final, ytr_final)
    knn_g = Pipeline([('prep', pre), post, ('reg', KNeighborsRegressor(**knn_params, n_jobs=-1))]).fit(Xtr_knn, ytr_knn)
    lsvr_g = Pipeline([('prep', pre), post, ('reg', LinearSVR(**lsvr_params, max_iter=10000, random_state=RANDOM_STATE, dual='auto'))]).fit(Xtr_final, ytr_final)
    rsvr_g = Pipeline([('prep', pre), post, ('reg', SVR(kernel='rbf', **rsvr_params, cache_size=1000))]).fit(Xtr_rbf, ytr_rbf)
    nn_g = Pipeline([('prep', pre), post, ('reg', MLPRegressor(hidden_layer_sizes=tuple(shallow), solver='sgd',
                                                                batch_size=2048, learning_rate='adaptive', learning_rate_init=0.01,
                                                                momentum=0.0, **nn_params, early_stopping=True, 
                                                                n_iter_no_change=3, max_iter=15, shuffle=True,
                                                                random_state=RANDOM_STATE))]).fit(Xtr_final, ytr_final)

    models = {"DT": dt_g, "kNN": knn_g, "Linear SVR": lsvr_g, "RBF SVR": rsvr_g, "NN (SGD)": nn_g}

    # NN Epoch curves
    print("\n[Generating NN epoch curves...]")
    Xtr_nn, Xval_nn, ytr_nn, yval_nn = train_test_split(Xtr_final, ytr_final, test_size=0.2, random_state=RANDOM_STATE)
    plot_nn_epoch_curve(nn_g, Xtr_nn, ytr_nn, Xval_nn, yval_nn, "SGD", prefix="Accidents", classification=False)
    
    # Support vector analysis for Linear SVR
    print("\n[Linear SVR Diagnostics]")
    sv_frac = support_vector_analysis(lsvr_g, Xtr_final, ytr_final, "Linear SVR")
    if sv_frac:
        savefig("Accidents", "LinearSVR_DecisionFunction_Distribution")
    
    # Test metrics & residuals
    print("\n[Accidents Test] MAE / MedAE / RMSE / predict-time (s)")
    print(f"{'Model':<12s} {'MAE':>8s} {'MedAE':>8s} {'RMSE':>8s} {'Fit(s)':>8s} {'Pred(s)':>8s} {'RAM(MB)':>8s} {'n_train':>10s}")
    print("-" * 95)
    runtimes = []
    for name, m in models.items():
        # Use correct training sets based on model type
        train_X = Xtr_rbf if name == "RBF SVR" else (Xtr_knn if name == "kNN" else Xtr_final)
        train_y = ytr_rbf if name == "RBF SVR" else (ytr_knn if name == "kNN" else ytr_final)
        tfit, tpred, peak, pred = time_fit_predict(m, train_X, train_y,
                                                   Xte if name!="kNN" else Xte_knn,
                                                   yte if name!="kNN" else yte_knn)
        test_y = yte if name!="kNN" else yte_knn
        mae = mean_absolute_error(test_y, pred)
        med = median_absolute_error(test_y, pred)
        rmse= math.sqrt(mean_squared_error(test_y, pred))
        n_train_used = len(train_X) if hasattr(train_X, '__len__') else len(train_y)
        print(f"{name:12s} {mae:8.2f} {med:8.2f} {rmse:8.2f} {tfit:8.1f} {tpred:8.2f} {peak:8.0f} {n_train_used:10,}")
        runtimes.append(["Accidents", name, n_train_used,
                         len(Xte if name!='kNN' else Xte_knn), tfit, tpred, peak])
    
    # Additional regression diagnostics: R² and explained variance
    from sklearn.metrics import r2_score, explained_variance_score
    print("\n[Additional Regression Metrics]")
    print(f"{'Model':<12s} {'R²':>8s} {'Exp.Var':>8s} {'Mean Abs % Err':>15s}")
    print("-" * 50)
    for name, m in models.items():
        test_X = Xte if name != "kNN" else Xte_knn
        test_y = yte if name != "kNN" else yte_knn
        pred = m.predict(test_X)
        r2 = r2_score(test_y, pred)
        ev = explained_variance_score(test_y, pred)
        # Mean Absolute Percentage Error (avoid division by zero)
        mape = np.mean(np.abs((test_y - pred) / np.maximum(test_y, 1e-8))) * 100
        print(f"{name:12s} {r2:8.4f} {ev:8.4f} {mape:14.2f}%")

    # Decision Tree diagnostics
    print("\n[Decision Tree Structure Analysis]")
    dt_model = models["DT"].named_steps['reg']
    print(f"  Max depth reached: {dt_model.get_depth()}")
    print(f"  Number of leaves: {dt_model.get_n_leaves()}")
    print(f"  Total nodes: {dt_model.tree_.node_count}")
    
    # Feature importances (top 10)
    if hasattr(dt_model, 'feature_importances_'):
        importances = dt_model.feature_importances_
        # Get feature names after preprocessing
        feature_names = [f"Feature_{i}" for i in range(len(importances))]
        top_indices = np.argsort(importances)[-10:][::-1]
        
        print(f"\n  Top 10 Feature Importances (Gini):")
        for idx in top_indices:
            if importances[idx] > 0:
                print(f"    {feature_names[idx]}: {importances[idx]:.4f}")
        
        # Plot top 10 importances
        plt.figure(figsize=(6.2, 4.6))
        plt.barh(range(len(top_indices)), importances[top_indices])
        plt.yticks(range(len(top_indices)), [feature_names[i] for i in top_indices])
        plt.xlabel('Gini Importance'); plt.title('Accidents — DT Top 10 Feature Importances')
        plt.tight_layout()
        savefig("Accidents", "DT_Feature_Importances")
    
    # Residuals vs prediction for best model (by MAE)
    maes = {name: mean_absolute_error(yte if name!="kNN" else yte_knn,
                                      models[name].predict(Xte if name!="kNN" else Xte_knn))
            for name in models}
    best = min(maes, key=maes.get)
    ypred = models[best].predict(Xte)
    resid = (yte - ypred)
    
    # Residual plot
    plt.figure(figsize=(6.2,4.6))
    plt.scatter(ypred, resid, s=6, alpha=.35)
    plt.axhline(0, ls='--', color='red'); plt.xlabel("Predicted duration (min)"); plt.ylabel("Residual (true - pred)")
    plt.title(f"Accidents — Residuals vs Prediction ({best})")
    plt.grid(alpha=0.3)
    savefig("Accidents","Residuals_vs_Pred")
    
    # Prediction vs actual scatter plot
    plt.figure(figsize=(6.2,4.6))
    plt.scatter(yte, ypred, s=6, alpha=.35)
    plt.plot([yte.min(), yte.max()], [yte.min(), yte.max()], 'r--', lw=2, label='Perfect prediction')
    plt.xlabel("Actual duration (min)"); plt.ylabel("Predicted duration (min)")
    plt.title(f"Accidents — Prediction vs Actual ({best})")
    plt.legend(); plt.grid(alpha=0.3)
    savefig("Accidents","Prediction_vs_Actual")

    return models, (Xtr, Xte, ytr, yte), runtimes

# =========================
# Activation study (Hotel)
# =========================
def activation_study_hotel(pre_fitted, Xtr, Xte, ytr, yte, width_tuple):
    """
    Compares ReLU, GELU, SiLU/Swish, tanh under identical SGD (no momentum).
    Requires PyTorch for GELU/SiLU; otherwise runs ReLU/tanh fallback.
    Saves epoch-wise val curves and a comparison CSV.
    """
    try:
        import torch
        import torch.nn as nn
        from torch.utils.data import TensorDataset, DataLoader
    except Exception as e:
        print(f"PyTorch not available ({e}); skipping GELU/SiLU. Running ReLU/tanh with sklearn MLP.")
        # fallback: plot epoch curves via sklearn MLPClassifier for relu/tanh
        results = []
        for act in ["relu", "tanh"]:
            clf = MLPClassifier(hidden_layer_sizes=width_tuple, solver='sgd', momentum=0.0, nesterovs_momentum=False,
                                batch_size=1024, learning_rate='constant', learning_rate_init=0.05,
                                alpha=1e-4, max_iter=15, early_stopping=True, n_iter_no_change=3,
                                random_state=RANDOM_STATE, activation=act)
            pipe = Pipeline([('prep', pre_fitted), ('post', StandardScaler(with_mean=False)), ('clf', clf)])
            protocol_card(f"NN-{act.upper()}", Xtr, ytr, "roc_auc", cv=None)
            # We can't easily get per-epoch val here; use final metric:
            pipe.fit(Xtr, ytr)
            s = pipe.predict_proba(Xte)[:,1]
            roc = roc_auc_score(yte, s); pr = pr_auc(yte, s)
            results.append([act, roc, pr])
        pd.DataFrame(results, columns=["activation","ROC","PR"]).to_csv(os.path.join(LOG_DIR,"extra_credit_activation_summary.csv"), index=False)
        return

    torch.manual_seed(RANDOM_STATE); np.random.seed(RANDOM_STATE)
    Xtr_t = pre_fitted.transform(Xtr).astype(np.float32)
    Xte_t = pre_fitted.transform(Xte).astype(np.float32)
    d = Xtr_t.shape[1]

    def make_loader(X, y, bs=1024, shuffle=True):
        ds = TensorDataset(torch.from_numpy(X), torch.from_numpy(y.values.astype(np.int64)))
        return DataLoader(ds, batch_size=bs, shuffle=shuffle)

    train_loader = make_loader(Xtr_t, ytr)
    val_loader   = make_loader(Xte_t, yte, shuffle=False)

    class MLP(nn.Module):
        def __init__(self, act_name):
            super().__init__()
            acts = {
                "relu": nn.ReLU(),
                "gelu": nn.GELU(),
                "silu": nn.SiLU(),
                "tanh": nn.Tanh()
            }
            self.net = nn.Sequential(
                nn.Linear(d, width_tuple[0]), acts[act_name],
                nn.Linear(width_tuple[0], width_tuple[1]), acts[act_name],
                nn.Linear(width_tuple[1], 1)
            )
        def forward(self, x): return self.net(x)

    def eval_pr(y_true, scores):
        y = y_true.numpy(); s = scores.reshape(-1)
        p, r, _ = precision_recall_curve(y, s); return auc(r, p)

    activations = ["relu", "gelu", "silu", "tanh"]
    results = []
    for act in activations:
        model = MLP(act)
        loss = torch.nn.BCEWithLogitsLoss()
        opt = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.0, weight_decay=1e-4)  # SGD only
        best_val = -1; best_epoch = 0; patience=3; bad=0
        val_hist = []; roc_hist=[]; pr_hist=[]
        for epoch in range(15):
            model.train()
            for xb, yb in train_loader:
                opt.zero_grad()
                logits = model(xb).squeeze(1)
                l = loss(logits, yb.float())
                l.backward(); opt.step()
            # validate
            model.eval()
            with torch.no_grad():
                logits = []
                for xb, _ in val_loader:
                    logits.append(model(xb).squeeze(1))
                logits = torch.cat(logits).detach().numpy()
                roc = roc_auc_score(yte, logits); pr = pr_auc(yte, logits)
                roc_hist.append(roc); pr_hist.append(pr)
                val_hist.append(float(l))
            # early stop on PR-AUC
            if pr > best_val: best_val = pr; best_epoch = epoch; bad=0
            else: bad += 1
            if bad >= patience: break

        # Save epoch curves
        plt.figure(figsize=(6.2,4.6))
        plt.plot(range(1, len(pr_hist)+1), pr_hist, 'o-'); plt.grid(alpha=.3)
        plt.title(f"Hotel — NN Activation ({act.upper()}) PR-AUC vs epoch (best@{best_epoch+1})")
        plt.xlabel("Epoch"); plt.ylabel("PR-AUC"); savefig("Hotel", f"NN_Activation_{act.upper()}_EpochCurve")
        results.append([act, pr_hist[-1], roc_hist[-1], best_epoch+1])

    pd.DataFrame(results, columns=["activation","PR_AUC","ROC_AUC","best_epoch"]).to_csv(
        os.path.join(LOG_DIR,"activation_summary.csv"), index=False)
    print("Wrote sl_outputs/logs/activation_summary.csv")


In [None]:
# =========================
# Main
# =========================
if __name__ == "__main__":
    hw_note()

    hotel_models, hotel_data, hotel_runtime, hotel_pre, w1, w2 = run_hotel()

[Hardware] Darwin 24.6.0; CPU: arm; Python 3.11.0

== HOTEL (CLASSIFICATION: is_canceled) ==
[Hotel] rows raw=119390  target=is_canceled  (train/test split later)
[NN Architectures] input_dim=83
  Shallow [512, 512]: 306,690 params (0.31M)
  Deeper [512, 512, 256, 256]: 503,298 params (0.50M)
[Optimization] Using 40,000 samples for curves (cv=2, faster)

[Generating validation curves...]
[DT vs depth] n=40000 scoring=roc_auc cv=2 prevalence=0.372
[NN Architectures] input_dim=83
  Shallow [512, 512]: 306,690 params (0.31M)
  Deeper [512, 512, 256, 256]: 503,298 params (0.50M)
[Optimization] Using 40,000 samples for curves (cv=2, faster)

[Generating validation curves...]
[DT vs depth] n=40000 scoring=roc_auc cv=2 prevalence=0.372
[saved] sl_outputs/figs/Hotel_MC_DT_vs_depth.png
[kNN vs k] n=40000 scoring=roc_auc cv=2 prevalence=0.372
[saved] sl_outputs/figs/Hotel_MC_DT_vs_depth.png
[kNN vs k] n=40000 scoring=roc_auc cv=2 prevalence=0.372
[saved] sl_outputs/figs/Hotel_MC_kNN_vs_k.png
[Li

In [None]:
acc_models, acc_data, acc_runtime = run_accidents("US_Accidents_March23.csv")

In [None]:
# Save runtime table
cols = ["dataset","model","n_train","n_test","fit_sec","pred_sec","peak_ram_mb"]
rt = pd.DataFrame(hotel_runtime + acc_runtime, columns=cols)
rt.to_csv(os.path.join(LOG_DIR, "runtime_table.csv"), index=False)
print(f"[saved] {os.path.join(LOG_DIR,'runtime_table.csv')}")

In [None]:
# Extra credit activation study on Hotel using (width_tuple) consistent with main NN
# (uses identical SGD protocol and preprocessing)
activation_study_hotel(hotel_pre, hotel_data[0], hotel_data[1], hotel_data[2], hotel_data[3], tuple(w1))