In [None]:
import kagglehub

# Download the latest version of the "sobhanmoosavi/us-accidents" dataset via
# kagglehub and keep the value it returns in `path`.
# NOTE(review): the accidents loader further down reads a hard-coded
# '/kaggle/input/us-accidents/...' file instead of using `path` — confirm the
# two locations agree in the target environment.
path = kagglehub.dataset_download("sobhanmoosavi/us-accidents")

# Echo where the dataset files ended up.
print("Path to dataset files:", path)

In [None]:
"""Supervised/Online Learning — Hotel (classification) + US Accidents (regression)

References:
[1] N. C. António, A. Almeida, and L. Nunes, "Hotel booking demand datasets," Data in Brief, vol. 22, pp. 41-49, 2019.
[2] S. Moosavi, M. H. Samavatian, et al., "A Countrywide Traffic Accident Dataset," in Proc. IEEE/CVF Conf. Comput. Vis. Pattern Recog. Workshops, 2019.
[3] T. Saito and M. Rehmsmeier, "The Precision-Recall Plot Is More Informative than the ROC Plot When Evaluating Binary Classifiers on Imbalanced Datasets," PLOS ONE, vol. 10, no. 3, p. e0118432, 2015.
[4] T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," in Proc. 22nd ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining, 2016, pp. 785-794.
[5] J. H. Friedman, “Greedy Function Approximation: A Gradient Boosting Machine,” Annals of Statistics, vol. 29, no. 5, pp. 1189–1232, 2001.
[6] T. M. Mitchell, "Machine Learning," McGraw-Hill, 1997.
[7] J. R. Quinlan, "Induction of Decision Trees," Machine Learning, 1986.
[8] T. M. Cover and P. E. Hart, "Nearest Neighbor Pattern Classification," IEEE Transactions on Information Theory, vol. 13, no. 1, pp. 21-27, 1967.
[9] C. Cortes and V. Vapnik, "Support-Vector Networks," Machine Learning, vol. 20, no. 3, pp. 273-297, 1995.
[10] D. E. Rumelhart, G. E. Hinton, and R. J. Williams, "Learning representations by back-propagating errors," Nature, vol. 323, no. 6088, pp. 533-536, 1986.
[11] N. C. António, A. Almeida, and L. Nunes, "Hotel booking demand datasets," Data in Brief, vol. 22, pp. 41-49, 2019.
[12] S. Moosavi et al., “US Accidents (since 2016),” [Online]. Available: Kaggle US Accidents.
"""

# Select the non-interactive "Agg" backend for script runs. This has to happen
# before `matplotlib.pyplot` is imported further down.
import matplotlib
try:
    matplotlib.use("Agg")
except Exception:
    # Best effort: if the backend cannot be switched, keep whatever is active.
    pass

# Record in `_IN_IPY` whether an IPython shell is currently active.
# Any failure (IPython missing, probe raising) is treated as "not in IPython".
try:
    from IPython import get_ipython  # noqa
    _IN_IPY = get_ipython() is not None
except Exception:
    _IN_IPY = False

# memory_profiler shim: expose `mp_profile` (decorator) and `_memory_usage`
# (programmatic sampler) whether or not the package can be imported.
# `_HAVE_MEMPROF` records which variant is in effect.
try:
    from memory_profiler import profile as mp_profile, memory_usage as _memory_usage  # noqa
    _HAVE_MEMPROF = True
except Exception:
    _HAVE_MEMPROF = False
    # Fallback decorator: returns the function unchanged (no profiling).
    def mp_profile(func):  # noqa
        return func
    # Fallback sampler: accepts any arguments and reports nothing.
    def _memory_usage(*a, **k):  # noqa
        return None

# tqdm shim: use the real progress bar when importable; otherwise fall back to
# an identity wrapper so `for item in tqdm(iterable, ...)` still works.
try:
    from tqdm import tqdm  # noqa
except Exception:
    # Fallback: ignore all keyword options and hand the iterable back as-is.
    def tqdm(x, **k):  # noqa
        return x

# Display shim: use IPython's display when it can be imported; otherwise fall
# back to a plain-text printer with the same call shape.
try:
    from IPython.display import display as ipy_display  # noqa
except Exception:
    def ipy_display(x):  # noqa
        """Print ``x`` as text.

        DataFrames are limited to their first 20 rows, other objects exposing
        ``to_string`` are rendered through it, and everything else is printed
        directly. Any failure while rendering falls back to ``print(x)``.
        """
        try:
            import pandas as _pd
            if isinstance(x, _pd.DataFrame):
                rendered = x.head(20).to_string()
            elif hasattr(x, "to_string"):
                rendered = x.to_string()
            else:
                rendered = x
            print(rendered)
        except Exception:
            print(x)

# Programmatic memory-profiling runner for scripts
import os, json, time, sys
def run_with_memprofile(func, *args, enable=None, label=None, log_dir="./ol_outputs/logs", **kwargs):
    """Call ``func(*args, **kwargs)`` and optionally record its peak memory.

    Profiling is active when ``enable`` is truthy, or, if ``enable`` is None,
    when the environment variable ENABLE_MEMPROF equals "1". It additionally
    requires memory_profiler to have been imported successfully.

    When active, one JSON line (timestamp, label, peak_mb) is appended to
    ``<log_dir>/memory_profile.jsonl``; a failure to write the log is reported
    on stderr and does not affect the return value.

    Returns:
        ``(retval, peak_mb)`` where ``peak_mb`` is None when profiling is off,
        and -1.0 when the sampler produced no readings.
    """
    if enable is None:
        enabled = os.getenv("ENABLE_MEMPROF", "0") == "1"
    else:
        enabled = enable

    # Checked in this order so the availability flag is only consulted when
    # profiling was actually requested.
    if not (enabled and _HAVE_MEMPROF):
        return func(*args, **kwargs), None

    samples, result = _memory_usage((func, args, kwargs), retval=True, interval=0.1)
    peak_mb = float(max(samples)) if samples else -1.0

    try:
        os.makedirs(log_dir, exist_ok=True)
        log_path = os.path.join(log_dir, "memory_profile.jsonl")
        with open(log_path, "a", encoding="utf-8") as f:
            entry = {
                "ts": time.time(),
                "label": label or getattr(func, "__name__", "func"),
                "peak_mb": peak_mb,
            }
            f.write(json.dumps(entry) + "\n")
    except Exception as _e:
        print("[memprof] could not write log:", _e, file=sys.stderr)

    return result, peak_mb
# ================================================================

# %%
# Install if needed else comment
# [shell-magic removed] !pip install category_encoders
# [shell-magic removed] !pip install memory_profiler
# [notebook-magic removed] %load_ext memory_profiler
# Imports
import pandas as pd
import numpy as np
import time

# Scikit-learn preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
from sklearn.pipeline import Pipeline

# Scikit-learn models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
# Scikit-learn model selection and metrics
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV, learning_curve, validation_curve
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, f1_score, roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix
# Import matplotlib for plotting
import matplotlib.pyplot as plt


# Load the hotel bookings CSV; a missing file yields an empty frame so the
# cleaning step below is skipped.
try:
    df_hotel = pd.read_csv('/hotel_bookings.csv')
except FileNotFoundError:
    print("Error: 'hotel_bookings.csv' not found.")
    df_hotel = pd.DataFrame()
else:
    print("Dataset loaded successfully!")

if not df_hotel.empty:
    # These columns are dropped as leakage columns before modelling.
    leakage_cols = ['reservation_status', 'reservation_status_date']
    df_hotel_cleaned = df_hotel.drop(columns=leakage_cols).copy()

    # Missing ids / child counts become 0; missing countries get the most
    # frequent country.
    df_hotel_cleaned = df_hotel_cleaned.fillna({
        'agent': 0,
        'company': 0,
        'country': df_hotel_cleaned['country'].mode()[0],
        'children': 0,
    })

    # With the gaps filled, these three columns can be stored as integers.
    df_hotel_cleaned = df_hotel_cleaned.astype({'children': int, 'agent': int, 'company': int})
    print("Data cleaning complete.")

# Helper functions to evaluate the model
def evaluate_model(y_true, y_scores, threshold=0.5):
    """Print ROC-AUC, PR-AUC, F1 and a classification report for binary scores.

    Args:
        y_true: true binary labels.
        y_scores: array of continuous scores for the positive class.
        threshold: cut-off used to turn scores into class predictions. It is
            replaced by 0.0 whenever any score is negative (presumably to
            handle signed decision_function margins — the code only checks
            the sign of the minimum).
    """
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(pr_recall, pr_precision)
    roc_auc = roc_auc_score(y_true, y_scores)

    has_negative_scores = np.min(y_scores) < 0
    if has_negative_scores:
        threshold = 0.0

    y_pred_class = (y_scores >= threshold).astype(int)
    f1 = f1_score(y_true, y_pred_class)

    summary_lines = (
        f"ROC-AUC: {roc_auc:.4f}",
        f"PR-AUC: {pr_auc:.4f}",
        f"F1-Score (at threshold {threshold}): {f1:.4f}",
        "Classification Report:",
    )
    for line in summary_lines:
        print(line)
    print(classification_report(y_true, y_pred_class))

def train_and_evaluate_model(pipeline, param_grid, X_train, y_train, X_test, y_test):
    """Tune ``pipeline`` with a 5-fold ROC-AUC grid search and score it on the test set.

    Prints the search duration, the best parameters and the test-set metrics
    produced by ``evaluate_model``.

    Returns:
        The refitted best estimator found by the grid search.
    """
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
    final_step = pipeline.steps[-1][1]
    print(f"--- Starting Grid Search for {final_step.__class__.__name__} ---")

    started = time.time()
    search.fit(X_train, y_train)
    elapsed = time.time() - started
    print(f"Grid Search completed in {elapsed:.2f} seconds.")
    print(f"Best parameters found: {search.best_params_}")

    best_model = search.best_estimator_
    # Use positive-class probabilities when the model offers them, otherwise
    # fall back to decision_function scores.
    if not hasattr(best_model, 'predict_proba'):
        y_scores = best_model.decision_function(X_test)
    else:
        y_scores = best_model.predict_proba(X_test)[:, 1]

    print("--- Evaluation on Test Set ---")
    evaluate_model(y_test, y_scores)
    return best_model

# Target encoding for high-cardinality features and one-hot encoding for low-cardinality features
# Separate the target from the features and hold out a stratified 20% test set.
X = df_hotel_cleaned.drop(columns=['is_canceled'])
y = df_hotel_cleaned['is_canceled']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# High-cardinality features that are target encoded.
target_encode_features = ['country', 'agent', 'company']

# 'agent' and 'company' are stored as integers after cleaning, so a plain
# numeric-dtype selection would also pick them up and they would be fed to the
# model twice: once standard-scaled as if they were magnitudes and once target
# encoded. Keep each column in exactly one transformer.
numeric_features = [
    col for col in X_train.select_dtypes(include=np.number).columns
    if col not in target_encode_features
]

# All remaining non-numeric features are one-hot encoded.
ohe_features = [
    col for col in X_train.select_dtypes(exclude=np.number).columns
    if col not in target_encode_features
]

# Three disjoint column groups: scaled numerics, one-hot categoricals and
# target-encoded high-cardinality features. TargetEncoder is imported
# unconditionally at the top of the file, so no fallback encoder is needed.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ohe_features),
        ('te', TargetEncoder(), target_encode_features),
    ],
    remainder='passthrough'
)
print("Final data preparation is complete. Preprocessor now includes Target Encoding.")

# Run the experiments

# --- Decision Tree ---
# Grid over tree depth and leaf size, always with balanced class weights.
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])
dt_param_grid = {
    'classifier__max_depth': [8, 16],
    'classifier__min_samples_leaf': [100, 200],
    'classifier__class_weight': ['balanced'],
}
best_dt_model = train_and_evaluate_model(
    dt_pipeline, dt_param_grid, X_train, y_train, X_test, y_test
)

# --- k-Nearest Neighbors ---
# Grid over the neighbourhood size only.
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_jobs=-1)),
])
knn_param_grid = {'classifier__n_neighbors': [5, 11]}
best_knn_model = train_and_evaluate_model(
    knn_pipeline, knn_param_grid, X_train, y_train, X_test, y_test
)


# ## Train and evaluate model

# Neural Network

# Re-imported here in case the initial import cell was not run.
from sklearn.neural_network import MLPClassifier

print("--- Training Shallow-Wide Neural Network ---")
# Two hidden layers of 512 units, trained with SGD and early stopping.
nn_shallow_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(
        hidden_layer_sizes=(512, 512),
        solver='sgd',
        learning_rate_init=0.01,
        batch_size=512,
        max_iter=100,
        early_stopping=True,
        random_state=42,
        verbose=False,
    )),
])
# (The notebook originally wrapped this fit in %memit.)
nn_shallow_pipeline.fit(X_train, y_train)

# Score the held-out set with positive-class probabilities.
y_scores_shallow = nn_shallow_pipeline.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_scores_shallow)

print("--- Training Deeper-Narrower Neural Network ---")
# Four hidden layers (256, 256, 128, 128) with the same optimiser settings.
nn_deep_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(
        hidden_layer_sizes=(256, 256, 128, 128),
        solver='sgd',
        learning_rate_init=0.01,
        batch_size=512,
        max_iter=100,
        early_stopping=True,
        random_state=42,
        verbose=False,
    )),
])
# (The notebook originally wrapped this fit in %memit.)
nn_deep_pipeline.fit(X_train, y_train)
print("--- Evaluation of Deeper-Narrower NN ---")
y_scores_deep = nn_deep_pipeline.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_scores_deep)

# %%
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV

# --- Linear SVM ---
# Tuned on the full training split through the shared grid-search helper.
linear_svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=42, max_iter=2000)),
])
linear_svm_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__class_weight': ['balanced'],
}
best_linear_svm_model = train_and_evaluate_model(
    linear_svm_pipeline, linear_svm_param_grid, X_train, y_train, X_test, y_test
)

# --- RBF SVM ---
# Tuned on a stratified 20,000-row subsample of the training split.
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train, y_train, train_size=20000, random_state=42, stratify=y_train
)
rbf_svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='rbf', probability=True, random_state=42)),
])
rbf_svm_param_grid = {
    'classifier__C': [0.5, 2, 8],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__class_weight': ['balanced'],
}
random_search_rbf = RandomizedSearchCV(
    rbf_svm_pipeline,
    param_distributions=rbf_svm_param_grid,
    n_iter=6,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
print(f"--- Starting Randomized Search for RBF SVM on {len(X_train_sample)} samples ---")

# The search has to be fitted before best_estimator_ / best_params_ exist.
random_search_rbf.fit(X_train_sample, y_train_sample)

best_rbf_svm_model = random_search_rbf.best_estimator_
print("Best parameters found:", random_search_rbf.best_params_)
y_scores_rbf = best_rbf_svm_model.predict_proba(X_test)[:, 1]
print("--- Evaluation on Test Set ---")
evaluate_model(y_test, y_scores_rbf)

# Helper for reading search results without tripping over an unfitted search.
def safe_get_best_model(search_cv, X_train, y_train):
    """Return ``search_cv.best_estimator_``, fitting the search first if needed.

    The search is fitted on ``(X_train, y_train)`` only when it has no
    ``best_estimator_`` attribute yet or that attribute is None.
    """
    already_fitted = getattr(search_cv, 'best_estimator_', None) is not None
    if not already_fitted:
        print("Fitting search CV object...")
        search_cv.fit(X_train, y_train)
    return search_cv.best_estimator_

# %%
# Plotting

def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=-1):
    """Save a ROC-AUC learning curve for ``estimator`` as a PNG.

    Scores are computed at five training-set fractions between 10% and 100%
    and averaged over the CV folds. The image is written to the current
    directory as ``Hotel_<title with spaces replaced by underscores, lowercased>.png``
    and the figure is closed afterwards.
    """
    sizes, fold_train_scores, fold_cv_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=np.linspace(.1, 1.0, 5),
        scoring='roc_auc',
    )
    mean_train = fold_train_scores.mean(axis=1)
    mean_cv = fold_cv_scores.mean(axis=1)

    fig = plt.figure()
    ax = fig.gca()
    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("ROC-AUC Score")
    ax.grid()
    ax.plot(sizes, mean_train, 'o-', color="r", label="Training score")
    ax.plot(sizes, mean_cv, 'o-', color="g", label="Cross-validation score")
    ax.legend(loc="best")

    out_name = f"Hotel_{title.replace(' ', '_').lower()}.png"
    fig.savefig(out_name)
    plt.close(fig)  # nothing is displayed; the PNG is the only output

def plot_validation_curve(estimator, title, X, y, param_name, param_range, cv=5, n_jobs=-1):
    """Save a ROC-AUC validation curve over ``param_range`` as a PNG.

    For every value of ``param_name`` the training and cross-validation
    scores are averaged over the CV folds. The image is written to the current
    directory as ``Hotel_<title with spaces replaced by underscores, lowercased>.png``
    and the figure is closed afterwards.
    """
    fold_train_scores, fold_cv_scores = validation_curve(
        estimator, X, y,
        param_name=param_name,
        param_range=param_range,
        cv=cv,
        scoring="roc_auc",
        n_jobs=n_jobs,
    )
    mean_train = fold_train_scores.mean(axis=1)
    mean_cv = fold_cv_scores.mean(axis=1)

    fig = plt.figure()
    ax = fig.gca()
    ax.set_title(title)
    ax.set_xlabel(param_name)
    ax.set_ylabel("Score")
    ax.set_ylim(0.0, 1.1)
    ax.grid()
    ax.plot(param_range, mean_train, label="Training score", color="darkorange")
    ax.plot(param_range, mean_cv, label="Cross-validation score", color="navy")
    ax.legend(loc="best")

    out_name = f"Hotel_{title.replace(' ', '_').lower()}.png"
    fig.savefig(out_name)
    plt.close(fig)  # nothing is displayed; the PNG is the only output


def plot_roc_pr_curves(models, X_test, y_test, save_dir="."):
    """Plot ROC and precision-recall curves for several fitted models.

    Args:
        models: mapping of display name -> fitted model. Each model is scored
            with ``predict_proba`` (positive-class column) when available,
            otherwise with ``decision_function``.
        X_test: test features passed to each model.
        y_test: true binary labels.
        save_dir: directory that receives ``Hotel_ROC_and_PR_Curves.png``.
            It is created if missing. The default "." keeps the previous
            output location (current working directory).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    for name, model in models.items():
        if hasattr(model, 'predict_proba'):
            y_scores = model.predict_proba(X_test)[:, 1]
        else:
            y_scores = model.decision_function(X_test)

        RocCurveDisplay.from_predictions(y_test, y_scores, name=name, ax=ax1)
        PrecisionRecallDisplay.from_predictions(y_test, y_scores, name=name, ax=ax2)

    ax1.set_title("ROC Curves")
    ax2.set_title("Precision-Recall Curves")
    ax1.grid(True)
    ax2.grid(True)
    fig.tight_layout()

    # Honour save_dir: previously the parameter was accepted but ignored and
    # the file always went to the current working directory.
    os.makedirs(save_dir, exist_ok=True)
    fig.savefig(os.path.join(save_dir, "Hotel_ROC_and_PR_Curves.png"))
    plt.close(fig)

def plot_nn_loss_curve(nn_model, title, save_dir="."):
    """Save the training loss curve of a fitted scikit-learn MLP model.

    Args:
        nn_model: fitted model exposing ``loss_curve_``.
        title: plot title; also used (spaces replaced by underscores) as the
            PNG file name.
        save_dir: directory that receives the PNG. It is created if missing.
            The default "." keeps the previous output location.

    Note: a function with the same name is defined again further down this
    file; from that point on the later definition is the one in effect.
    """
    fig = plt.figure(figsize=(8, 6))
    plt.plot(nn_model.loss_curve_)
    plt.xlabel("Epochs")
    plt.ylabel("Training Loss")
    plt.title(title)
    plt.grid(True)

    # Honour save_dir: previously the parameter was accepted but ignored.
    os.makedirs(save_dir, exist_ok=True)
    fig.savefig(os.path.join(save_dir, f"{title.replace(' ', '_')}.png"))
    plt.close(fig)

print("--- Generating Learning Curves ---")
# One learning curve per tuned model; the RBF SVM uses the same subsample it
# was tuned on.
for _estimator, _title, _features, _labels in (
    (best_dt_model, "Learning Curve (Decision Tree)", X_train, y_train),
    (best_knn_model, "Learning Curve (k-NN)", X_train, y_train),
    (best_linear_svm_model, "Learning Curve (Linear SVM)", X_train, y_train),
    (best_rbf_svm_model, "Learning Curve (RBF SVM)", X_train_sample, y_train_sample),
):
    plot_learning_curve(_estimator, _title, _features, _labels)

print("--- Generating Model Complexity Curve for Decision Tree ---")
# A fresh, untuned tree so that max_depth is the only parameter being varied.
dt_pipeline_untuned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
])
param_range_depth = np.arange(2, 25)
plot_validation_curve(
    dt_pipeline_untuned,
    "Validation Curve (Decision Tree vs. max_depth)",
    X_train,
    y_train,
    param_name="classifier__max_depth",
    param_range=param_range_depth,
)

# ROC / PR comparison across all six fitted classifiers.
models_to_plot = {
    "Decision Tree": best_dt_model,
    "k-NN": best_knn_model,
    "Linear SVM": best_linear_svm_model,
    "RBF SVM": best_rbf_svm_model,
    "Shallow NN": nn_shallow_pipeline,
    "Deep NN": nn_deep_pipeline
}
plot_roc_pr_curves(models_to_plot, X_test, y_test)

print("--- Generating NN Loss Curves ---")
for _pipeline, _title in (
    (nn_shallow_pipeline, "NN classifier Loss Curve (Shallow)"),
    (nn_deep_pipeline, "NN classifier Loss Curve (Deep)"),
):
    plot_nn_loss_curve(_pipeline.named_steps['classifier'], _title)


# %%
# --- FINAL SUMMARY TABLE ---
# Re-score every fitted classifier on the test set, timing the scoring call,
# and collect one row of metrics per model.
models = {
    "Decision Tree": best_dt_model,
    "k-NN": best_knn_model,
    "Linear SVM": best_linear_svm_model,
    "RBF SVM": best_rbf_svm_model,
    "Shallow NN": nn_shallow_pipeline,
    "Deep NN": nn_deep_pipeline,
}
results = []

print("--- Generating Final Summary Table ---")
for name, model in models.items():

    # Time only the scoring step.
    start_time = time.time()
    y_scores = (
        model.predict_proba(X_test)[:, 1]
        if hasattr(model, 'predict_proba')
        else model.decision_function(X_test)
    )
    end_time = time.time()
    predict_time = end_time - start_time

    # Ranking metrics computed from the raw scores.
    roc_auc = roc_auc_score(y_test, y_scores)
    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    pr_auc = auc(recall, precision)

    # Linear SVM scores come from decision_function and are cut at 0.0; every
    # other model (including the RBF SVM, which has predict_proba) uses 0.5.
    if "Linear SVM" in name:
        threshold = 0.0
    else:
        threshold = 0.5
    f1 = f1_score(y_test, (y_scores >= threshold).astype(int))

    results.append({
        "Model": name,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        "F1-Score": f1,
        "Predict Time (s)": predict_time,
    })

df_results = pd.DataFrame(results)
print("--- Final Model Comparison ---")
print(df_results.sort_values(by='ROC-AUC', ascending=False))

# %%
from sklearn.neural_network import MLPRegressor

# Neural Network

from sklearn.preprocessing import StandardScaler

# --- Create and fit a scaler for the target variable ---
# NOTE(review): at this point in the file `X_train` / `y_train` are still the
# hotel split, so `y_train` is the binary 'is_canceled' label. The comments
# below talk about "minutes" and the accidents data is only loaded later —
# this cell looks like it was meant to run after the US Accidents split.
# Confirm the intended cell order before trusting these results.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))

print("--- Training Shallow-Wide Neural Network (Regressor with Scaled Target) ---")

# Shallow-wide regressor: two hidden layers of 512 units, SGD with an adaptive
# learning rate and early stopping.
nn_shallow_reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(
        hidden_layer_sizes=(512, 512),
        solver='sgd',
        learning_rate='adaptive', # Keep the adaptive rate
        learning_rate_init=0.01,
        batch_size=1024,
        max_iter=100,  # Increased from 15
        early_stopping=True,
        random_state=42,
        verbose=True
    ))
])

# Fit the model using the scaled y_train
# [notebook-magic removed] %memit nn_shallow_reg_pipeline.fit(X_train, y_train_scaled.ravel())


nn_shallow_reg_pipeline.fit(X_train, y_train_scaled.ravel())
# Predictions come out in the scaled target space.
y_pred_scaled = nn_shallow_reg_pipeline.predict(X_test)

# Map the scaled predictions back to the original target scale.
y_pred_unscaled = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))


print("--- Evaluation on Test Set ---")
# NOTE(review): regression outputs are scored here with `evaluate_model`, the
# classification helper (ROC-AUC / PR-AUC / F1), not with a regression metric.
evaluate_model(y_test, y_pred_unscaled.ravel())


# NOTE: We are reusing the 'y_scaler' that was fit on y_train in the previous step.

# --- Define and train the DEEP NN pipeline on the SCALED target data ---
print("--- Training Deeper-Narrower Neural Network (Regressor with Scaled Target) ---")

# Deeper-narrower regressor: four hidden layers (256, 256, 128, 128) with the
# same optimiser settings as the shallow model.
nn_deep_reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(
        hidden_layer_sizes=(256, 256, 128, 128), # Deeper-narrower architecture
        solver='sgd',
        learning_rate='adaptive',
        learning_rate_init=0.01,
        batch_size=1024,
        max_iter=100,  # Increased from 15
        early_stopping=True,
        random_state=42,
        verbose=True
    ))
])

# Fit the model using the same scaled y_train
# [notebook-magic removed] %memit nn_deep_reg_pipeline.fit(X_train, y_train_scaled.ravel())

nn_deep_reg_pipeline.fit(X_train, y_train_scaled.ravel())
# The model will predict in the scaled space
y_pred_deep_scaled = nn_deep_reg_pipeline.predict(X_test)

# Convert the scaled predictions back to the original target scale using the
# same scaler. (The original comment says "minutes"; see the NOTE(review) at
# the top of this cell about which target is actually in scope.)
y_pred_deep_unscaled = y_scaler.inverse_transform(y_pred_deep_scaled.reshape(-1, 1))


print("--- Evaluation on Test Set ---")
# NOTE(review): scored with the classification helper, as above.
evaluate_model(y_test, y_pred_deep_unscaled.ravel())

# Imports
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt

# Scikit-learn preprocessing and pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Scikit-learn regression models
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neural_network import MLPRegressor

# Scikit-learn model selection and metrics for regression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, learning_curve, validation_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Directory that receives the accidents plots; created up front so the later
# savefig calls can write into it.
output_dir = "US_accdnt_out"
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory '{output_dir}' is ready.")

# Helper functions for model evaluation
def evaluate_regression_model(y_true, y_pred):
    """Print the MAE and RMSE of ``y_pred`` against ``y_true``."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    for report_line in (
        f"Mean Absolute Error (MAE): {mae:.4f}",
        f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    ):
        print(report_line)

def train_and_evaluate_regression_model(pipeline, param_grid, X_train, y_train, X_test, y_test):
    """Tune ``pipeline`` with a 3-fold MAE grid search and score it on the test set.

    Prints the search duration, the best parameters, the best cross-validated
    MAE and the test-set MAE / RMSE.

    Returns:
        The refitted best estimator found by the grid search.
    """
    search = GridSearchCV(
        pipeline,
        param_grid,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1,
    )
    final_step = pipeline.steps[-1][1]
    print(f"--- Starting Grid Search for {final_step.__class__.__name__} ---")

    started = time.time()
    search.fit(X_train, y_train)
    elapsed = time.time() - started
    print(f"Grid Search completed in {elapsed:.2f} seconds.")
    print(f"Best parameters found: {search.best_params_}")
    # The scorer is negated MAE, so flip the sign back for reporting.
    print(f"Best MAE on validation data: {-search.best_score_:.4f}")

    best_model = search.best_estimator_
    test_predictions = best_model.predict(X_test)
    print("--- Evaluation on Test Set ---")
    evaluate_regression_model(y_test, test_predictions)
    return best_model

# Data loading and cleaning
# Load the US Accidents CSV; a missing file yields an empty frame so the
# cleaning step below is skipped.
# (For a local run, point read_csv at 'US_Accidents_March23.csv' instead.)
try:
    df_accidents = pd.read_csv('/kaggle/input/us-accidents/US_Accidents_March23.csv')
    print("US Accidents dataset loaded successfully!")
except FileNotFoundError:
    print("Error: Accidents dataset file not found.")
    df_accidents = pd.DataFrame()

if not df_accidents.empty:
    # Columns removed before modelling.
    cols_to_drop = ['ID', 'Weather_Timestamp', 'Airport_Code', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight', 'City', 'Street', 'Zipcode', 'County', 'Country', 'State', 'Timezone', 'Weather_Condition']
    df_cleaned = df_accidents.drop(columns=cols_to_drop)

    # format='mixed' handles variations in the datetime strings.
    df_cleaned['Start_Time'] = pd.to_datetime(df_cleaned['Start_Time'], format='mixed')
    df_cleaned['End_Time'] = pd.to_datetime(df_cleaned['End_Time'], format='mixed')

    # Target: duration in minutes; only rows with a positive duration are kept.
    df_cleaned['Duration'] = (df_cleaned['End_Time'] - df_cleaned['Start_Time']).dt.total_seconds() / 60
    df_cleaned = df_cleaned[df_cleaned['Duration'] > 0]

    # Drop the 'Description' column as it contains string values and is not used in modeling
    if 'Description' in df_cleaned.columns:
        df_cleaned = df_cleaned.drop(columns=['Description'])

    # Impute missing values in object-dtype (categorical) columns with 'Unknown'.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that chained in-place form warns on pandas 2.x and
    # silently does nothing once copy-on-write is enabled.
    categorical_cols = df_cleaned.select_dtypes(include='object').columns
    df_cleaned[categorical_cols] = df_cleaned[categorical_cols].fillna('Unknown')

    # Drop rows with any remaining missing values (non-categorical columns).
    df_cleaned = df_cleaned.dropna()

    # Booleans become 0/1, then every numeric column is stored as float32.
    bool_cols = df_cleaned.select_dtypes(include='bool').columns
    df_cleaned[bool_cols] = df_cleaned[bool_cols].astype(int)
    num_cols = df_cleaned.select_dtypes(include=np.number).columns
    df_cleaned[num_cols] = df_cleaned[num_cols].astype(np.float32)

    print("Data cleaning and feature engineering complete.")

# Sampling and Data preparation
# Draw a 1.5M-row sample stratified on Severity, then split it 80/20.
df_sample, _ = train_test_split(df_cleaned, train_size=1500000, stratify=df_cleaned['Severity'], random_state=42)

X = df_sample.drop(columns=['Duration', 'Severity'])
# The raw timestamps must not reach the model: Duration is computed as
# End_Time - Start_Time, so keeping both leaks the target, and as datetime
# columns they would be selected as "categorical" below and one-hot encoded
# into one column per distinct timestamp.
X = X.drop(columns=['Start_Time', 'End_Time'], errors='ignore')
y = df_sample['Duration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Smaller training subsets for the models that scale poorly with sample count.
X_train_svm, _, y_train_svm, _ = train_test_split(X_train, y_train, train_size=25000, random_state=42)
X_train_knn, _, y_train_knn, _ = train_test_split(X_train, y_train, train_size=250000, random_state=42)

# Numeric columns are scaled; every remaining (non-numeric) column is treated
# as categorical and one-hot encoded.
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=np.number).columns.tolist()


# ColumnTransformer handling the two feature groups.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

print(f"Data sampling complete. Main training set: {len(X_train)}, k-NN sample: {len(X_train_knn)}, SVM sample: {len(X_train_svm)}")
print("Preprocessor (ColumnTransformer) defined to handle numeric and categorical features.")


# %%
# Run the Experiments
# The searches below clone the pipeline they are given and refit the best
# configuration themselves, so the pipelines are not fitted separately first:
# that extra fit only repeated the (expensive) training. Use the best_*
# objects for predictions; the base pipelines stay unfitted.

# --- Decision Tree Regressor ---
dt_reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', DecisionTreeRegressor(random_state=42))])
dt_reg_param_grid = {'regressor__max_depth': [8, 16], 'regressor__min_samples_leaf': [200, 400]}
best_dt_reg_model = train_and_evaluate_regression_model(dt_reg_pipeline, dt_reg_param_grid, X_train, y_train, X_test, y_test)

# --- k-NN Regressor --- (Uses k-NN sample)
knn_reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', KNeighborsRegressor(n_jobs=-1))])
knn_reg_param_grid = {'regressor__n_neighbors': [5, 11]}
best_knn_reg_model = train_and_evaluate_regression_model(knn_reg_pipeline, knn_reg_param_grid, X_train_knn, y_train_knn, X_test, y_test)

# --- Linear SVR ---
linear_svr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearSVR(random_state=42, max_iter=2000))])
linear_svr_param_grid = {'regressor__C': [0.1, 1, 10]}
best_linear_svr_model = train_and_evaluate_regression_model(linear_svr_pipeline, linear_svr_param_grid, X_train, y_train, X_test, y_test)

# --- RBF SVR (Optimized) --- (Uses SVM sample)
rbf_svr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', SVR(kernel='rbf'))])
rbf_svr_param_grid = {'regressor__C': [1, 10], 'regressor__gamma': ['scale']}
# Exhaustive grid search (the grid has only two candidates).
grid_search_svr = GridSearchCV(rbf_svr_pipeline, param_grid=rbf_svr_param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
print(f"--- Starting Grid Search for SVR (RBF) on {len(X_train_svm)} samples ---")
grid_search_svr.fit(X_train_svm, y_train_svm)
best_rbf_svr_model = grid_search_svr.best_estimator_
print("Best parameters found:", grid_search_svr.best_params_)
# The scorer is negated MAE, so flip the sign back for reporting.
print(f"Best MAE on validation data: {-grid_search_svr.best_score_:.4f}")
# Evaluate on test set
y_pred_rbf = best_rbf_svr_model.predict(X_test)
print("--- Evaluation on Test Set ---")
evaluate_regression_model(y_test, y_pred_rbf)

# Plotting functions
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error

def plot_learning_curve_regression(estimator, title, X, y, cv=3, n_jobs=-1, save_dir="."):
    """Plot and save an MAE learning curve for ``estimator``.

    Scores are computed at five training-set fractions between 10% and 100%
    using negated MAE, averaged over the CV folds and sign-flipped back to
    MAE. The figure is written to
    ``<save_dir>/Accidnt_<title with spaces replaced by underscores>.png``,
    shown, and then closed so figures do not pile up across calls.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=np.linspace(.1, 1.0, 5),
        scoring='neg_mean_absolute_error',
    )
    train_scores_mean = -np.mean(train_scores, axis=1)
    test_scores_mean = -np.mean(test_scores, axis=1)

    fig = plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Mean Absolute Error")
    plt.grid()
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")

    plt.savefig(os.path.join(save_dir, f"Accidnt_{title.replace(' ', '_')}.png"))
    plt.show()
    # Release the figure, matching the hotel plotting helpers.
    plt.close(fig)

def plot_residuals(y_true, y_pred, title, save_dir="."):
    """Plot and save residuals (actual - predicted) against predicted values.

    The figure is written to
    ``<save_dir>/Accidnt_<title with spaces replaced by underscores>.png``,
    shown, and then closed so figures do not pile up across calls.
    """
    residuals = y_true - y_pred

    fig = plt.figure(figsize=(8, 6))
    plt.scatter(y_pred, residuals, alpha=0.2)
    # Zero line: points above it are under-predictions, below it over-predictions.
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals (Actual - Predicted)")
    plt.title(title)
    plt.grid(True)

    plt.savefig(os.path.join(save_dir, f"Accidnt_{title.replace(' ', '_')}.png"))
    plt.show()
    # Release the figure, matching the hotel plotting helpers.
    plt.close(fig)

def plot_nn_loss_curve(nn_model, title, save_dir="."):
    """Plot and save the training loss curve of a fitted scikit-learn MLP model.

    Reads ``nn_model.loss_curve_`` and writes the figure to
    ``<save_dir>/<title with spaces replaced by underscores>.png``. The figure
    is shown and then closed so figures do not pile up across calls.

    Note: this definition replaces the earlier function of the same name
    defined in the hotel plotting section of this file.
    """
    fig = plt.figure(figsize=(8, 6))
    plt.plot(nn_model.loss_curve_)
    plt.xlabel("Epochs")
    plt.ylabel("Training Loss")
    plt.title(title)
    plt.grid(True)

    plt.savefig(os.path.join(save_dir, f"{title.replace(' ', '_')}.png"))
    plt.show()
    # Release the figure, matching the hotel plotting helpers.
    plt.close(fig)


def plot_validation_curve_regression(estimator, title, X, y, param_name, param_range, cv=3, n_jobs=-1, save_dir="."):
    """Plot and save a validation curve (MAE vs. one hyper-parameter).

    Args:
        estimator: Estimator or pipeline handed to ``validation_curve``.
        title: Figure title; with spaces replaced by underscores it also
            becomes part of the output file name.
        X, y: Features and targets forwarded to ``validation_curve``.
        param_name: Name of the parameter to vary (pipeline syntax such as
            ``"regressor__max_depth"`` is what the callers in this file use).
        param_range: Values of ``param_name`` to evaluate; also the x-axis.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        n_jobs: Parallelism setting forwarded to ``validation_curve``.
        save_dir: Directory that receives the PNG (created when missing).
    """
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring="neg_mean_absolute_error", n_jobs=n_jobs
    )

    # The scorer is *negative* MAE, so flip the sign after averaging over folds
    train_scores_mean = -np.mean(train_scores, axis=1)
    test_scores_mean = -np.mean(test_scores, axis=1)

    fig = plt.figure()
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Mean Absolute Error")
    plt.grid()
    plt.plot(param_range, train_scores_mean, label="Training score", color="darkorange")
    plt.plot(param_range, test_scores_mean, label="Cross-validation score", color="navy")
    plt.legend(loc="best")

    # Save and show plot; make sure the target directory exists first
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    plt.savefig(os.path.join(save_dir, f"Accidnt_{title.replace(' ', '_')}.png"))
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)


# Generate and Save Learning Curves
# One curve per tuned regressor. k-NN and RBF SVR are passed their own
# (X_train_knn / X_train_svm) training sets; the other two use X_train.
print("--- Generating and Saving Learning Curves ---")
plot_learning_curve_regression(best_dt_reg_model, "Learning Curve (Decision Tree)", X_train, y_train, save_dir=output_dir)
plot_learning_curve_regression(best_knn_reg_model, "Learning Curve (k-NN)", X_train_knn, y_train_knn, save_dir=output_dir)
plot_learning_curve_regression(best_linear_svr_model, "Learning Curve (Linear SVR)", X_train, y_train, save_dir=output_dir)
plot_learning_curve_regression(best_rbf_svr_model, "Learning Curve (RBF SVR)", X_train_svm, y_train_svm, save_dir=output_dir)

# Generate and Save NN Loss Curves
# The fitted network is the 'regressor' step of each pipeline.
print("--- Generating and Saving NN Loss Curves ---")
plot_nn_loss_curve(nn_shallow_reg_pipeline.named_steps['regressor'],
                   "NN Regressor Loss Curve (Shallow)",
                   save_dir=output_dir)

plot_nn_loss_curve(nn_deep_reg_pipeline.named_steps['regressor'],
                   "NN Regressor Loss Curve (Deep)",
                   save_dir=output_dir)

# Generating residual plots
# Only the decision tree gets a residual plot, evaluated on the test split.
y_pred_dt = best_dt_reg_model.predict(X_test)
plot_residuals(y_test, y_pred_dt, "Residuals Plot (Decision Tree)", save_dir=output_dir)

# Generating Model Complexity (Validation) Curves
# Each section builds a fresh, untuned pipeline (shared preprocessor + default
# regressor) and sweeps exactly one hyper-parameter through
# plot_validation_curve_regression.
print("--- Generating and Saving Model Complexity (Validation) Curves ---")

# Validation Curve for Decision Tree Regressor
dt_reg_pipeline_untuned = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', DecisionTreeRegressor(random_state=42))])
param_range_depth = np.arange(4, 25, 4) # Test depths 4, 8, ..., 24

plot_validation_curve_regression(
    dt_reg_pipeline_untuned,
    "Validation Curve (Decision Tree vs max_depth)",
    X_train,
    y_train,
    param_name="regressor__max_depth",
    param_range=param_range_depth,
    save_dir=output_dir
)

# Validation Curve for k-NN Regressor
# Note: This runs on the smaller k-NN subsample for efficiency
knn_reg_pipeline_untuned = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', KNeighborsRegressor(n_jobs=-1))])
param_range_k = [3, 5, 8, 11, 15, 21] # Test various numbers of neighbors

plot_validation_curve_regression(
    knn_reg_pipeline_untuned,
    "Validation Curve (k-NN vs n_neighbors)",
    X_train_knn, # Using the k-NN subsample
    y_train_knn,
    param_name="regressor__n_neighbors",
    param_range=param_range_k,
    save_dir=output_dir
)

# Validation Curve for Linear SVR
linear_svr_pipeline_untuned = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearSVR(random_state=42, max_iter=2000))])
# Test the regularization parameter C on a logarithmic scale
param_range_c_linear = np.logspace(-2, 2, 5) # i.e. [0.01, 0.1, 1, 10, 100]

plot_validation_curve_regression(
    linear_svr_pipeline_untuned,
    "Validation Curve (Linear SVR vs C)",
    X_train,
    y_train,
    param_name="regressor__C",
    param_range=param_range_c_linear,
    save_dir=output_dir
)

# Validation Curve for RBF SVR
# IMPORTANT: This MUST be run on the smallest SVM subsample to be computationally feasible.
rbf_svr_pipeline_untuned = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', SVR(kernel='rbf'))])
param_range_c_rbf = np.logspace(-1, 2, 4) # i.e. [0.1, 1, 10, 100]

plot_validation_curve_regression(
    rbf_svr_pipeline_untuned,
    "Validation Curve (RBF SVR vs C)",
    X_train_svm, # Using the smallest SVM subsample
    y_train_svm,
    param_name="regressor__C",
    param_range=param_range_c_rbf,
    save_dir=output_dir
)

# Final Summary Table (Regression)
# Scores every tuned regressor on the held-out test split and records how long
# a full predict() pass takes.
results = []
models_reg = {
    "Decision Tree": best_dt_reg_model,
    "k-NN": best_knn_reg_model,
    "Linear SVR": best_linear_svr_model,
    "RBF SVR": best_rbf_svr_model,
}
print("--- Generating Final Summary Table ---")
for name, model in models_reg.items():
    # perf_counter is monotonic and high-resolution; time.time() can jump with
    # wall-clock adjustments and is too coarse for short predict() calls.
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    end_time = time.perf_counter()
    predict_time = end_time - start_time
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append({"Model": name, "MAE": mae, "RMSE": rmse, "Predict Time (s)": predict_time})
df_results_reg = pd.DataFrame(results)
print("--- Final Model Comparison (Regression) ---")
print(df_results_reg.sort_values(by='MAE', ascending=True))

Dataset loaded successfully!
Data cleaning complete.
Final data preparation is complete. Preprocessor now includes Target Encoding.
--- Starting Grid Search for DecisionTreeClassifier ---
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Grid Search completed in 7.72 seconds.
Best parameters found: {'classifier__class_weight': 'balanced', 'classifier__max_depth': 16, 'classifier__min_samples_leaf': 100}
--- Evaluation on Test Set ---
ROC-AUC: 0.9332
PR-AUC: 0.9020
F1-Score (at threshold 0.5): 0.8043
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.84      0.87     15033
           1       0.76      0.86      0.80      8845

    accuracy                           0.85     23878
   macro avg       0.83      0.85      0.84     23878
weighted avg       0.85      0.85      0.85     23878

--- Starting Grid Search for KNeighborsClassifier ---
Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [None]:
# Adding the code for OL enhancements (FAST MODE, with timing + memory profiling)
# ---------- (0) One‐time setup & parity guarantees ----------
import os, time, random, warnings, json
import numpy as np
import pandas as pd

# Use a non-interactive backend for script runs (saves plots to files)
import matplotlib
try:
    matplotlib.use("Agg")
except Exception:
    pass
import matplotlib.pyplot as plt

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from
# libraries — consider narrowing the filter when debugging.
warnings.filterwarnings("ignore")

# ======= SPEED KNOBS (dev mode) =======
# FAST_MODE selects the reduced budgets/toggles configured further below and
# disables the heatmap and stability studies.
FAST_MODE = True  # flip False for full report-quality runs

# Prefer MPS on Macs, then CUDA, else CPU
try:
    import torch
    _HAS_TORCH = True
    DEVICE = (torch.device("mps") if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
              else torch.device("cuda") if torch.cuda.is_available()
              else torch.device("cpu"))
except Exception:
    # torch is missing or unusable: fall back to a stand-in whose str() is "cpu"
    _HAS_TORCH = False
    class _CPU:
        def __str__(self): return "cpu"
    DEVICE = _CPU()

# After DEVICE is computed.
# Guard on _HAS_TORCH: when the import above failed, the name `torch` does not
# exist and touching it here would raise NameError instead of using the fallback.
IS_CUDA = bool(_HAS_TORCH and hasattr(torch, "cuda") and torch.cuda.is_available())
IS_CPU  = (str(DEVICE) == "cpu")
# Use 0 workers for MPS/CPU to avoid spawn/pickle issues and overhead
NUM_WORKERS = 0 if (not IS_CUDA) else 2

SEED = 4242


def set_seed(s=SEED):
    """Seed the Python, NumPy and (when importable) PyTorch random generators.

    With torch available this also switches cuDNN to deterministic mode and
    turns its benchmark auto-tuner off.
    """
    random.seed(s)
    np.random.seed(s)
    if not _HAS_TORCH:
        return
    torch.manual_seed(s)
    torch.cuda.manual_seed_all(s)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(SEED)
print(f"[OL] Torch device: {DEVICE} (cuda? {torch.cuda.is_available() if _HAS_TORCH else False})")

# ---- profiling helpers (OL) ----
# Memory profiling is opt-in: it is on only when the environment variable
# ENABLE_MEMPROF is exactly "1" AND the memory_profiler package imports.
_MEMPROF_ENABLED = os.getenv("ENABLE_MEMPROF", "0") == "1"
try:
    from memory_profiler import memory_usage as _mem_usage
    _HAVE_MEMPROF_OL = True
except Exception:
    # Package unavailable: profile_call will record wall time only
    _HAVE_MEMPROF_OL = False

def profile_call(label, func, *args, enable=None, **kwargs):
    """Call ``func(*args, **kwargs)`` and measure it.

    Args:
        label: Name stored with the measurement in the JSONL profile log.
        func: Callable to execute.
        *args, **kwargs: Forwarded to ``func``.
        enable: Force memory profiling on/off; ``None`` defers to the
            module-level ``_MEMPROF_ENABLED`` switch.

    Returns:
        ``(retval, seconds, peak_mb)`` where ``peak_mb`` is ``None`` unless
        memory profiling was both requested and available.
    """
    if enable is None:
        enable = _MEMPROF_ENABLED
    else:
        enable = bool(enable)

    started = time.perf_counter()
    peak = None
    if not (enable and _HAVE_MEMPROF_OL):
        retval = func(*args, **kwargs)
    else:
        # memory_usage runs the call itself and samples memory every 0.1 s
        mem_list, retval = _mem_usage((func, args, kwargs), retval=True, interval=0.1)
        peak = float(max(mem_list) if mem_list else -1.0)
    dt = time.perf_counter() - started

    # Logging is best-effort: a failed write must never lose the result
    try:
        os.makedirs("./ol_outputs/logs", exist_ok=True)
        with open("./ol_outputs/logs/ol_profile.jsonl", "a", encoding="utf-8") as f:
            entry = {"ts": time.time(), "label": label, "seconds": dt, "peak_mb": peak}
            f.write(json.dumps(entry) + "\n")
    except Exception as _e:
        print("[OL] profile log write fail:", _e)
    return retval, dt, peak

# Global budgets + toggles (equal budgets per study = OL requirement)
# Loader settings are the same in both modes, so they are set once up front.
TRAIN_BS = 256
EVAL_BS = 1024
NUM_WORKERS = 0

if FAST_MODE:
    # Development run: small budgets, two optimizers, optional studies off
    BUDGET_UPDATES = 60
    RO_CFG = {"RHC": 300, "SA": 600, "GA": 1200, "GA_pop": 20}
    OPT_VARIANTS = [("adam", {}), ("sgd", {})]
    DO_HEATMAPS = False
    DO_STABILITY = False
    RUN_ACCIDENTS = False
else:
    # Report-quality run: full budgets and the complete optimizer ablation
    BUDGET_UPDATES = 200
    RO_CFG = {"RHC": 2000, "SA": 4000, "GA": 8000, "GA_pop": 40}
    OPT_VARIANTS = [
        ("sgd", {}),
        ("momentum", {}),
        ("nesterov", {}),
        ("adam", {}),
        ("adam_nobias", {}),
        ("adam_beta1zero", {}),
        ("adamw", {"wd": 1e-4}),
    ]
    DO_HEATMAPS = True
    DO_STABILITY = True
    RUN_ACCIDENTS = True

print(f"[FAST] mode={FAST_MODE} device={DEVICE} budgets(BUDGET_UPDATES={BUDGET_UPDATES}, RO={RO_CFG})")

# Output layout: ./ol_outputs/{figs,logs}, created eagerly
OUT_DIR = "./ol_outputs"
FIG_DIR = os.path.join(OUT_DIR, "figs")
LOG_DIR = os.path.join(OUT_DIR, "logs")
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)


def log_parity(info: dict, fname="parity_meta.json"):
    """Write ``info`` as indented JSON to ``LOG_DIR/fname`` (best effort).

    The file is opened for writing, so an existing file of the same name is
    replaced. Values that JSON cannot encode are stringified. Failures are
    reported on stdout rather than raised.
    """
    path = os.path.join(LOG_DIR, fname)
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(info, f, indent=2, default=str)
    except Exception as e:
        print("[OL] Could not write parity meta:", e)
    else:
        print(f"[OL] Parity meta logged -> {path}")

# ---------- Optional raw→preprocess (only if split arrays aren't exposed) ----------
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def _make_ohe():
    """Build a dense, unknown-tolerant OneHotEncoder.

    Tries the ``sparse_output`` keyword first and falls back to the ``sparse``
    keyword when the constructor rejects it with a TypeError.
    """
    common = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **common)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **common)
    return encoder

def _make_safe_target_encoder(cols):
    """Return a TargetEncoder for ``cols`` or, if unusable, a OneHotEncoder.

    The category_encoders TargetEncoder is smoke-tested on a four-row dummy
    frame; any import or fit/transform failure triggers the one-hot fallback.
    """
    try:
        from category_encoders import TargetEncoder as _CE_TargetEncoder
    except Exception as e:
        print("[ENC] category_encoders not installed; using OneHotEncoder. Reason:", e)
        return _make_ohe()

    try:
        import pandas as _pd
        encoder = _CE_TargetEncoder(cols=cols, smoothing=1.0)
        # Tiny probe data set: same four categories in every encoded column
        probe_X = _pd.DataFrame({name: ["a", "b", "a", "c"] for name in cols})
        probe_y = _pd.Series([0, 1, 1, 0])
        encoder.fit(probe_X, probe_y)
        encoder.transform(probe_X)
        print("[ENC] Using TargetEncoder for:", cols)
        return encoder
    except Exception as e:
        print("[ENC] TargetEncoder not usable; falling back to OneHotEncoder. Reason:", e)
        return _make_ohe()

def _split_cols_by_type(df):
    return (df.select_dtypes(include=np.number).columns.tolist(),
            df.select_dtypes(exclude=np.number).columns.tolist())

def _simple_na_fill(df):
    for c in df.columns:
        if df[c].dtype.kind in "biufc":
            df.loc[:, c] = df[c].fillna(df[c].median())
        else:
            df.loc[:, c] = df[c].fillna("Unknown")
    return df

def build_preproc(X, high_card_cutoff=25):
    """Assemble a ColumnTransformer for the columns of ``X``.

    Numeric columns are standardized; non-numeric columns with more than
    ``high_card_cutoff`` distinct values (NaN counted) go to the target-encoder
    factory, the remaining ones are one-hot encoded. Unlisted columns are dropped.
    """
    nums, cats = _split_cols_by_type(X)

    hi, lo = [], []
    for col in cats:
        bucket = hi if X[col].nunique(dropna=False) > high_card_cutoff else lo
        bucket.append(col)

    transformers = []
    if nums:
        transformers.append(("num", StandardScaler(), nums))
    if lo:
        transformers.append(("ohe", _make_ohe(), lo))
    if hi:
        transformers.append(("hi_cat", _make_safe_target_encoder(hi), hi))
    return ColumnTransformer(transformers, remainder="drop", verbose_feature_names_out=False)

# Raw-data locations; each can be overridden through the environment variable
# of the same name. The defaults are absolute paths.
# NOTE(review): the hotel default points at the filesystem root — confirm this
# matches the runtime environment.
HOTEL_CSV = os.environ.get("HOTEL_CSV", "/hotel_bookings.csv")
ACCIDENTS_CSV = os.environ.get("ACCIDENTS_CSV", "/kaggle/input/us-accidents/US_Accidents_March23.csv")

def load_hotel_raw():
    """Read the hotel CSV and return ``(X, y)`` with ``y = is_canceled`` as int.

    The two reservation-status columns are removed when present, agent/company
    gaps become 0, country gaps take the most frequent country, children gaps
    become 0, and any remaining gaps go through ``_simple_na_fill``.
    """
    df = pd.read_csv(HOTEL_CSV, low_memory=False)

    status_cols = [c for c in ("reservation_status", "reservation_status_date") if c in df]
    df = df.drop(columns=status_cols)

    # A missing agent/company column is created from an empty Series, i.e. all zeros
    for id_col in ("agent", "company"):
        df.loc[:, id_col] = df.get(id_col, pd.Series(index=df.index)).fillna(0)

    if "country" in df:
        most_common = df["country"].mode(dropna=True).iloc[0]
        df.loc[:, "country"] = df["country"].fillna(most_common)
    if "children" in df:
        df.loc[:, "children"] = df["children"].fillna(0)

    y = df["is_canceled"].astype(int)
    features = df.drop(columns=["is_canceled"])
    X = _simple_na_fill(features)
    return X, y

def load_accidents_raw():
    """Read the accidents CSV and return ``(X, y)`` with ``y`` = duration in minutes.

    Rows whose duration is missing or not strictly positive are discarded. The
    free-text Description column (if any) and both timestamp columns are
    removed from the features before gaps are filled.
    """
    df = pd.read_csv(ACCIDENTS_CSV, low_memory=True)

    # Unparseable timestamps become NaT and are dropped via the Duration filter
    for ts_col in ("Start_Time", "End_Time"):
        df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")
    elapsed = df["End_Time"] - df["Start_Time"]
    df["Duration"] = elapsed.dt.total_seconds()/60.0

    df = df.dropna(subset=["Duration"])
    df = df.loc[df["Duration"] > 0]
    if "Description" in df:
        df = df.drop(columns=["Description"])

    y = df["Duration"].astype(np.float32)
    features = df.drop(columns=["Duration", "Start_Time", "End_Time"])
    X = _simple_na_fill(features)
    return X, y

def _np1d(v):
    import pandas as _pd
    return v.values if isinstance(v, (_pd.Series, _pd.DataFrame)) else v

# Build arrays if SL didn't export them.
# The hotel arrays are (re)built unless ALL six names already exist as globals.
if not all(n in globals() for n in ["Xh_tr_f","Xh_va_f","Xh_te_f","yh_tr","yh_va","yh_te"]):
    Xh, yh = load_hotel_raw()
    pre_h = build_preproc(Xh)
    # 70% train, then the remaining 30% is halved into validation and test (stratified)
    Xh_tr, Xh_tmp, yh_tr, yh_tmp = train_test_split(Xh, yh, test_size=0.3, random_state=SEED, stratify=yh)
    Xh_va, Xh_te, yh_va, yh_te   = train_test_split(Xh_tmp, yh_tmp, test_size=0.5, random_state=SEED, stratify=yh_tmp)
    # Preprocessor is fitted on the training split only and reused for val/test
    Xh_tr_f = pre_h.fit_transform(Xh_tr, yh_tr); Xh_va_f = pre_h.transform(Xh_va); Xh_te_f = pre_h.transform(Xh_te)

if FAST_MODE:
    RUN_ACCIDENTS = False  # FAST_MODE always skips the accidents data set
if RUN_ACCIDENTS and not all(n in globals() for n in ["Xa_tr_f","Xa_va_f","Xa_te_f","ya_tr","ya_va","ya_te"]):
    Xa, ya = load_accidents_raw()
    pre_a = build_preproc(Xa)
    # Same 70/15/15 scheme as above, without stratification (regression target)
    Xa_tr, Xa_tmp, ya_tr, ya_tmp = train_test_split(Xa, ya, test_size=0.3, random_state=SEED)
    Xa_va, Xa_te, ya_va, ya_te   = train_test_split(Xa_tmp, ya_tmp, test_size=0.5, random_state=SEED)
    Xa_tr_f = pre_a.fit_transform(Xa_tr, ya_tr); Xa_va_f = pre_a.transform(Xa_va); Xa_te_f = pre_a.transform(Xa_te)

# ADAPTER_OK is True when the hotel arrays exist and, if the accidents study is
# enabled, the accidents arrays exist as well.
ADAPTER_OK = all(n in globals() for n in [
    "Xh_tr_f","Xh_va_f","Xh_te_f","yh_tr","yh_va","yh_te"
]) and (True if not RUN_ACCIDENTS else all(n in globals() for n in [
    "Xa_tr_f","Xa_va_f","Xa_te_f","ya_tr","ya_va","ya_te"
]))
print("[OL] Adapter bound? ->", ADAPTER_OK)

# ---------- (1) Data: arrays → DataLoaders ----------
from torch.utils.data import Dataset, DataLoader

class NPDataset(Dataset):
    """Torch ``Dataset`` over in-memory feature/label arrays.

    Args:
        X: Array-like of features, stored as float32.
        y: Labels/targets as an ndarray, pandas object or plain sequence;
            stored as int64 when ``task == "cls"`` and float32 otherwise.
        task: ``"cls"`` for classification; any other value is treated as
            regression.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """

    def __init__(self, X, y, task):
        self.X = np.asarray(X, dtype=np.float32)
        label_dtype = np.int64 if task == "cls" else np.float32
        # np.asarray unwraps pandas objects and also accepts lists/tuples,
        # which have no .astype of their own.
        self.y = np.asarray(y).astype(label_dtype)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y length mismatch: {len(self.X)} vs {len(self.y)}"
            )
        self.task = task

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        """Return ``(features, label)`` tensors for sample ``i``."""
        return (torch.from_numpy(self.X[i]),
                torch.tensor(self.y[i]))

def make_loaders(Xtr, ytr, Xva, yva, Xte, yte, task, bs_train=None, bs_eval=None, num_workers=None):
    """Wrap the train/val/test arrays in ``DataLoader`` objects.

    Falsy batch sizes fall back to 256 (train) and 1024 (eval); ``num_workers``
    defaults to the module-level ``NUM_WORKERS``. Only the training loader
    shuffles. Memory pinning follows ``IS_CUDA``.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    bs_train = bs_train or 256
    bs_eval = bs_eval or 1024
    if num_workers is None:
        num_workers = NUM_WORKERS

    # Options common to all three loaders
    shared = dict(
        drop_last=False,
        num_workers=num_workers,
        pin_memory=bool(IS_CUDA),
        persistent_workers=False,
    )

    train_ds = NPDataset(Xtr, ytr, task)
    val_ds = NPDataset(Xva, yva, task)
    test_ds = NPDataset(Xte, yte, task)

    tr = DataLoader(train_ds, batch_size=bs_train, shuffle=True, **shared)
    va = DataLoader(val_ds, batch_size=bs_eval, shuffle=False, **shared)
    te = DataLoader(test_ds, batch_size=bs_eval, shuffle=False, **shared)
    return tr, va, te


# Hotel (classification): loaders + input width, then record array shapes
if all(n in globals() for n in ["Xh_tr_f","Xh_va_f","Xh_te_f","yh_tr","yh_va","yh_te"]):
    H_in = np.asarray(Xh_tr_f).shape[1]
    H_loaders = make_loaders(Xh_tr_f, yh_tr, Xh_va_f, yh_va, Xh_te_f, yh_te, task="cls",
                             bs_train=TRAIN_BS, bs_eval=EVAL_BS, num_workers=NUM_WORKERS)
    log_parity({"dataset":"Hotel","Xtr":list(np.asarray(Xh_tr_f).shape),"Xva":list(np.asarray(Xh_va_f).shape),
                "Xte":list(np.asarray(Xh_te_f).shape),"bs_train":TRAIN_BS,"bs_eval":EVAL_BS})

# Accidents (regression): only when the study is enabled and its arrays exist
if RUN_ACCIDENTS and all(n in globals() for n in ["Xa_tr_f","Xa_va_f","Xa_te_f","ya_tr","ya_va","ya_te"]):
    A_in = np.asarray(Xa_tr_f).shape[1]
    A_loaders = make_loaders(Xa_tr_f, ya_tr, Xa_va_f, ya_va, Xa_te_f, ya_te, task="reg",
                             bs_train=TRAIN_BS, bs_eval=EVAL_BS, num_workers=NUM_WORKERS)
    # log_parity opens its target in "w" mode, so reusing the default file name
    # here would overwrite the Hotel record written above. Use a separate file.
    log_parity({"dataset":"Accidents","Xtr":list(np.asarray(Xa_tr_f).shape),"Xva":list(np.asarray(Xa_va_f).shape),
                "Xte":list(np.asarray(Xa_te_f).shape),"bs_train":TRAIN_BS,"bs_eval":EVAL_BS},
               fname="parity_meta_accidents.json")

# ---------- (2) Model: mirror SL MLP with nn.Module ----------
import torch.nn as nn

class MLP(nn.Module):
    """Fully connected network: (Linear -> ReLU [-> Dropout]) per hidden width, then a Linear head.

    Args:
        in_dim: Number of input features.
        hidden: Widths of the hidden layers; an empty sequence yields a
            single Linear layer from ``in_dim`` to ``out_dim``.
        out_dim: Number of outputs of the final layer.
        dropout_p: Dropout probability; Dropout layers are added only when > 0.
    """

    def __init__(self, in_dim, hidden=(128,64), out_dim=2, dropout_p=0.0):
        super().__init__()
        widths = [in_dim, *hidden]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            if dropout_p > 0:
                stack.append(nn.Dropout(dropout_p))
        # widths[-1] is in_dim itself when there are no hidden layers
        stack.append(nn.Linear(widths[-1], out_dim))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        return self.net(x)

# Instantiate the networks only for data sets whose input width was computed
# above. Hotel: two-logit classifier with hidden sizes (128, 64).
if 'H_in' in globals():
    m_cls = MLP(H_in, hidden=(128,64), out_dim=2, dropout_p=0.0).to(DEVICE)
    print("[OL] Hotel MLP:", m_cls)
# Accidents: single-output regressor with hidden sizes (512, 256).
if RUN_ACCIDENTS and 'A_in' in globals():
    m_reg = MLP(A_in, hidden=(512,256), out_dim=1, dropout_p=0.0).to(DEVICE)
    print("[OL] Accidents MLP:", m_reg)

# ---------- (3) Freezing all but last k layers (RO) ----------
def linear_layers(model):
    """Return every ``nn.Linear`` submodule of ``model`` in ``model.modules()`` order."""
    return [m for m in model.modules() if isinstance(m, nn.Linear)]


def freeze_all_but_last_k(model, k=1):
    """Freeze all parameters, then re-enable gradients for the last ``k`` Linear layers.

    Args:
        model: Module whose parameters are modified in place.
        k: Number of trailing ``nn.Linear`` layers to leave trainable.
            ``k <= 0`` leaves the whole model frozen.

    Returns:
        The number of trainable parameters after freezing.
    """
    for p in model.parameters():
        p.requires_grad = False
    Ls = linear_layers(model)
    # Ls[-0:] is the *entire* list, so k == 0 must not go through the slice
    unfrozen = Ls[-k:] if k > 0 else []
    for m in unfrozen:
        for p in m.parameters():
            p.requires_grad = True
    tot = sum(p.numel() for p in model.parameters())
    trn = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"[OL] Params total={tot:,} | trainable(last {k})={trn:,}")
    return trn

# ---------- (4) Losses & metrics ----------
# When True, a single-logit classifier (out_dim == 1) is trained with BCE
USE_BCE_FOR_BINARY = False


def make_criterion(task="cls", out_dim=2):
    """Return the loss module for ``task``.

    Regression (any task other than "cls") uses MSE. Classification uses
    cross-entropy, unless ``USE_BCE_FOR_BINARY`` is set and ``out_dim == 1``,
    in which case BCE-with-logits is used.
    """
    if task != "cls":
        return nn.MSELoss()
    wants_bce = USE_BCE_FOR_BINARY and out_dim == 1
    return nn.BCEWithLogitsLoss() if wants_bce else nn.CrossEntropyLoss()

from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score, mean_absolute_error

@torch.no_grad()
def eval_metrics(model, loader, task, out_dim):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable of ``(features, targets)`` batches.
        task: ``"cls"`` for classification, anything else for regression.
        out_dim: Output width, used to pick the loss (see ``make_criterion``).

    Returns:
        ``(avg_loss, metrics)``. ``metrics`` holds ``auroc``, ``prauc``,
        ``acc`` and ``f1`` for classification, or ``mae`` for regression.
    """
    model.eval()
    ys, ps, loss_sum, n = [], [], 0.0, 0
    crit = make_criterion(task, out_dim)
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        out = model(xb)
        if task=="cls":
            if USE_BCE_FOR_BINARY and out_dim==1:
                # Single-logit head: sigmoid gives the positive-class probability
                loss = crit(out.squeeze(1), yb.float())
                prob = torch.sigmoid(out.squeeze(1)).cpu().numpy(); y = yb.cpu().numpy()
            else:
                # Multi-logit head: column 1 of the softmax is used as the score
                loss = crit(out, yb)
                prob = torch.softmax(out, dim=1)[:,1].cpu().numpy(); y = yb.cpu().numpy()
            ys.append(y); ps.append(prob)
        else:
            loss = crit(out.squeeze(-1), yb)
            ys.append(yb.cpu().numpy()); ps.append(out.squeeze(-1).cpu().numpy())
        # Weight each batch loss by its size so the average is per sample
        loss_sum += float(loss.item()) * xb.size(0); n += xb.size(0)
    y = np.concatenate(ys); p = np.concatenate(ps)
    avg_loss = loss_sum / max(n,1)  # max() guards against division by zero
    if task=="cls":
        # Ranking metrics are undefined with a single class present -> NaN
        auroc = roc_auc_score(y, p) if (len(np.unique(y))>1) else np.nan
        ap    = average_precision_score(y, p) if (len(np.unique(y))>1) else np.nan
        pred  = (p>=0.5).astype(int)  # fixed 0.5 decision threshold
        acc   = accuracy_score(y, pred); f1 = f1_score(y, pred)
        return avg_loss, {"auroc":auroc, "prauc":ap, "acc":acc, "f1":f1}
    else:
        mae = mean_absolute_error(y, p)
        return avg_loss, {"mae":mae}

# ---------- (5) Training loop (counts grad evals) ----------
def run_epoch(model, loader, optimizer=None, task="cls", out_dim=2, l2_lambda=0.0, label_smooth=0.0, input_noise_std=0.0):
    """Run one full pass over ``loader``; train when an optimizer is given.

    Args:
        model: Network to run.
        loader: Iterable of ``(features, targets)`` batches.
        optimizer: If not ``None`` the model is put in train mode and one
            optimizer step is taken per batch; otherwise no updates happen.
        task: ``"cls"`` for classification, anything else for regression.
        out_dim: Output width, used to pick the loss (see ``make_criterion``).
        l2_lambda: Weight of an explicit L2 penalty over trainable parameters.
        label_smooth: Label-smoothing amount for multi-logit classification.
        input_noise_std: Std of Gaussian noise added to the inputs; applied
            only for regression while training.

    Returns:
        ``(mean_loss_per_sample, grad_evals)`` where ``grad_evals`` is the
        number of optimizer steps taken. The reported loss includes the L2
        penalty when ``l2_lambda > 0``.
    """
    is_train = optimizer is not None
    model.train(is_train)
    crit = make_criterion(task, out_dim)
    total_loss, n, grad_evals = 0.0, 0, 0
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        if input_noise_std>0 and task=="reg" and is_train:
            xb = xb + input_noise_std*torch.randn_like(xb)
        if is_train: optimizer.zero_grad(set_to_none=True)
        out = model(xb)
        if task=="cls" and (label_smooth>0) and (not USE_BCE_FOR_BINARY or out_dim!=1):
            # Manual label smoothing: the true class keeps (1 - label_smooth),
            # the remainder is spread evenly over the other ncls-1 classes.
            ncls = out.size(1)
            y_one = torch.zeros((yb.size(0), ncls), device=out.device).scatter_(1, yb.view(-1,1), 1.0)
            y_s = (1.0 - label_smooth)*y_one + label_smooth/(ncls-1)*(1.0 - y_one)
            loss = -(y_s * torch.log_softmax(out, dim=1)).sum(dim=1).mean()
        else:
            # BCE for the single-logit classifier, otherwise the task criterion
            loss = (nn.BCEWithLogitsLoss()(out.squeeze(1), yb.float()) if (task=="cls" and USE_BCE_FOR_BINARY and out_dim==1)
                    else crit(out if task=="cls" else out.squeeze(-1), yb))
        if l2_lambda>0:
            l2 = sum((p**2).sum() for p in model.parameters() if p.requires_grad)
            loss = loss + l2_lambda*l2
        if is_train:
            loss.backward(); optimizer.step(); grad_evals += 1
        # Weight each batch loss by its size so the average is per sample
        total_loss += float(loss.item())*xb.size(0); n += xb.size(0)
    return total_loss/max(n,1), grad_evals

# ---------- (6) Optimizer ablations (identical budgets & threshold ℓ) ----------
class AdamNoBias(torch.optim.Optimizer):
    """Adam-style optimizer that skips the bias-correction terms.

    Keeps exponential moving averages ``m`` (gradient) and ``v`` (squared
    gradient) per parameter and updates with ``m / (sqrt(v) + eps)``. When
    ``weight_decay`` is non-zero, ``weight_decay * p`` is added to the update
    direction before it is scaled by the learning rate.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9,0.999), eps=1e-8, weight_decay=0.0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one update; returns the closure's loss, or ``None`` without a closure."""
        loss = None
        if closure is not None:
            # step() runs under no_grad, but the closure has to rebuild the
            # graph and call backward(), so gradients must be re-enabled.
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            lr = group["lr"]
            beta1, beta2 = group["betas"]
            eps = group["eps"]
            wd = group["weight_decay"]
            for p in group["params"]:
                if p.grad is None:
                    continue
                g = p.grad
                st = self.state[p]
                if len(st) == 0:
                    # Lazy state init on the first step for this parameter
                    st["m"] = torch.zeros_like(p)
                    st["v"] = torch.zeros_like(p)
                m, v = st["m"], st["v"]
                m.mul_(beta1).add_(g, alpha=1.0-beta1)
                v.mul_(beta2).addcmul_(g, g, value=1.0-beta2)
                step = m/(v.sqrt()+eps)  # no bias correction
                if wd != 0.0:
                    step = step + wd*p
                p.add_(step, alpha=-lr)
        return loss

def make_optimizer(params, kind, lr, beta1=0.9, beta2=0.999, wd=0.0):
    """Build the optimizer named ``kind`` over the trainable entries of ``params``.

    Supported kinds: sgd, momentum, nesterov, adam, adam_nobias,
    adam_beta1zero, adamw. ``wd`` is used by "adamw" only.

    Raises:
        ValueError: For an unknown ``kind``.
    """
    trainable = [p for p in params if p.requires_grad]
    adam_betas = (beta1, beta2)

    # Lazy factories: only the selected optimizer is ever constructed
    factories = {
        "sgd": lambda: torch.optim.SGD(trainable, lr=lr, momentum=0.0, nesterov=False),
        "momentum": lambda: torch.optim.SGD(trainable, lr=lr, momentum=0.9, nesterov=False),
        "nesterov": lambda: torch.optim.SGD(trainable, lr=lr, momentum=0.9, nesterov=True),
        "adam": lambda: torch.optim.Adam(trainable, lr=lr, betas=adam_betas),
        "adam_nobias": lambda: AdamNoBias(trainable, lr=lr, betas=adam_betas),
        "adam_beta1zero": lambda: torch.optim.Adam(trainable, lr=lr, betas=(0.0, beta2)),
        "adamw": lambda: torch.optim.AdamW(trainable, lr=lr, betas=adam_betas, weight_decay=wd),
    }
    factory = factories.get(kind) if isinstance(kind, str) else None
    if factory is None:
        raise ValueError(kind)
    return factory()

def train_to_budget(model, train_loader, val_loader, task, out_dim,
                    max_updates=200, L_threshold=None, tag="run", opt_kind="adam",
                    lr=1e-3, beta1=0.9, beta2=0.999, wd=0.0, label_smooth=0.0, input_noise_std=0.0):
    """Train a copy of ``model`` until at least ``max_updates`` optimizer steps are done.

    The budget is checked between epochs, and every epoch consumes the whole
    training loader, so the run can exceed ``max_updates`` by up to one
    epoch's worth of steps. The validation set is evaluated after each epoch
    and the trajectory is written to ``LOG_DIR/<tag>.csv``.

    Args:
        model: Template network; it is deep-copied and never modified.
        train_loader, val_loader: Batch iterables for training/validation.
        task, out_dim: Forwarded to ``run_epoch`` / ``eval_metrics``.
        max_updates: Minimum number of optimizer steps to perform.
        L_threshold: Optional validation-loss target; the first time it is
            reached the step count and elapsed seconds are recorded.
        tag: Base name of the CSV log.
        opt_kind, lr, beta1, beta2, wd: Forwarded to ``make_optimizer``.
        label_smooth, input_noise_std: Forwarded to ``run_epoch``.

    Returns:
        Dict with ``best_val`` (lowest validation loss seen), ``reached_L``
        (``None`` or ``{"grad_evals", "time_sec"}``) and ``log`` (CSV path).
    """
    import copy, time as _time
    m = copy.deepcopy(model).to(DEVICE)
    for p in m.parameters():
        p.requires_grad = True  # Part 2 trains full net
    opt = make_optimizer(m.parameters(), opt_kind, lr, beta1, beta2, wd)
    grad_total, best_val, reached = 0, float("inf"), None
    t0 = _time.time()
    hist = []
    while grad_total < max_updates:
        _, ge = run_epoch(m, train_loader, opt, task, out_dim, l2_lambda=0.0,
                          label_smooth=label_smooth, input_noise_std=input_noise_std)
        if ge == 0:
            # An empty training loader makes no progress towards the budget;
            # without this guard the loop would never terminate.
            print(f"[OL] {tag}: training loader produced no batches; stopping early")
            break
        grad_total += ge
        vaL, extra = eval_metrics(m, val_loader, task, out_dim)
        best_val = min(best_val, vaL)
        if (L_threshold is not None) and (reached is None) and (vaL <= L_threshold):
            reached = {"grad_evals": grad_total, "time_sec": _time.time()-t0}
        hist.append({"grad_evals": grad_total, "val_loss": vaL, **extra})
    df = pd.DataFrame(hist)
    logf = os.path.join(LOG_DIR, f"{tag}.csv")
    df.to_csv(logf, index=False)
    return {"best_val": best_val, "reached_L": reached, "log": logf}

# Validation-loss thresholds used for the "reached L" measurement
L_HOTEL = 0.20
L_ACC   = 500.0

def part2_ablation(model, loaders, task, out_dim, base_lr, base_beta1=0.9, base_beta2=0.999, variants=None):
    """Train ``model`` once per optimizer variant under the same update budget.

    Args:
        model: Template network handed to ``train_to_budget`` for every variant.
        loaders: ``(train, val, test)`` loaders; the test loader is unused here.
        task: ``"cls"`` (threshold ``L_HOTEL``) or regression (``L_ACC``).
        out_dim: Output width of the network.
        base_lr, base_beta1, base_beta2: Shared optimizer settings.
        variants: Sequence of ``(name, extra_kwargs)``; defaults to ``OPT_VARIANTS``.

    Returns:
        One result dict per variant (``train_to_budget`` output plus
        ``variant``, ``time_sec`` and ``peak_mb``). A summary CSV is written to
        ``LOG_DIR`` and, when an "adam" variant ran, its loss-vs-compute plot
        is produced from that run's own log.
    """
    tr, va, _ = loaders
    variants = variants or OPT_VARIANTS
    Lth = L_HOTEL if task=="cls" else L_ACC
    results=[]
    for name, kw in variants:
        tag = f"opt_{name}_{task}"
        # ---- profile this optimizer run ----
        (r, dt, peak) = profile_call(f"Part2::{task}::{name}",
                                     train_to_budget, model, tr, va, task, out_dim,
                                     BUDGET_UPDATES, L_threshold=Lth, tag=tag,
                                     opt_kind=name, lr=base_lr, beta1=base_beta1, beta2=base_beta2, **kw)
        r["variant"]=name; r["time_sec"]=dt; r["peak_mb"]=peak
        results.append(r)
        print(f"[OL/Part2] {name}: best_val={r['best_val']:.4f} reached_L={r['reached_L']} time={dt:.2f}s peakMB={peak}")
    # save summary with time & memory
    pd.DataFrame([{k:v for k,v in r.items() if k!='log'} for r in results]).to_csv(
        os.path.join(LOG_DIR, f"ablation_summary_{task}.csv"), index=False)
    # Plot the Adam run of THIS task from the log it actually produced, rather
    # than always reading the Hotel/cls log (which may be stale or missing when
    # task != "cls" or when "adam" is not among the variants).
    adam_log = next((r["log"] for r in results if r["variant"] == "adam"), None)
    if adam_log is not None:
        dataset = "Hotel" if task=="cls" else "Accidents"
        plot_loss_vs_compute(adam_log,
                             f"{dataset}: Adam loss vs compute",
                             f"loss_vs_compute_{dataset.lower()}_adam.png")
    return results

def sensitivity_heatmap(model, loaders, task, out_dim, alphas=(1e-4,3e-4,1e-3), betas=(0.5,0.9,0.99), which="b1"):
    """Grid-search Adam's learning rate against one beta and save a heatmap.

    ``which == "b1"`` sweeps beta1 (beta2 fixed at 0.999); any other value
    sweeps beta2 (beta1 fixed at 0.9). Each cell trains for half of
    ``BUDGET_UPDATES``. In FAST_MODE nothing runs and an empty DataFrame is
    returned.

    Returns:
        DataFrame with columns ``alpha``, ``beta`` and ``best`` (best
        validation loss of that run).
    """
    if FAST_MODE:
        print("[OL/Part2] Heatmaps disabled in FAST_MODE")
        return pd.DataFrame()

    tr, va, _ = loaders
    sweeping_beta1 = (which == "b1")
    rec = []
    for a in alphas:
        for b in betas:
            if sweeping_beta1:
                b1, b2 = b, 0.999
            else:
                b1, b2 = 0.9, b
            run, _, _ = profile_call(
                f"Part2::heatmap::{task}::{which}::a{a}_b{b}",
                train_to_budget, model, tr, va, task, out_dim,
                max_updates=BUDGET_UPDATES//2, L_threshold=None,
                tag=f"heat_{task}_{which}_a{a}_b{b}",
                opt_kind="adam", lr=a, beta1=b1, beta2=b2,
            )
            rec.append({"alpha": a, "beta": b, "best": run["best_val"]})

    df = pd.DataFrame(rec)
    # Rows = beta values, columns = learning rates
    grid = df.pivot(index="beta", columns="alpha", values="best")

    plt.figure()
    plt.imshow(grid.values, aspect="auto")
    plt.xticks(range(len(grid.columns)), grid.columns)
    plt.yticks(range(len(grid.index)), grid.index)
    plt.xlabel("alpha (lr)")
    plt.ylabel("beta" + ("1" if sweeping_beta1 else "2"))
    plt.colorbar(label="validation loss")
    plt.title(f"{task.upper()} Adam sensitivity ({which})")
    fn = os.path.join(FIG_DIR, f"heatmap_{task}_{which}.png")
    plt.tight_layout()
    plt.savefig(fn, dpi=160)
    plt.close()
    print("[OL/Part2] Saved heatmap ->", fn)
    return df

def stability_band(model, loaders, task, out_dim, seeds=(0,1,2), opt_kind="adam", lr=1e-3):
    """Repeat one training run per seed and plot median/IQR of validation loss.

    In FAST_MODE nothing runs and an empty DataFrame is returned.

    Returns:
        The concatenated per-seed training logs with an added ``seed`` column.
    """
    if FAST_MODE:
        print("[OL/Part2] Stability bands disabled in FAST_MODE")
        return pd.DataFrame()

    import copy
    tr, va, _ = loaders
    runs = []
    for sd in seeds:
        set_seed(sd)
        clone = copy.deepcopy(model).to(DEVICE)
        result, elapsed, peak = profile_call(
            f"Part2::stability::{task}::{opt_kind}::seed{sd}",
            train_to_budget, clone, tr, va, task, out_dim,
            BUDGET_UPDATES, L_threshold=None,
            tag=f"stab_{task}_{opt_kind}_seed{sd}", opt_kind=opt_kind, lr=lr,
        )
        log_df = pd.read_csv(result["log"])
        log_df["seed"] = sd
        runs.append(log_df)
        print(f"[OL/Part2] seed {sd} time={elapsed:.2f}s peakMB={peak}")

    D = pd.concat(runs, axis=0)
    # Aggregate across seeds at every recorded update count
    by_step = D.groupby("grad_evals")["val_loss"]
    median = by_step.median()
    xs = median.index.values
    med = median.values
    q25 = by_step.quantile(0.25).values
    q75 = by_step.quantile(0.75).values

    plt.figure()
    plt.plot(xs, med, label="median")
    plt.fill_between(xs, q25, q75, alpha=0.3, label="IQR")
    plt.xlabel("# grad evals")
    plt.ylabel("validation loss")
    plt.title(f"{task.upper()} {opt_kind} stability")
    plt.legend()
    fn = os.path.join(FIG_DIR, f"stability_{task}_{opt_kind}.png")
    plt.tight_layout()
    plt.savefig(fn, dpi=160)
    plt.close()
    print("[OL/Part2] Saved stability band ->", fn)
    return D

# ---------- (7) RO hygiene (Part 1) ----------
def last_linear(model):
    """Return the final ``nn.Linear`` layer of ``model``.

    Raises:
        RuntimeError: If the model contains no ``nn.Linear`` layer.
    """
    found = linear_layers(model)
    if len(found) == 0:
        raise RuntimeError("No nn.Linear layers found.")
    return found[-1]

def pack_last(last_layer):
    """Flatten a layer's weight and bias into one NumPy vector.

    Returns:
        ``(vector, weight_shape)`` — the weights (row-major) followed by the
        bias, as an independent copy, plus the original weight shape.
    """
    def _snapshot(tensor):
        # Detached CPU copy so later edits never touch the live parameters
        return tensor.detach().cpu().numpy().copy()

    W = _snapshot(last_layer.weight)
    b = _snapshot(last_layer.bias)
    flat = np.concatenate([W.ravel(), b.ravel()])
    return flat, W.shape

def unpack_to_last(vec, last_layer, Wshape):
    """Load a flat vector (weights first, then bias) into ``last_layer``.

    ``Wshape`` is the ``(out, in)`` weight shape; the first ``out*in`` entries
    become the weight matrix and the rest the bias. Values are moved to the
    layer's device and cast to the parameters' dtype.
    """
    n_out, n_in = Wshape
    split = n_out * n_in
    weight_np = vec[:split].reshape(n_out, n_in)
    bias_np = vec[split:]

    weight, bias = last_layer.weight, last_layer.bias
    weight.data = torch.from_numpy(weight_np).to(weight.device).type_as(weight)
    bias.data = torch.from_numpy(bias_np).to(bias.device).type_as(bias)

@torch.no_grad()
def validation_objective(model, val_loader, task, out_dim):
    """Return the validation loss of ``model`` as a plain float.

    One call is one full pass over ``val_loader`` and counts as a single
    function evaluation for the randomized-optimization routines.
    """
    model.eval()
    val_loss = eval_metrics(model, val_loader, task, out_dim)[0]
    return float(val_loss)

def RHC(f, x0, step=0.05, restarts=5, budget=2000):
    """Randomized hill climbing with restarts, minimizing ``f``.

    The first climb starts at ``x0``; every restart starts from ``x0`` plus
    Gaussian noise (std 0.1). A climb proposes Gaussian steps (std ``step``)
    in rounds of 20 and stops after a round without improvement.

    Args:
        f: Objective taking a NumPy vector and returning a comparable scalar.
        x0: Starting point (NumPy array).
        step: Standard deviation of the proposal noise.
        restarts: Number of restarts after the initial climb.
        budget: Maximum number of calls to ``f``; never exceeded.

    Returns:
        ``(best_x, best_f, history)`` where ``history`` holds the initial
        value followed by every accepted improvement, in order.
    """
    best_x, best_f = x0.copy(), f(x0)
    evals = 1
    hist = [best_f]
    for r in range(restarts + 1):
        if r == 0:
            # Reuse the evaluation of x0 instead of paying for it twice
            x, fx = x0.copy(), best_f
        else:
            if evals >= budget:
                break  # budget spent: starting another climb would overshoot it
            x = x0 + 0.1*np.random.randn(*x0.shape)
            fx = f(x)
            evals += 1
        improved = True
        while evals < budget and improved:
            improved = False
            for _ in range(20):
                c = x + step*np.random.randn(*x.shape)
                fc = f(c)
                evals += 1
                if fc < fx:
                    x, fx = c, fc
                    improved = True
                    hist.append(fx)
                if evals >= budget:
                    break
        if fx < best_f:
            best_x, best_f = x, fx
    return best_x, best_f, np.array(hist)

def SA(f, x0, T0=1.0, cooling=0.997, step=0.05, budget=4000):
    """Simulated annealing that minimizes ``f`` within ``budget`` evaluations.

    Proposals are Gaussian perturbations (std ``step``). Improvements are
    always accepted; worse candidates are accepted with probability
    ``exp((fx - fc) / T)``, and ``T`` is multiplied by ``cooling`` each step.

    Returns:
        ``(x, fx, history)`` — the final state (not necessarily the best one
        visited), its value, and the current value after every step.
    """
    state, state_f = x0.copy(), f(x0)
    evals = 1
    trace = [state_f]
    temp = T0
    while evals < budget:
        cand = state + step*np.random.randn(*state.shape)
        cand_f = f(cand)
        evals += 1
        accept = cand_f < state_f
        if not accept:
            # Uphill move: draw the acceptance coin only when it is needed
            accept = np.random.rand() < np.exp((state_f - cand_f)/max(temp, 1e-8))
        if accept:
            state, state_f = cand, cand_f
        trace.append(state_f)
        temp *= cooling
    return state, state_f, np.array(trace)

def GA(f, x0, pop=40, elites=2, cx=0.7, mut=0.2, step=0.05, budget=8000):
    """Simple real-valued genetic algorithm (minimisation).

    Args:
        f: objective, called as ``f(individual)`` with a 1-D ndarray.
        x0: starting point; flattened to 1-D and never modified.
        pop: population size (must be at least 2).
        elites: number of best individuals copied unchanged to the next
            generation (``0 <= elites <= pop``).
        cx: probability of producing a child by one-point crossover of two
            parents drawn from the better half; otherwise a random
            individual is cloned.
        mut: probability of adding Gaussian noise to a child.
        step: standard deviation of the mutation noise.
        budget: maximum number of calls to ``f``.  The initial population is
            always evaluated; further generations run only while a whole
            generation still fits in the budget.

    Returns:
        ``(best, best_f, hist)`` where ``hist`` has one entry per generation
        (i.e. per ``pop`` evaluations) holding that generation's best value.

    Raises:
        ValueError: for an empty ``x0`` or inconsistent ``pop``/``elites``.
    """
    base = np.asarray(x0, dtype=float).ravel()
    dim = base.size
    if dim < 1:
        raise ValueError("GA requires a non-empty x0")
    if pop < 2:
        raise ValueError("GA requires pop >= 2")
    if not 0 <= elites <= pop:
        raise ValueError("GA requires 0 <= elites <= pop")

    P = base + 0.1 * np.random.randn(pop, dim)
    # Keep the unperturbed start in the population so that, with elitism,
    # the result can never be worse than the point the search started from.
    P[0] = base
    fit = np.array([f(ind) for ind in P])
    evals = pop
    best_hist = [fit.min()]

    # Only start a generation that can be fully evaluated within the budget.
    while evals + pop <= budget:
        order = np.argsort(fit)
        P, fit = P[order], fit[order]
        new = [P[i] for i in range(elites)]
        while len(new) < pop:
            if np.random.rand() < cx:
                a, b = P[np.random.randint(0, pop // 2, 2)]
                cut = np.random.randint(0, dim)
                child = np.concatenate([a[:cut], b[cut:]])
            else:
                child = P[np.random.randint(0, pop)]
            if np.random.rand() < mut:
                child = child + step * np.random.randn(dim)
            new.append(child)
        P = np.array(new)
        fit = np.array([f(ind) for ind in P])
        evals += pop
        best_hist.append(fit.min())

    winner = fit.argmin()
    return P[winner].copy(), fit[winner], np.array(best_hist)

def run_part1_RO(model, loaders, task, out_dim, base_tag="dataset", k_unfrozen=1, ro_cfg=None):
    """Run RHC, SA and GA on the last linear layer and report the results.

    A deep copy of *model* is used, so the caller's model is left untouched.
    The objective is the validation loss (``loaders[1]``) obtained after
    writing a candidate vector into the layer returned by ``last_linear``.

    Args:
        model: trained network to copy.
        loaders: sequence whose element ``1`` is the validation loader.
        task: ``"cls"`` selects the ``L_HOTEL`` threshold, anything else
            ``L_ACC``.
        out_dim: forwarded to ``validation_objective``.
        base_tag: label used in profile tags and output file names.
        k_unfrozen: forwarded to ``freeze_all_but_last_k``; the search itself
            only perturbs the single layer returned by ``last_linear``.
        ro_cfg: dict with keys ``"RHC"``, ``"SA"``, ``"GA"`` (budgets) and
            ``"GA_pop"``; defaults to the module-level ``RO_CFG``.

    Returns:
        The summary ``DataFrame`` that is also written to
        ``ro_summary_<base_tag>.csv``.

    Side effects: writes the summary CSV to ``LOG_DIR`` and the progress
    figure ``ro_progress_<base_tag>.png`` to ``FIG_DIR``.
    """
    import copy
    ro_cfg = ro_cfg or RO_CFG
    ga_pop = ro_cfg["GA_pop"]  # read up front so a bad config fails fast
    m = copy.deepcopy(model).to(DEVICE)
    tparams = freeze_all_but_last_k(m, k=k_unfrozen)
    assert tparams <= 50_000, f"RO trainable params {tparams} exceed ~50k cap"
    m.eval()

    last = last_linear(m)
    x0, Wshape = pack_last(last)

    def f(vec):
        # astype() hands the layer its own float32 copy of the candidate.
        with torch.no_grad():
            unpack_to_last(vec.astype(np.float32), last, Wshape)
            return validation_objective(m, loaders[1], task, out_dim)

    # Profile each RO algo. Positional arguments after x0:
    #   RHC: step, restarts, budget
    #   SA : T0, cooling, step, budget
    #   GA : pop, elites, cx, mut, step, budget
    (ret_rhc, t_rhc, _mb_rhc) = profile_call(f"Part1::{base_tag}::RHC",
                                             RHC, f, x0, 0.05, (3 if FAST_MODE else 5), ro_cfg["RHC"])
    (ret_sa, t_sa, _mb_sa) = profile_call(f"Part1::{base_tag}::SA",
                                          SA, f, x0, 1.0, 0.997, 0.05, ro_cfg["SA"])
    (ret_ga, t_ga, _mb_ga) = profile_call(f"Part1::{base_tag}::GA",
                                          GA, f, x0, ga_pop, 2, 0.7, 0.2, 0.05, ro_cfg["GA"])

    # (algo, history, objective calls represented by one history entry, seconds)
    # RHC/SA histories are used one entry per x-unit.  GA appends one entry
    # per generation, each costing ``GA_pop`` objective calls, so its index
    # is scaled by the population size to put all curves on the same axis.
    runs = [
        ("RHC", np.asarray(ret_rhc[2], dtype=float), 1, t_rhc),
        ("SA", np.asarray(ret_sa[2], dtype=float), 1, t_sa),
        ("GA", np.asarray(ret_ga[2], dtype=float), ga_pop, t_ga),
    ]

    def summarize(hist, algo, Lth, evals_per_entry=1):
        """Best value and number of evaluations needed to reach ``Lth``."""
        cummin = np.minimum.accumulate(hist)
        hits = np.flatnonzero(cummin <= Lth)
        # Entry i is reached after (i + 1) * evals_per_entry objective calls.
        evals_to_L = int((hits[0] + 1) * evals_per_entry) if hits.size else None
        return {"algo": algo, "best_val": float(np.min(hist)),
                "reached_L": evals_to_L is not None, "evals_to_L": evals_to_L}

    Lth = L_HOTEL if task == "cls" else L_ACC
    summary = pd.DataFrame([summarize(h, name, Lth, per) for name, h, per, _ in runs])
    summary_path = os.path.join(LOG_DIR, f"ro_summary_{base_tag}.csv")
    summary.to_csv(summary_path, index=False)
    print("[OL/Part1] Saved RO summary ->", summary_path)

    plt.figure()
    for name, h, per, secs in runs:
        evals_axis = (np.arange(len(h)) + 1) * per
        plt.plot(evals_axis, np.minimum.accumulate(h), label=f"{name} ({secs:.1f}s)")
    plt.xlabel("function evaluations")
    plt.ylabel("best-so-far validation loss")
    plt.title(f"RO progress ({base_tag})"); plt.legend(); plt.tight_layout()
    fn = os.path.join(FIG_DIR, f"ro_progress_{base_tag}.png"); plt.savefig(fn, dpi=160); plt.close()
    print("[OL/Part1] Saved RO curves ->", fn)
    return summary

# ---------- (8) Regularization study (Adam only) ----------
def part3_regularization(model, loaders, task, out_dim, adam_lr, adam_beta1=0.9, adam_beta2=0.999):
    """Regularization study with Adam: baseline, single regularizers, combo.

    Every configuration trains a fresh deep copy of *model* until
    ``BUDGET_UPDATES`` gradient evaluations are used (or early stopping
    triggers) and is scored by its best validation loss.

    Args:
        model: network to copy for each configuration.
        loaders: ``(train, val, _)``; the third element is unused here.
        task: ``"cls"`` or ``"reg"``; selects which regularizers are tried.
        out_dim: forwarded to ``run_epoch`` / ``eval_metrics``.
        adam_lr, adam_beta1, adam_beta2: Adam hyper-parameters.

    Returns:
        ``(sdf, bars)``: ``sdf`` is the table of single-regularizer runs
        sorted by ``best_val``; ``bars`` holds ``baseline``, ``best_single``
        (the top row as a dict) and ``combo``.

    Side effects: one CSV per run plus ``reg_single_<task>.csv`` and
    ``reg_combo_<task>.json`` in ``LOG_DIR``; a bar plot in ``FIG_DIR``.
    """
    import copy
    tr, va, _ = loaders

    def train_cfg(tag, l2=0.0, es_patience=None, drop_p=None, label_smooth=0.0, input_noise=0.0):
        """Train one configuration and return its best validation loss."""
        net = copy.deepcopy(model).to(DEVICE)
        if drop_p is not None:
            # Rebuild the MLP with dropout and carry the trained weights over.
            linears = [mod for mod in net.modules() if isinstance(mod, nn.Linear)]
            in_dim = linears[0].in_features
            outs = [lin.out_features for lin in linears[:-1]]
            out_dim_loc = linears[-1].out_features
            rebuilt = MLP(in_dim, hidden=tuple(outs), out_dim=out_dim_loc, dropout_p=drop_p).to(DEVICE)
            loaded = rebuilt.load_state_dict(net.state_dict(), strict=False)
            # strict=False hides key mismatches; surface them, because a
            # partially loaded model would make the dropout runs incomparable.
            missing = list(getattr(loaded, "missing_keys", ()))
            unexpected = list(getattr(loaded, "unexpected_keys", ()))
            if missing or unexpected:
                print(f"[OL/Part3] WARNING ({tag}): dropout rebuild did not load all weights;",
                      "missing:", missing, "unexpected:", unexpected)
            net = rebuilt
        opt = make_optimizer(net.parameters(), "adam", adam_lr, adam_beta1, adam_beta2, wd=0.0)

        best = float("inf"); ge_total = 0; hist = []; patience_left = es_patience
        while ge_total < BUDGET_UPDATES:
            trL, ge = run_epoch(net, tr, opt, task, out_dim, l2_lambda=l2,
                                label_smooth=label_smooth, input_noise_std=input_noise)
            ge_total += ge
            vaL, extra = eval_metrics(net, va, task, out_dim)
            vaL = float(vaL)  # plain float keeps the CSV/JSON outputs serialisable
            prev_best = best
            best = min(best, vaL)
            hist.append({"grad_evals": ge_total, "val_loss": vaL, **extra})
            if ge <= 0:
                # No update was performed, so the budget would never be reached.
                print(f"[OL/Part3] WARNING ({tag}): epoch reported no gradient evaluations; stopping.")
                break
            if es_patience is not None:
                if vaL < prev_best:
                    # An improvement restores the full patience.
                    patience_left = es_patience
                elif vaL > prev_best:
                    patience_left -= 1
                    if patience_left <= 0:
                        break
        df = pd.DataFrame(hist); logf = os.path.join(LOG_DIR, f"{tag}.csv"); df.to_csv(logf, index=False)
        return float(best)

    # run & profile each config
    (baseline, t_base, mb_base) = profile_call(f"Part3::{task}::baseline",
                                               train_cfg, "reg_baseline_"+task)
    single = []
    for wd in [1e-5, 1e-4, 5e-4]:
        (bv, t, mb) = profile_call(f"Part3::{task}::l2::{wd}", train_cfg, f"reg_l2_{wd}_{task}", l2=wd)
        single.append(("l2", wd, bv, t, mb))
    (bv, t, mb) = profile_call(f"Part3::{task}::earlystop::3", train_cfg, f"reg_es3_{task}", es_patience=3)
    single.append(("earlystop", 3, bv, t, mb))
    for p in ([0.1, 0.3] if task == "cls" else [0.05, 0.2]):
        (bv, t, mb) = profile_call(f"Part3::{task}::dropout::{p}", train_cfg, f"reg_dropout_{p}_{task}", drop_p=p)
        single.append(("dropout", p, bv, t, mb))
    if task == "cls":
        for eps in [0.05, 0.10]:
            (bv, t, mb) = profile_call(f"Part3::{task}::label_smooth::{eps}", train_cfg, f"reg_ls_{eps}_{task}", label_smooth=eps)
            single.append(("label_smooth", eps, bv, t, mb))
    else:
        for s in [0.01, 0.05]:
            (bv, t, mb) = profile_call(f"Part3::{task}::input_noise::{s}", train_cfg, f"reg_inoise_{s}_{task}", input_noise=s)
            single.append(("input_noise", s, bv, t, mb))

    sdf = pd.DataFrame(single, columns=["reg","value","best_val","time_sec","peak_mb"]).sort_values("best_val")
    best_reg = sdf.iloc[0].to_dict()

    # Map the winning regularizer onto the matching train_cfg keyword.
    kwarg_for = {
        "l2": ("l2", float),
        "earlystop": ("es_patience", int),
        "dropout": ("drop_p", float),
        "label_smooth": ("label_smooth", float),
        "input_noise": ("input_noise", float),
    }
    combo_kwargs = {}
    if best_reg["reg"] in kwarg_for:
        kw_name, cast = kwarg_for[best_reg["reg"]]
        combo_kwargs[kw_name] = cast(best_reg["value"])
    # The combo adds a task-specific second regularizer unless it already won.
    if task == "cls" and "label_smooth" not in combo_kwargs: combo_kwargs["label_smooth"] = 0.05
    if task == "reg" and "input_noise" not in combo_kwargs: combo_kwargs["input_noise"] = 0.01
    (combo, t_combo, mb_combo) = profile_call(f"Part3::{task}::combo", train_cfg, "reg_combo_"+task, **combo_kwargs)

    plt.figure()
    names = ["Baseline","Best Single","Best Combo"]
    vals  = [baseline, best_reg["best_val"], combo]
    plt.bar(names, vals); plt.ylabel("validation loss"); plt.title(f"{task.upper()} — regularization")
    fn = os.path.join(FIG_DIR, f"regularization_{task}.png"); plt.tight_layout(); plt.savefig(fn, dpi=160); plt.close()
    print("[OL/Part3] Saved regularization bar plot ->", fn)
    sdf.to_csv(os.path.join(LOG_DIR, f"reg_single_{task}.csv"), index=False)
    # also log combo timing
    with open(os.path.join(LOG_DIR, f"reg_combo_{task}.json"), "w", encoding="utf-8") as f:
        json.dump({"best_single":best_reg, "combo_val":combo, "combo_time_sec":t_combo, "combo_peak_mb":mb_combo}, f, indent=2)
    return sdf, {"baseline":baseline, "best_single":best_reg, "combo":combo}

# ---------- (9) Reporting & accounting helpers ----------
def plot_loss_vs_compute(csv_log, title, outfile):
    """Plot validation loss against gradient evaluations from a CSV log.

    Reads *csv_log*, which must provide ``grad_evals`` and ``val_loss``
    columns, and saves the curve as ``FIG_DIR/<outfile>``.  If the file or a
    required column is missing, a message is printed and nothing is plotted.
    """
    if not os.path.exists(csv_log):
        print("[OL] Missing log:", csv_log)
        return

    log = pd.read_csv(csv_log)
    required = {"grad_evals", "val_loss"}
    if not required.issubset(set(log.columns)):
        print("[OL] Missing required columns in", csv_log)
        return

    target = os.path.join(FIG_DIR, outfile)
    plt.figure()
    plt.plot(log["grad_evals"], log["val_loss"])
    plt.xlabel("# gradient evaluations")
    plt.ylabel("validation loss")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(target, dpi=160)
    plt.close()
    print("[OL] Saved ->", target)

# ============================================================
# ============ PART 1 / PART 2 / PART 3 EXECUTION ============
# ============================================================

# PART 1 — Randomized Optimization (RHC/SA/GA)
# Each stage runs only when its loaders and trained model exist as module
# globals; the accidents stages additionally require RUN_ACCIDENTS.
# For the hotel task, out_dim is 1 when USE_BCE_FOR_BINARY is truthy, else 2.
if 'H_loaders' in globals() and 'm_cls' in globals():
    run_part1_RO(m_cls, H_loaders, task="cls", out_dim=(1 if USE_BCE_FOR_BINARY else 2),
                 base_tag="hotel", k_unfrozen=1, ro_cfg=RO_CFG)
if RUN_ACCIDENTS and 'A_loaders' in globals() and 'm_reg' in globals():
    run_part1_RO(m_reg, A_loaders, task="reg", out_dim=1,
                 base_tag="accidents", k_unfrozen=1, ro_cfg=RO_CFG)

# PART 2 — Optimizer ablations (equal budgets; same splits)
# Heatmaps and stability bands are optional (DO_HEATMAPS / DO_STABILITY).
# The heatmap is produced twice: once with its default `which`, once with "b2".
if 'H_loaders' in globals() and 'm_cls' in globals():
    _ab_h = part2_ablation(m_cls, H_loaders, task="cls", out_dim=(1 if USE_BCE_FOR_BINARY else 2), base_lr=1e-3)
    if DO_HEATMAPS:
        _ = sensitivity_heatmap(m_cls, H_loaders, "cls", (1 if USE_BCE_FOR_BINARY else 2))
        _ = sensitivity_heatmap(m_cls, H_loaders, "cls", (1 if USE_BCE_FOR_BINARY else 2), which="b2")
    if DO_STABILITY:
        _ = stability_band(m_cls, H_loaders, "cls", (1 if USE_BCE_FOR_BINARY else 2), seeds=(0,1,2), opt_kind="adam", lr=1e-3)

# Accidents (regression) uses a learning rate of 3e-4 instead of 1e-3.
if RUN_ACCIDENTS and 'A_loaders' in globals() and 'm_reg' in globals():
    _ab_a = part2_ablation(m_reg, A_loaders, task="reg", out_dim=1, base_lr=3e-4)
    if DO_HEATMAPS:
        _ = sensitivity_heatmap(m_reg, A_loaders, "reg", 1)
        _ = sensitivity_heatmap(m_reg, A_loaders, "reg", 1, which="b2")
    if DO_STABILITY:
        _ = stability_band(m_reg, A_loaders, "reg", 1, seeds=(0,1,2), opt_kind="adam", lr=3e-4)

# PART 3 — Regularization study (Adam only; reuse Part-2 Adam LR/betas)
# adam_lr repeats the per-task literals used above (1e-3 hotel, 3e-4 accidents);
# the betas are left at part3_regularization's defaults.
if 'H_loaders' in globals() and 'm_cls' in globals():
    _s_h, _bars_h = part3_regularization(m_cls, H_loaders, "cls", (1 if USE_BCE_FOR_BINARY else 2), adam_lr=1e-3)
if RUN_ACCIDENTS and 'A_loaders' in globals() and 'm_reg' in globals():
    _s_a, _bars_a = part3_regularization(m_reg, A_loaders, "reg", 1, adam_lr=3e-4)
