# Optuna and NNs

In [None]:
# ONE-CELL Optuna workflow for small MLPRegressor (≤20 units/layer, early stopping)
# Artifacts saved to ./optuna_artifacts (RESULTS_DIR; the folder is deleted and recreated on every run)

# --- Imports & setup ---
import os, shutil, json, pickle, datetime as dt, warnings
warnings.filterwarnings("ignore")

# If Optuna isn't installed in your env, uncomment:
# %pip install optuna

import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error

# Seed reused by both train/test splits, the Optuna sampler and every MLP fit.
RANDOM_STATE = 42
# Root folder for everything this run writes (CSVs, plots, JSON, pickles).
RESULTS_DIR = "./optuna_artifacts"

# --- Start from an empty results directory (artifacts of a previous run are deleted) ---
if os.path.exists(RESULTS_DIR):
    shutil.rmtree(RESULTS_DIR)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Record when this run started so its artifacts can be identified later.
run_info_path = os.path.join(RESULTS_DIR, "run_info.json")
run_info = {"run_tag": dt.datetime.now().strftime("%Y%m%d_%H%M%S")}
with open(run_info_path, "w") as f:
    json.dump(run_info, f, indent=2)

# --- Load the California housing frame and carve out an 80/10/10 train/val/test split ---
data = fetch_california_housing(as_frame=True)
frame = data.frame
y = frame["MedHouseVal"]
X = frame.drop(columns=["MedHouseVal"])

# First cut: 80% train, 20% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
# Second cut: halve the held-out 20% into validation and test (10% of the data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=RANDOM_STATE,
)

# --- Standardize features with statistics estimated on the training split only ---
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s, X_val_s, X_test_s = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# --- Optuna objective (minimize external VAL MAE); small nets + early stopping ---
def objective(trial):
    """Train one small MLPRegressor for an Optuna trial and return its validation MAE.

    The trial samples, in this order: the number of hidden layers (1-3), the width
    of each layer (4-20 in steps of 4), the activation, the L2 penalty ``alpha``,
    the initial learning rate and the batch size. The sampling order is kept fixed
    so that a seeded sampler reproduces the same sequence of configurations.

    Args:
        trial: the Optuna trial used to draw hyperparameters.

    Returns:
        Mean absolute error of the fitted model on the scaled validation split
        (``X_val_s`` / ``y_val``); lower is better.
    """
    depth = trial.suggest_int("n_layers", 1, 3)
    widths = []
    for i in range(depth):
        widths.append(trial.suggest_int(f"n_units_l{i+1}", 4, 20, step=4))

    activation = trial.suggest_categorical("activation", ["relu", "tanh"])
    alpha = trial.suggest_float("alpha", 1e-6, 1e-2, log=True)
    lr_init = trial.suggest_float("learning_rate_init", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])

    regressor = MLPRegressor(
        hidden_layer_sizes=tuple(widths),
        activation=activation,
        solver="adam",
        alpha=alpha,
        learning_rate_init=lr_init,
        batch_size=batch_size,
        max_iter=1000,
        # Early stopping holds out 10% of the data passed to fit() (the training
        # split), so the external validation split stays untouched until scoring.
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=20,
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    regressor.fit(X_train_s, y_train)

    val_predictions = regressor.predict(X_val_s)
    return mean_absolute_error(y_val, val_predictions)

# --- Callback to log CSVs/plot after every trial ---
def log_progress_callback(study, trial):
    """Rewrite the progress artifacts in RESULTS_DIR from the study's current trials.

    Intended to be passed in ``study.optimize(..., callbacks=[...])``. Every call
    overwrites three files:

    * ``trials_full.csv``: one row per trial (number, value, state, start and
      completion timestamps) with the sampled parameters appended as columns.
    * ``best_track.csv``: per-trial validation MAE and the running best MAE.
    * ``mae_over_trials.png``: the same two series drawn as lines.

    Args:
        study: the Optuna study being optimized.
        trial: the trial handed to the callback (unused; part of the callback signature).
    """
    df = study.trials_dataframe(attrs=("number","value","state","datetime_start","datetime_complete"))
    params_df = pd.DataFrame([t.params for t in study.trials])
    if len(params_df):
        df = pd.concat([df.reset_index(drop=True), params_df.reset_index(drop=True)], axis=1)
    df.to_csv(os.path.join(RESULTS_DIR, "trials_full.csv"), index=False)

    vals = df["value"].astype(float).values
    # A trial without an objective value (e.g. one that failed) becomes NaN here.
    # np.minimum would carry that NaN through every later "best so far" entry;
    # np.fmin ignores NaN operands, so the running best survives such trials.
    best_so_far = np.fmin.accumulate(vals)
    trial_idx = np.arange(len(vals))
    pd.DataFrame({"trial": trial_idx, "val_mae": vals, "best_mae": best_so_far}) \
      .to_csv(os.path.join(RESULTS_DIR, "best_track.csv"), index=False)

    fig, ax = plt.subplots(figsize=(6.5, 4))
    try:
        ax.plot(trial_idx, vals, marker="o", linewidth=1)
        ax.plot(trial_idx, best_so_far, linewidth=2)
        ax.set_xlabel("Trial")
        ax.set_ylabel("Validation MAE")
        ax.set_title("Optuna: MAE per trial (line = best so far)")
        fig.tight_layout()
        fig.savefig(os.path.join(RESULTS_DIR, "mae_over_trials.png"), dpi=150, bbox_inches="tight")
    finally:
        # Close even when plotting/saving raises, so repeated callbacks do not
        # accumulate open figures.
        plt.close(fig)

# --- Run study (calm search) ---
# Seeded TPE sampler; the objective is a validation MAE, hence "minimize".
sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(
    objective,
    n_trials=25,
    callbacks=[log_progress_callback],  # refreshes the CSVs/plot after each trial
    show_progress_bar=True,
)

# Save best params
best_params_path = os.path.join(RESULTS_DIR, "best_params.json")
with open(best_params_path, "w") as f:
    json.dump(study.best_params, f, indent=2)

# --- Retrain best model on TRAIN+VAL; evaluate on TEST; save metrics/plots/model ---
best = study.best_params
# best_params holds one "n_units_l<k>" entry per sampled layer; rebuild the size tuple.
hidden_sizes = tuple(best[f"n_units_l{i+1}"] for i in range(best["n_layers"]))
final_params = {
    "hidden_layer_sizes": hidden_sizes,
    "activation": best["activation"],
    "solver": "adam",
    "alpha": best["alpha"],
    "learning_rate_init": best["learning_rate_init"],
    "batch_size": best["batch_size"],
    # Larger budget than the search trials used (max_iter=1000, n_iter_no_change=20).
    "max_iter": 2000,
    "early_stopping": True,
    "validation_fraction": 0.1,
    "n_iter_no_change": 25,
    "random_state": RANDOM_STATE,
    "shuffle": True,
}

# Refit scaler on TRAIN+VAL for the final model.
# The two splits are stacked once, as a DataFrame, so the scaler is fitted with the
# same feature names it receives in the transform() calls below. Fitting on an
# np.vstack array dropped the names and made every DataFrame transform a
# feature-name mismatch (only hidden by the global warnings filter).
X_trval = pd.concat([X_train, X_val], axis=0)
scaler_final = StandardScaler().fit(X_trval)
X_trval_s = scaler_final.transform(X_trval)
y_trval   = np.concatenate([y_train.values, y_val.values])  # same train-then-val order as X_trval
X_train_sf = scaler_final.transform(X_train)
X_val_sf   = scaler_final.transform(X_val)
X_test_sf  = scaler_final.transform(X_test)

final_model = MLPRegressor(**final_params).fit(X_trval_s, y_trval)

def metrics(y_true, y_pred):
    """Score predictions against ground truth.

    Args:
        y_true: actual target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        Dict with keys ``"R2"``, ``"MAE"`` and ``"MAPE"`` holding the results of
        ``r2_score``, ``mean_absolute_error`` and
        ``mean_absolute_percentage_error`` respectively.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    return {"R2": r2, "MAE": mae, "MAPE": mape}

# Evaluate and save metrics
# NOTE: final_model was fitted on TRAIN+VAL, so only the "test" row is out-of-sample.
eval_splits = [
    ("train", X_train_sf, y_train.values),
    ("val", X_val_sf, y_val.values),
    ("test", X_test_sf, y_test.values),
]
rows = []
for name, Xsplit, ytrue in eval_splits:
    ypred = final_model.predict(Xsplit)
    row = {"split": name}
    row.update(metrics(ytrue, ypred))
    rows.append(row)

metrics_df = pd.DataFrame(rows).round(4)
metrics_csv_path = os.path.join(RESULTS_DIR, "final_metrics.csv")
metrics_df.to_csv(metrics_csv_path, index=False)
print("Final metrics:\n", metrics_df.to_string(index=False))

# Scatterplot helper
def scatter_with_reference(y_true, y_pred, title, outpath):
    """Save a predicted-vs-actual scatter plot with a y = x reference line.

    Args:
        y_true: actual target values (x axis).
        y_pred: predicted target values (y axis).
        title: title drawn above the axes.
        outpath: path of the image file to write.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_true, y_pred, alpha=0.3, s=10)

    # The diagonal spans the joint range of both series, so a perfect prediction
    # would fall on it anywhere in the plot.
    lower = min(np.min(series) for series in (y_true, y_pred))
    upper = max(np.max(series) for series in (y_true, y_pred))
    ax.plot([lower, upper], [lower, upper], linewidth=1)

    ax.set_xlabel("Actual MedHouseVal")
    ax.set_ylabel("Predicted MedHouseVal")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(outpath, dpi=150, bbox_inches="tight")
    plt.close(fig)

# Save train & test scatterplots
train_predictions = final_model.predict(X_train_sf)
scatter_with_reference(
    y_train.values,
    train_predictions,
    "Predicted vs Actual — Train",
    os.path.join(RESULTS_DIR, "scatter_train.png"),
)
test_predictions = final_model.predict(X_test_sf)
scatter_with_reference(
    y_test.values,
    test_predictions,
    "Predicted vs Actual — Test",
    os.path.join(RESULTS_DIR, "scatter_test.png"),
)

# Persist model & scaler (the scaler is needed to preprocess inputs for the model)
pickled_artifacts = (
    ("final_model.pkl", final_model),
    ("scaler_final.pkl", scaler_final),
)
for pkl_name, artifact in pickled_artifacts:
    with open(os.path.join(RESULTS_DIR, pkl_name), "wb") as f:
        pickle.dump(artifact, f)

print(f"\nArtifacts saved to: {os.path.abspath(RESULTS_DIR)}")


[I 2025-10-08 14:53:52,315] A new study created in memory with name: no-name-2a7ab0d7-f305-4edf-a1a3-5f345cc5c691
  0%|          | 0/25 [00:11<?, ?it/s]

[I 2025-10-08 14:54:04,146] Trial 0 finished with value: 0.36647443102993993 and parameters: {'n_layers': 2, 'n_units_l1': 20, 'n_units_l2': 16, 'activation': 'relu', 'alpha': 4.207053950287936e-06, 'learning_rate_init': 0.00013066739238053285, 'batch_size': 64}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:   8%|▊         | 2/25 [00:23<04:24, 11.49s/it]

[I 2025-10-08 14:54:15,392] Trial 1 finished with value: 0.39011007294976646 and parameters: {'n_layers': 1, 'n_units_l1': 20, 'activation': 'relu', 'alpha': 5.337032762603957e-06, 'learning_rate_init': 0.00023270677083837802, 'batch_size': 128}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  12%|█▏        | 3/25 [00:30<03:28,  9.48s/it]

[I 2025-10-08 14:54:22,446] Trial 2 finished with value: 0.392796156175982 and parameters: {'n_layers': 1, 'n_units_l1': 16, 'activation': 'tanh', 'alpha': 2.9204338471814107e-05, 'learning_rate_init': 0.000816845589476017, 'batch_size': 64}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  16%|█▌        | 4/25 [00:33<02:30,  7.17s/it]

[I 2025-10-08 14:54:26,094] Trial 3 finished with value: 0.39218412450937495 and parameters: {'n_layers': 2, 'n_units_l1': 4, 'n_units_l2': 16, 'activation': 'relu', 'alpha': 0.006245139574743076, 'learning_rate_init': 0.00853618986286683, 'batch_size': 64}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  20%|██        | 5/25 [00:37<02:00,  6.01s/it]

[I 2025-10-08 14:54:30,058] Trial 4 finished with value: 0.37872750198541627 and parameters: {'n_layers': 3, 'n_units_l1': 12, 'n_units_l2': 4, 'n_units_l3': 12, 'activation': 'tanh', 'alpha': 1.0842262717330169e-05, 'learning_rate_init': 0.0021137059440645744, 'batch_size': 256}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  24%|██▍       | 6/25 [00:42<01:48,  5.69s/it]

[I 2025-10-08 14:54:35,125] Trial 5 finished with value: 0.3888342345218851 and parameters: {'n_layers': 1, 'n_units_l1': 20, 'activation': 'tanh', 'alpha': 0.003795853142670641, 'learning_rate_init': 0.0015696396388661157, 'batch_size': 64}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  28%|██▊       | 7/25 [00:47<01:36,  5.37s/it]

[I 2025-10-08 14:54:39,824] Trial 6 finished with value: 0.4150107493437893 and parameters: {'n_layers': 1, 'n_units_l1': 8, 'activation': 'relu', 'alpha': 0.0020651425578959264, 'learning_rate_init': 0.0005170191786366995, 'batch_size': 128}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  32%|███▏      | 8/25 [00:51<01:22,  4.87s/it]

[I 2025-10-08 14:54:43,623] Trial 7 finished with value: 0.38441589726398406 and parameters: {'n_layers': 3, 'n_units_l1': 4, 'n_units_l2': 20, 'n_units_l3': 16, 'activation': 'relu', 'alpha': 0.0018274508859816032, 'learning_rate_init': 0.002592475660475161, 'batch_size': 128}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  36%|███▌      | 9/25 [00:54<01:11,  4.46s/it]

[I 2025-10-08 14:54:47,177] Trial 8 finished with value: 0.4113002413312427 and parameters: {'n_layers': 2, 'n_units_l1': 4, 'n_units_l2': 20, 'activation': 'relu', 'alpha': 1.7956984225677624e-06, 'learning_rate_init': 0.0004187594718900631, 'batch_size': 128}. Best is trial 0 with value: 0.36647443102993993.


Best trial: 0. Best value: 0.366474:  40%|████      | 10/25 [01:06<01:41,  6.79s/it]

[I 2025-10-08 14:54:59,206] Trial 9 finished with value: 0.37012690657257485 and parameters: {'n_layers': 3, 'n_units_l1': 12, 'n_units_l2': 4, 'n_units_l3': 16, 'activation': 'relu', 'alpha': 0.0012130221181165164, 'learning_rate_init': 0.0009718319944817398, 'batch_size': 64}. Best is trial 0 with value: 0.36647443102993993.
