# Support Vector Regression (SVR)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv("qm9_fp_U0.csv")

# Columns:
# 0 = U0 (target)
# 1 = SMILES
# 2+ = Fingerprints

# Target
y = df.iloc[:, 0].values.astype(float)

# Drop SMILES (col 1)
X = df.iloc[:, 2:].values.astype(float)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (129012, 2048)
y shape: (129012,)


In [2]:
# First: 80% / 20%
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True
)

# Split the 20% temp into 10% val + 10% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,   # half of 20% → 10%
    random_state=42,
    shuffle=True
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)


Train: (103209, 2048)
Val: (12901, 2048)
Test: (12902, 2048)


In [5]:
np.save("X_train.npy", X_train)
np.save("X_val.npy", X_val)
np.save("X_test.npy", X_test)

np.save("y_train.npy", y_train)
np.save("y_val.npy", y_val)
np.save("y_test.npy", y_test)


# Subset training
Training on the full set was taking too long.

In [None]:
# svr_multikernel_subset_pca.py
import os, time, json, math
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from itertools import product
from joblib import Parallel, delayed, dump
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

SEED = 42
np.random.seed(SEED)

# ====================== CONFIG ======================
SUBSET_N = 5000                  # subset size for search (5k–10k is good)
USE_PCA_FOR_SEARCH = False        # PCA speeds up the search phase
USE_PCA_FOR_FINAL  = False       # True = keep PCA in the final model; False = refit without PCA
PCA_COMPONENTS = 300             # fixed # comps; typical sweet spot for ECFP2048
PCA_VAR_TARGET = None            # e.g., 0.99 to target 99% explained variance (ignored if PCA_COMPONENTS set)
CACHE_MB = 2000                  # libsvm cache in MB

param_grid_rbf = {
    "kernel":  ["rbf"],
    "C":       [300, 600, 900],         # push higher; 300 looked good
    "epsilon": [0.005, 0.01],           # a bit tighter around best
    "gamma":   ["scale", 1e-3, 3e-4],   # add mid-low between 1e-3 and 1e-4
}

param_grid_linear = {
    "kernel":  ["linear"],
    "C":       [10, 100, 300, 1000],    # cheap to try higher C
    "epsilon": [0.001, 0.01, 0.1],
}

param_grid_sigmoid = {
    "kernel":  ["sigmoid"],
    "C":       [10, 100, 300],          # small bump in capacity
    "epsilon": [0.01, 0.1],
    "gamma":   ["scale", 1e-3],         # keep compact; sigmoid is finicky
}

INCLUDE_POLY = False
param_grid_poly = {
    "kernel":  ["poly"],
    "degree":  [2],
    "C":       [10, 100],
    "epsilon": [0.01],
    "gamma":   ["scale"],
}
# ====================================================

# Assumes these exist in memory already:
# X_train, y_train, X_val, y_val, X_test, y_test

# ---------- Output dirs ----------
out = Path("artifacts_svr"); out.mkdir(exist_ok=True)
logs_dir = out / "logs"; logs_dir.mkdir(exist_ok=True)

# ---------- Build training subset ----------
if SUBSET_N is not None and SUBSET_N < len(X_train):
    idx = np.random.choice(len(X_train), SUBSET_N, replace=False)
    X_train_sub, y_train_sub = X_train[idx], y_train[idx]
else:
    X_train_sub, y_train_sub = X_train, y_train

print(f"Search subset: {len(X_train_sub)} / {len(X_train)}")

# ---------- Helpers ----------
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def param_row_key(params: dict) -> str:
    return json.dumps(params, sort_keys=True)

def append_row(path: Path, row: dict):
    write_header = not path.exists()
    pd.DataFrame([row]).to_csv(path, mode="a", header=write_header, index=False)

def grid_to_param_list(grid: dict):
    keys = list(grid.keys())
    return [dict(zip(keys, vals)) for vals in product(*[grid[k] for k in keys])]

def eval_one(params, X_tr, y_tr, X_va, y_va):
    # Map flat params to SVR kwargs
    svr_kwargs = dict(C=params["C"], epsilon=params["epsilon"], cache_size=CACHE_MB)
    if "kernel" in params: svr_kwargs["kernel"] = params["kernel"]
    if "gamma"  in params: svr_kwargs["gamma"]  = params["gamma"]
    if "degree" in params: svr_kwargs["degree"] = params["degree"]

    model = SVR(**svr_kwargs)
    t0 = time.perf_counter()
    model.fit(X_tr, y_tr)
    pred_tr  = model.predict(X_tr)
    pred_val = model.predict(X_va)
    secs = time.perf_counter() - t0
    return {
        **params,
        "rmse_train": rmse(y_tr, pred_tr),
        "rmse_val": rmse(y_va, pred_val),
        "mae_val": mean_absolute_error(y_va, pred_val),
        "r2_val": r2_score(y_va, pred_val),
        "secs": secs
    }

# ---------- Optional PCA for SEARCH (fit once on subset) ----------
if USE_PCA_FOR_SEARCH:
    # Scale -> PCA on subset; transform subset + val
    scaler_sub = StandardScaler().fit(X_train_sub.astype(np.float32))
    Xtr_s = scaler_sub.transform(X_train_sub.astype(np.float32))
    Xva_s = scaler_sub.transform(X_val.astype(np.float32))

    if PCA_COMPONENTS is not None:
        pca_sub = PCA(n_components=PCA_COMPONENTS, random_state=SEED).fit(Xtr_s)
    else:
        # choose by variance target
        pca_probe = PCA(n_components=min(Xtr_s.shape[0], Xtr_s.shape[1]), random_state=SEED).fit(Xtr_s)
        cume = np.cumsum(pca_probe.explained_variance_ratio_)
        k = int(np.searchsorted(cume, PCA_VAR_TARGET if PCA_VAR_TARGET else 0.99) + 1)
        pca_sub = PCA(n_components=k, random_state=SEED).fit(Xtr_s)
        print(f"[SEARCH PCA] Target variance={PCA_VAR_TARGET or 0.99:.2f}; components={k}; "
              f"explained={cume[k-1]:.4f}")

    X_train_sub_search = pca_sub.transform(Xtr_s)
    X_val_search       = pca_sub.transform(Xva_s)
    print(f"[SEARCH PCA] Using {pca_sub.n_components_} comps; "
          f"explained={pca_sub.explained_variance_ratio_.sum():.4f}")
else:
    X_train_sub_search = X_train_sub.astype(np.float32)
    X_val_search       = X_val.astype(np.float32)

y_train_sub_search = y_train_sub.astype(np.float32)
y_val_search       = y_val.astype(np.float32)

# ---------- Search spaces ----------
search_spaces = {
    "rbf": param_grid_rbf,
    # "linear": param_grid_linear,
    # "sigmoid": param_grid_sigmoid,
}
if INCLUDE_POLY:
    search_spaces["poly"] = param_grid_poly

# ---------- Chunked, parallel, resumable search ----------
best_per_kernel = {}
combined_csv = out / "val_search_results_subset_streaming_pca.csv"
combined_csv = out / "val_search_results_subset_streaming_rbf.csv"

# start fresh for this run
if combined_csv.exists():
    combined_csv.unlink()


n_jobs = os.cpu_count() or 4
BATCH = max(2, n_jobs * 2)

for kernel_name, grid in search_spaces.items():
    print(f"\n--- Searching kernel: {kernel_name} ---")
    kernel_csv = logs_dir / f"{kernel_name}_stream_pca.csv"

    plist = grid_to_param_list(grid)
    done_keys = set()
    if kernel_csv.exists():
        try:
            prev = pd.read_csv(kernel_csv)
            for _, row in prev.iterrows():
                params = {k: row[k] for k in grid.keys() if k in row}
                done_keys.add(param_row_key(params))
        except Exception:
            pass
    todo = [p for p in plist if param_row_key(p) not in done_keys]
    total = len(plist); remaining = len(todo)
    print(f"Total combos: {total} | Already done: {total-remaining} | Remaining: {remaining}")

    for start in range(0, remaining, BATCH):
        chunk = todo[start:start+BATCH]
        print(f"[{kernel_name}] Batch {start//BATCH + 1}/{math.ceil(remaining/BATCH)} "
              f"({len(chunk)} configs)...")

        results = Parallel(n_jobs=n_jobs, backend="loky", verbose=10, batch_size=1)(
            delayed(eval_one)(
                p,
                X_train_sub_search, y_train_sub_search,
                X_val_search,       y_val_search
            ) for p in chunk
        )

        for row in results:
            row_stream = dict(row); row_stream["kernel_group"] = kernel_name
            append_row(kernel_csv, kernel_row := row_stream)
            append_row(combined_csv, kernel_row)
            print(f"[{kernel_name}] {row.get('kernel')}, C={row.get('C')}, "
                  f"eps={row.get('epsilon')}, gamma={row.get('gamma','-')} | "
                  f"val RMSE={row['rmse_val']:.6f} | {row['secs']:.1f}s")

    df_k = pd.read_csv(kernel_csv).sort_values("rmse_val").reset_index(drop=True)
    best_row = df_k.iloc[0].to_dict()
    best_params = {k: best_row[k] for k in grid.keys()}
    best_per_kernel[kernel_name] = {"best_row": best_row, "best_params": best_params}
    print(f"Best {kernel_name}: val RMSE={best_row['rmse_val']:.6f} | params={best_params}")

# ---------- Best overall ----------
all_df = pd.read_csv(combined_csv)

# drop any junk rows without kernel_group
all_df = all_df[all_df["kernel_group"].notna()]
all_df = all_df[all_df["kernel_group"].isin(search_spaces.keys())]
all_df = all_df.sort_values("rmse_val").reset_index(drop=True)

if len(all_df) == 0:
    raise RuntimeError("No valid rows in combined_csv after filtering; check logs/ directory.")

best_overall_row = all_df.iloc[0].to_dict()
best_kernel_name = best_overall_row["kernel_group"]
best_overall_params = {k: best_overall_row[k] for k in search_spaces[best_kernel_name].keys()}

print("\n=== Best per kernel (subset+PCA search) ===")
for k, v in best_per_kernel.items():
    print(f"[{k}] RMSE_val={v['best_row']['rmse_val']:.6f} | params={v['best_params']}")
print("\n=== Best overall (subset+PCA search) ===")
print(f"Kernel: {best_kernel_name}")
print("Params:", best_overall_params)

# ---------- Final train on full Train+Val ----------
from joblib import dump
X_train_full = np.vstack([X_train, X_val]).astype(np.float32)
y_train_full = np.concatenate([y_train, y_val]).astype(np.float32)

if USE_PCA_FOR_FINAL:
    # Fit scaler+PCA on full Train+Val, then SVR on reduced space
    scaler_full = StandardScaler().fit(X_train_full)
    Xtf = scaler_full.transform(X_train_full)
    if PCA_COMPONENTS is not None:
        pca_full = PCA(n_components=PCA_COMPONENTS, random_state=SEED).fit(Xtf)
    else:
        pca_probe = PCA(n_components=min(Xtf.shape), random_state=SEED).fit(Xtf)
        cume = np.cumsum(pca_probe.explained_variance_ratio_)
        k = int(np.searchsorted(cume, PCA_VAR_TARGET if PCA_VAR_TARGET else 0.99) + 1)
        pca_full = PCA(n_components=k, random_state=SEED).fit(Xtf)
        print(f"[FINAL PCA] Target variance={PCA_VAR_TARGET or 0.99:.2f}; components={k}; "
              f"explained={cume[k-1]:.4f}")
    Xtf_red = pca_full.transform(Xtf)

    svr_kwargs = dict(C=best_overall_params["C"], epsilon=best_overall_params["epsilon"],
                      kernel=best_overall_params["kernel"], cache_size=CACHE_MB)
    if "gamma"  in best_overall_params: svr_kwargs["gamma"]  = best_overall_params["gamma"]
    if "degree" in best_overall_params: svr_kwargs["degree"] = best_overall_params["degree"]

    final_model = SVR(**svr_kwargs)
    t0 = time.perf_counter()
    final_model.fit(Xtf_red, y_train_full)
    print(f"\nFinal fit (WITH PCA) in {(time.perf_counter()-t0)/60:.2f} min.")

    # Test
    X_test_f  = scaler_full.transform(X_test.astype(np.float32))
    X_test_rf = pca_full.transform(X_test_f)
    y_pred = final_model.predict(X_test_rf)

    # Save artifacts
    dump({"scaler": scaler_full, "pca": pca_full, "svr": final_model}, out / "svr_with_pca.joblib")
else:
    # Refit WITHOUT PCA: full pipeline = scale -> SVR on raw 2048D
    final_pipe = Pipeline([
        ("scale", StandardScaler()),
        ("svr", SVR(
            C=best_overall_params["C"],
            epsilon=best_overall_params["epsilon"],
            kernel=best_overall_params["kernel"],
            gamma=best_overall_params.get("gamma", "scale"),
            degree=best_overall_params.get("degree", 3),
            cache_size=CACHE_MB
        ))
    ])
    t0 = time.perf_counter()
    final_pipe.fit(X_train_full, y_train_full)
    print(f"\nFinal fit (NO PCA) in {(time.perf_counter()-t0)/60:.2f} min.")
    y_pred = final_pipe.predict(X_test.astype(np.float32))
    dump(final_pipe, out / "svr_pipeline_best.joblib")

# ---------- Metrics + parity plot ----------
rmse_test = rmse(y_test, y_pred)
mae_test  = mean_absolute_error(y_test, y_pred)
r2_test   = r2_score(y_test, y_pred)
print("\n=== Test Performance ===")
print(f"RMSE: {rmse_test:.6f}")
print(f"MAE : {mae_test:.6f}")
print(f"R^2 : {r2_test:.6f}")

plt.figure(figsize=(4,4))
plt.scatter(y_test, y_pred, s=5, alpha=0.6)
mn, mx = min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())
plt.plot([mn, mx], [mn, mx], lw=2)
plt.xlabel("True U0"); plt.ylabel("Predicted U0")
plt.title(f"SVR Parity (Test) — {best_kernel_name} (search via PCA)")
plt.tight_layout(); plt.show()

# ---------- Save logs ----------
all_df.to_csv(out / "val_search_results_subset_pca_all.csv", index=False)
summary_rows = []
for k, v in best_per_kernel.items():
    br = v["best_row"]
    summary_rows.append({
        "kernel": k, **v["best_params"],
        "rmse_train": br["rmse_train"], "rmse_val": br["rmse_val"],
        "mae_val": br["mae_val"], "r2_val": br["r2_val"], "secs": br["secs"]
    })
pd.DataFrame(summary_rows).to_csv(out / "best_per_kernel_subset_pca.csv", index=False)


Search subset: 5000 / 103209

--- Searching kernel: rbf ---
Total combos: 18 | Already done: 0 | Remaining: 18
[rbf] Batch 1/1 (18 configs)...


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   3 out of  18 | elapsed:  5.0min remaining: 25.0min
[Parallel(n_jobs=20)]: Done   5 out of  18 | elapsed:  5.0min remaining: 13.0min
[Parallel(n_jobs=20)]: Done   7 out of  18 | elapsed:  5.0min remaining:  7.9min
[Parallel(n_jobs=20)]: Done   9 out of  18 | elapsed:  5.0min remaining:  5.0min
[Parallel(n_jobs=20)]: Done  11 out of  18 | elapsed:  5.0min remaining:  3.2min
[Parallel(n_jobs=20)]: Done  13 out of  18 | elapsed:  5.0min remaining:  1.9min
[Parallel(n_jobs=20)]: Done  15 out of  18 | elapsed:  5.0min remaining:  1.0min
[Parallel(n_jobs=20)]: Done  18 out of  18 | elapsed:  5.0min finished


[rbf] rbf, C=300, eps=0.005, gamma=scale | val RMSE=660.161951 | 300.5s
[rbf] rbf, C=300, eps=0.005, gamma=0.001 | val RMSE=849.320152 | 301.2s
[rbf] rbf, C=300, eps=0.005, gamma=0.0003 | val RMSE=972.532852 | 299.8s
[rbf] rbf, C=300, eps=0.01, gamma=scale | val RMSE=660.161865 | 299.9s
[rbf] rbf, C=300, eps=0.01, gamma=0.001 | val RMSE=849.320307 | 298.8s
[rbf] rbf, C=300, eps=0.01, gamma=0.0003 | val RMSE=972.532836 | 300.9s
[rbf] rbf, C=600, eps=0.005, gamma=scale | val RMSE=631.593856 | 301.1s
[rbf] rbf, C=600, eps=0.005, gamma=0.001 | val RMSE=781.747675 | 299.3s
[rbf] rbf, C=600, eps=0.005, gamma=0.0003 | val RMSE=906.970348 | 300.2s
[rbf] rbf, C=600, eps=0.01, gamma=scale | val RMSE=631.593809 | 300.4s
[rbf] rbf, C=600, eps=0.01, gamma=0.001 | val RMSE=781.747580 | 299.8s
[rbf] rbf, C=600, eps=0.01, gamma=0.0003 | val RMSE=906.970243 | 300.7s
[rbf] rbf, C=900, eps=0.005, gamma=scale | val RMSE=616.683708 | 300.6s
[rbf] rbf, C=900, eps=0.005, gamma=0.001 | val RMSE=758.529204 | 2

In [5]:
# svr_rbf_subset_final.py
import os, time, json, math
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from itertools import product
from joblib import Parallel, delayed, dump
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ====================== CONFIG ======================
SEED = 42
np.random.seed(SEED)

# Search subset (no PCA for search)
SUBSET_N = 4000           # 3k–5k is good; faster than 8k
USE_PCA_FOR_SEARCH = False  # must be False for Option A

# RBF-only, tiny focused grid (4–6 combos)
param_grid_rbf = {
    "kernel":  ["rbf"],
    "C":       [300, 600, 900],         # push higher; 300 looked good
    "epsilon": [0.005, 0.01],           # a bit tighter around best
    "gamma":   ["scale", 1e-3, 3e-4],   # add mid-low between 1e-3 and 1e-4
}
# param_grid_rbf = {
#     "kernel":  ["rbf"],
#     "C":       [300, 600, 900],
#     "epsilon": [0.01],
#     "gamma":   ["scale", 1e-3],
# }

CACHE_MB = 4000           # bigger cache helps RBF
N_JOBS = os.cpu_count() or 4
BATCH = max(2, N_JOBS * 2)

# Final training controls (subset to keep runtime sane)
FINAL_USE_SUBSET = True
FINAL_N = 25000          # train final exact RBF on 25k samples

# ==================== DATA ASSUMPTION ====================
# Assumes in-memory: X_train, y_train, X_val, y_val, X_test, y_test

# ---------- Output dirs ----------
out = Path("artifacts_svr"); out.mkdir(exist_ok=True)
logs_dir = out / "logs"; logs_dir.mkdir(exist_ok=True)

# ---------- Build training subset ----------
if SUBSET_N is not None and SUBSET_N < len(X_train):
    idx = np.random.choice(len(X_train), SUBSET_N, replace=False)
    X_train_sub, y_train_sub = X_train[idx], y_train[idx]
else:
    X_train_sub, y_train_sub = X_train, y_train

print(f"Search subset: {len(X_train_sub)} / {len(X_train)}")

# ---------- Helpers ----------
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def param_row_key(params: dict) -> str:
    return json.dumps(params, sort_keys=True)

def append_row(path: Path, row: dict):
    write_header = not path.exists()
    pd.DataFrame([row]).to_csv(path, mode="a", header=write_header, index=False)

def grid_to_param_list(grid: dict):
    keys = list(grid.keys())
    return [dict(zip(keys, vals)) for vals in product(*[grid[k] for k in keys])]

def eval_one(params, X_tr, y_tr, X_va, y_va):
    pipe = Pipeline([
        ("scale", StandardScaler()),
        ("svr", SVR(
            C=params["C"], epsilon=params["epsilon"],
            kernel=params["kernel"], gamma=params.get("gamma", "scale"),
            cache_size=CACHE_MB, tol=1e-3, shrinking=True
        ))
    ])
    t0 = time.perf_counter()
    pipe.fit(X_tr, y_tr)
    pred_tr  = pipe.predict(X_tr)
    pred_val = pipe.predict(X_va)
    secs = time.perf_counter() - t0
    return {
        **params,
        "rmse_train": rmse(y_tr, pred_tr),
        "rmse_val": rmse(y_va, pred_val),
        "mae_val": mean_absolute_error(y_va, pred_val),
        "r2_val": r2_score(y_va, pred_val),
        "secs": secs
    }

# ---------- Search spaces (RBF only) ----------
search_spaces = { "rbf": param_grid_rbf }

# ---------- Prepare search matrices (no PCA path) ----------
X_train_sub_search = X_train_sub.astype(np.float32)
X_val_search       = X_val.astype(np.float32)
y_train_sub_search = y_train_sub.astype(np.float32)
y_val_search       = y_val.astype(np.float32)

# ---------- Chunked, parallel, resumable search ----------
best_per_kernel = {}
combined_csv = out / "val_search_results_subset_streaming_rbf.csv"
# Start fresh for this run; prevents NaN kernel_group collisions
if combined_csv.exists():
    combined_csv.unlink()

for kernel_name, grid in search_spaces.items():
    print(f"\n--- Searching kernel: {kernel_name} ---")
    kernel_csv = logs_dir / f"{kernel_name}_stream.csv"

    plist = grid_to_param_list(grid)
    done_keys = set()
    if kernel_csv.exists():
        try:
            prev = pd.read_csv(kernel_csv)
            for _, row in prev.iterrows():
                params = {k: row[k] for k in grid.keys() if k in row}
                done_keys.add(param_row_key(params))
        except Exception:
            pass
    todo = [p for p in plist if param_row_key(p) not in done_keys]
    total = len(plist); remaining = len(todo)
    print(f"Total combos: {total} | Already done: {total-remaining} | Remaining: {remaining}")

    for start in range(0, remaining, BATCH):
        chunk = todo[start:start+BATCH]
        print(f"[{kernel_name}] Batch {start//BATCH + 1}/{math.ceil(remaining/BATCH)} "
              f"({len(chunk)} configs)...")

        results = Parallel(n_jobs=N_JOBS, backend="loky", verbose=10, batch_size=1)(
            delayed(eval_one)(
                p,
                X_train_sub_search, y_train_sub_search,
                X_val_search,       y_val_search
            ) for p in chunk
        )

        for row in results:
            row_stream = dict(row); row_stream["kernel_group"] = kernel_name
            append_row(kernel_csv, row_stream)
            append_row(combined_csv, row_stream)
            print(f"[{kernel_name}] {row.get('kernel')}, C={row.get('C')}, "
                  f"eps={row.get('epsilon')}, gamma={row.get('gamma','-')} | "
                  f"val RMSE={row['rmse_val']:.6f} | {row['secs']:.1f}s")

    df_k = pd.read_csv(kernel_csv).sort_values("rmse_val").reset_index(drop=True)
    best_row = df_k.iloc[0].to_dict()
    best_params = {k: best_row[k] for k in grid.keys()}
    best_per_kernel[kernel_name] = {"best_row": best_row, "best_params": best_params}
    print(f"Best {kernel_name}: val RMSE={best_row['rmse_val']:.6f} | params={best_params}")

# ---------- Best overall ----------
all_df = pd.read_csv(combined_csv)
# guard against junk/old rows
all_df = all_df[all_df["kernel_group"].notna()]
all_df = all_df[all_df["kernel_group"].isin(search_spaces.keys())]
all_df = all_df.sort_values("rmse_val").reset_index(drop=True)

if len(all_df) == 0:
    raise RuntimeError("No valid rows in combined_csv after filtering; check logs/ directory.")

best_overall_row = all_df.iloc[0].to_dict()
best_kernel_name = best_overall_row["kernel_group"]
best_overall_params = {k: best_overall_row[k] for k in search_spaces[best_kernel_name].keys()}

print("\n=== Best per kernel (subset search) ===")
for k, v in best_per_kernel.items():
    print(f"[{k}] RMSE_val={v['best_row']['rmse_val']:.6f} | params={v['best_params']}")
print("\n=== Best overall (subset search) ===")
print(f"Kernel: {best_kernel_name}")
print("Params:", best_overall_params)

# ---------- Final train on Train+Val (optionally on a subset) ----------
X_train_full = np.vstack([X_train, X_val]).astype(np.float32)
y_train_full = np.concatenate([y_train, y_val]).astype(np.float32)

final_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("svr", SVR(
        C=best_overall_params["C"],
        epsilon=best_overall_params["epsilon"],
        kernel=best_overall_params["kernel"],
        gamma=best_overall_params.get("gamma", "scale"),
        degree=best_overall_params.get("degree", 3),
        cache_size=CACHE_MB, tol=1e-3, shrinking=True
    ))
])

# Choose subset (25k) or full
if FINAL_USE_SUBSET:
    n_all = len(X_train_full)
    n_final = min(FINAL_N, n_all)
    rng = np.random.default_rng(SEED)
    idx_final = rng.choice(n_all, n_final, replace=False)
    X_final = X_train_full[idx_final]
    y_final = y_train_full[idx_final]
    pd.DataFrame({"idx_final": idx_final}).to_csv(out / f"final_subset_indices_{n_final}.csv", index=False)
    print(f"\n[FINAL] Training on subset: {n_final} / {n_all}")
else:
    X_final, y_final = X_train_full, y_train_full
    print(f"\n[FINAL] Training on FULL Train+Val: {len(X_final)}")

print("\n[FINAL] Starting fit (exact RBF)...")
t0 = time.perf_counter()
final_pipe.fit(X_final, y_final)
fit_mins = (time.perf_counter() - t0) / 60.0
print(f"[FINAL] Done in {fit_mins:.2f} min.")

# ---------- Test evaluation ----------
y_pred = final_pipe.predict(X_test.astype(np.float32))
rmse_test = rmse(y_test, y_pred)
mae_test  = mean_absolute_error(y_test, y_pred)
r2_test   = r2_score(y_test, y_pred)

print("\n=== Test Performance (Final, subset={}) ===".format(FINAL_USE_SUBSET))
print(f"RMSE: {rmse_test:.6f}")
print(f"MAE : {mae_test:.6f}")
print(f"R^2 : {r2_test:.6f}")

# ---------- Parity plot ----------
plt.figure(figsize=(4,4))
plt.scatter(y_test, y_pred, s=5, alpha=0.6)
mn, mx = min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())
plt.plot([mn, mx], [mn, mx], lw=2)
plt.xlabel("True U0"); plt.ylabel("Predicted U0")
plt.title(f"SVR Parity (Test) — RBF, subset={FINAL_USE_SUBSET}")
plt.tight_layout(); plt.show()

# ---------- Save artifacts ----------
all_df.to_csv(out / "val_search_results_subset_rbf_all.csv", index=False)
summary_rows = []
for k, v in best_per_kernel.items():
    br = v["best_row"]
    summary_rows.append({
        "kernel": k, **v["best_params"],
        "rmse_train": br["rmse_train"], "rmse_val": br["rmse_val"],
        "mae_val": br["mae_val"], "r2_val": br["r2_val"], "secs": br["secs"]
    })
pd.DataFrame(summary_rows).to_csv(out / "best_per_kernel_subset_rbf.csv", index=False)

dump(final_pipe, out / ("svr_pipeline_final_subset.joblib" if FINAL_USE_SUBSET else "svr_pipeline_final_full.joblib"))
pd.DataFrame({"y_true": y_test, "y_pred": y_pred}).to_csv(out / "test_preds_final.csv", index=False)


Search subset: 4000 / 103209

--- Searching kernel: rbf ---
Total combos: 18 | Already done: 0 | Remaining: 18
[rbf] Batch 1/1 (18 configs)...


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   3 out of  18 | elapsed:  5.1min remaining: 25.6min
[Parallel(n_jobs=20)]: Done   5 out of  18 | elapsed:  5.2min remaining: 13.5min
[Parallel(n_jobs=20)]: Done   7 out of  18 | elapsed:  5.8min remaining:  9.1min
[Parallel(n_jobs=20)]: Done   9 out of  18 | elapsed:  6.0min remaining:  6.0min
[Parallel(n_jobs=20)]: Done  11 out of  18 | elapsed:  6.2min remaining:  3.9min
[Parallel(n_jobs=20)]: Done  13 out of  18 | elapsed:  6.3min remaining:  2.4min
[Parallel(n_jobs=20)]: Done  15 out of  18 | elapsed:  6.4min remaining:  1.3min
[Parallel(n_jobs=20)]: Done  18 out of  18 | elapsed:  6.7min finished


[rbf] rbf, C=300, eps=0.005, gamma=scale | val RMSE=879.611271 | 304.9s
[rbf] rbf, C=300, eps=0.005, gamma=0.001 | val RMSE=978.098027 | 303.3s
[rbf] rbf, C=300, eps=0.005, gamma=0.0003 | val RMSE=848.604794 | 309.5s
[rbf] rbf, C=300, eps=0.01, gamma=scale | val RMSE=879.611369 | 310.5s
[rbf] rbf, C=300, eps=0.01, gamma=0.001 | val RMSE=978.098104 | 309.1s
[rbf] rbf, C=300, eps=0.01, gamma=0.0003 | val RMSE=848.604897 | 305.5s
[rbf] rbf, C=600, eps=0.005, gamma=scale | val RMSE=820.151224 | 359.5s
[rbf] rbf, C=600, eps=0.005, gamma=0.001 | val RMSE=937.045471 | 346.5s
[rbf] rbf, C=600, eps=0.005, gamma=0.0003 | val RMSE=786.292332 | 368.9s
[rbf] rbf, C=600, eps=0.01, gamma=scale | val RMSE=820.151489 | 359.8s
[rbf] rbf, C=600, eps=0.01, gamma=0.001 | val RMSE=937.045645 | 350.3s
[rbf] rbf, C=600, eps=0.01, gamma=0.0003 | val RMSE=786.292511 | 369.8s
[rbf] rbf, C=900, eps=0.005, gamma=scale | val RMSE=797.498679 | 384.4s
[rbf] rbf, C=900, eps=0.005, gamma=0.001 | val RMSE=914.390759 | 3

InvalidParameterError: The 'gamma' parameter of SVR must be a str among {'auto', 'scale'} or a float in the range [0.0, inf). Got '0.0003' instead.

In [6]:
# === Final exact RBF SVR on a 25k subset (reproducible) ===
import time, numpy as np, pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump

# ---- Best params from your search (with gamma string -> float fix) ----
best_params = {"kernel": "rbf", "C": 900, "epsilon": 0.005, "gamma": "0.0003"}
gamma_val = best_params["gamma"]
if isinstance(gamma_val, str) and gamma_val not in ("scale", "auto"):
    try:
        gamma_val = float(gamma_val)
    except Exception:
        gamma_val = "scale"

# ---- Build Train+Val, pick reproducible 25k subset ----
SEED = 42
FINAL_N = 25_000
out = Path("artifacts_svr"); out.mkdir(exist_ok=True)

X_train_full = np.vstack([X_train, X_val]).astype(np.float32)
y_train_full = np.concatenate([y_train, y_val]).astype(np.float32)
n_all = len(X_train_full)

rng = np.random.default_rng(SEED)
idx_final = rng.choice(n_all, min(FINAL_N, n_all), replace=False)
X_final = X_train_full[idx_final]
y_final = y_train_full[idx_final]

# save indices for reproducibility
pd.DataFrame({"idx_final": idx_final}).to_csv(out / f"final_subset_indices_{len(idx_final)}.csv", index=False)
print(f"[FINAL] Training on subset: {len(idx_final)} / {n_all}")

# ---- Pipeline and fit (timed) ----
final_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("svr", SVR(
        C=best_params["C"],
        epsilon=best_params["epsilon"],
        kernel=best_params["kernel"],
        gamma=gamma_val,
        cache_size=4000,
        tol=1e-3,
        shrinking=True
    ))
])

print("\n[FINAL] Starting fit (exact RBF on 25k)...")
t0 = time.perf_counter()
final_pipe.fit(X_final, y_final)
mins = (time.perf_counter() - t0) / 60.0
print(f"[FINAL] Done in {mins:.2f} min.")

# ---- Test evaluation ----
X_test_f = X_test.astype(np.float32)
y_pred = final_pipe.predict(X_test_f)
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
mae  = float(mean_absolute_error(y_test, y_pred))
r2   = float(r2_score(y_test, y_pred))

print("\n=== Test Performance (RBF, 25k subset) ===")
print(f"RMSE: {rmse:.6f}")
print(f"MAE : {mae:.6f}")
print(f"R^2 : {r2:.6f}")

# ---- Save artifacts ----
dump(final_pipe, out / "svr_pipeline_final_subset25k.joblib")
pd.DataFrame({"y_true": y_test, "y_pred": y_pred}).to_csv(out / "test_preds_final_subset25k.csv", index=False)
print(f"[SAVE] Model -> {out/'svr_pipeline_final_subset25k.joblib'}")
print(f"[SAVE] Predictions -> {out/'test_preds_final_subset25k.csv'}")


[FINAL] Training on subset: 25000 / 116110

[FINAL] Starting fit (exact RBF on 25k)...
[FINAL] Done in 51.27 min.

=== Test Performance (RBF, 25k subset) ===
RMSE: 628.537507
MAE : 401.022466
R^2 : 0.668534
[SAVE] Model -> artifacts_svr\svr_pipeline_final_subset25k.joblib
[SAVE] Predictions -> artifacts_svr\test_preds_final_subset25k.csv


In [7]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# -------------------------------
# Load the same 25k subset indices
# -------------------------------
idx_path = "artifacts_svr/final_subset_indices_25000.csv"  # adjust if named differently
idx_final = pd.read_csv(idx_path)["idx_final"].values

# -------------------------------
# Build the 25k training pool
# -------------------------------
X_train_full = np.vstack([X_train, X_val]).astype(np.float32)
y_train_full = np.concatenate([y_train, y_val]).astype(np.float32)

X_pool = X_train_full[idx_final]
y_pool = y_train_full[idx_final]

# -------------------------------
# Choose 4000-sample training subset
# -------------------------------
SEED = 42
rng = np.random.default_rng(SEED)
N_SMALL = 4000

subset_idx = rng.choice(len(X_pool), N_SMALL, replace=False)
X_small = X_pool[subset_idx]
y_small = y_pool[subset_idx]

print(f"Training on subset: {len(X_small)} samples")

# -------------------------------
# Define best SVR model
# -------------------------------
best_params = {"kernel": "rbf", "C": 900, "epsilon": 0.005, "gamma": 3e-4}

pipe = Pipeline([
    ("scale", StandardScaler()),
    ("svr", SVR(
        C=best_params["C"],
        epsilon=best_params["epsilon"],
        kernel=best_params["kernel"],
        gamma=best_params["gamma"],
        cache_size=4000,
        tol=1e-3
    ))
])

# -------------------------------
# Fit and evaluate on training subset
# -------------------------------
pipe.fit(X_small, y_small)
y_pred_train = pipe.predict(X_small)

rmse_train = np.sqrt(mean_squared_error(y_small, y_pred_train))
mae_train  = mean_absolute_error(y_small, y_pred_train)

print("=== Training subset performance (4000 samples) ===")
print(f"RMSE: {rmse_train:.4f}")
print(f"MAE : {mae_train:.4f}")


Training on subset: 4000 samples
=== Training subset performance (4000 samples) ===
RMSE: 531.0762
MAE : 223.6871
