# Elastic Net Evaluation

This notebook trains **one ElasticNet regression model per drug** on gene expression features.  
ElasticNet combines L1 (Lasso) and L2 (Ridge) regularization. We use `ElasticNetCV` to tune the hyperparameters (`alpha` and `l1_ratio`) with 10-fold cross-validation.

The two penalties play complementary roles:
- **L1 regularization** (Lasso) → promotes sparsity
- **L2 regularization** (Ridge) → stabilizes coefficients

This allows it to both regularize and select features, making it a simple but powerful linear model.

We evaluate performance using:
- **RMSE (Root Mean Squared Error)**
- **R² (Coefficient of Determination)**

and visualize their **distribution across all drugs**.


In [4]:
import os, numpy as np, pandas as pd, joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import GroupKFold
from sklearn.impute import SimpleImputer

# ---- paths
# Parquet file that the next cell reads with pd.read_parquet.
DATA_PARQUET = "../../data/gdsc_bulk_embeddings_voom.parquet"
# Output directory, created here if it does not exist yet.
# NOTE(review): the training cell further down reassigns OUTDIR to "embeddings"
# and writes its models there, so nothing visible in this notebook writes into
# this directory — confirm whether it is still needed.
OUTDIR = "models_elasticnet_voom_emb"
os.makedirs(OUTDIR, exist_ok=True)  # exist_ok=True keeps the cell safe to re-run

In [5]:
# ---- input table
data = pd.read_parquet(DATA_PARQUET)

# ---- feature columns
# Identifier and target columns are set aside; every other column, in file
# order, is treated as a model feature (presumably embedding dimensions, going
# by the file name — confirm).
exclude_cols = {"SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"}
emb_cols = [
    col_name
    for col_name in data.columns
    if col_name not in exclude_cols
]

# ---- DRUG_ID values that each get their own model
best_drugs = [
    1845, 2540, 2038, 2508, 1096,
    1931, 2515, 1089, 427, 1526,
]

In [None]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer

# Paths
# NOTE(review): DATA_PARQUET, OUTDIR, `data` and `best_drugs` are also assigned
# in earlier cells. The values below are the ones this cell uses; OUTDIR here
# ("embeddings") differs from the earlier "models_elasticnet_voom_emb" —
# confirm which directory is intended.
DATA_PARQUET = "../../data/gdsc_bulk_embeddings_voom.parquet"
OUTDIR = "embeddings"
os.makedirs(OUTDIR, exist_ok=True)

# Tunable settings, named so they are not buried in the code below
SEED = 42          # shared by the CV shuffling and the coordinate-descent updates
N_SPLITS = 10      # folds used by ElasticNetCV to pick alpha / l1_ratio
MIN_SAMPLES = 100  # drugs with fewer labelled rows are skipped

# Load dataset
data = pd.read_parquet(DATA_PARQUET)

# Fail early, with a readable message, if the columns used below are absent
missing_cols = {"DRUG_ID", "LN_IC50"} - set(data.columns)
if missing_cols:
    raise KeyError(f"{DATA_PARQUET} is missing required columns: {sorted(missing_cols)}")

# Everything that is not an identifier or the target is used as a feature
exclude_cols = {"SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"}
gene_cols = [c for c in data.columns if c not in exclude_cols]

# Drugs to fit
best_drugs = [1845, 2540, 2038, 2508, 1096, 1931, 2515, 1089, 427, 1526]

# CV for hyperparameter tuning
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# ElasticNetCV has no `scoring` argument: for every l1_ratio it fits the whole
# alpha path on each fold and keeps the (alpha, l1_ratio) pair with the lowest
# mean cross-validated MSE.
enet_cv = ElasticNetCV(
    alphas=np.logspace(-3, 2, 40),      # wider, log-spaced grid
    l1_ratio=[0.05, 0.2, 0.5, 0.8, 0.95],
    cv=kf,
    max_iter=20000,
    random_state=SEED,                  # only has an effect because selection="random"
    n_jobs=-1,
    selection="random",                 # often faster for high-D
)

# Keeping imputation and scaling inside the Pipeline means they are re-fitted
# whenever the pipeline as a whole is fitted (e.g. inside an outer
# cross-validation). Caveat: the imputer and scaler are fitted on all rows given
# to `pipeline.fit` *before* ElasticNetCV makes its internal folds, so the
# hyperparameter search itself sees features preprocessed with full-data
# statistics.
pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),  # robust to occasional NaNs
    ("scale", StandardScaler(with_mean=True, with_std=True)),
    ("model", enet_cv),
])

# One record per fitted drug, so the chosen hyperparameters can be inspected
# as a table afterwards instead of only being printed
fit_records = []

for drug_id in best_drugs:
    # Rows for this drug that have a measured response
    df = data.loc[data["DRUG_ID"] == drug_id].dropna(subset=["LN_IC50"])
    if df.shape[0] < MIN_SAMPLES:
        print(f"⚠️ Skipping drug {drug_id} (only {df.shape[0]} samples)")
        continue

    X = df[gene_cols].to_numpy()
    y = df["LN_IC50"].to_numpy()

    # The same pipeline object is re-fitted for every drug; each fit starts
    # from scratch and replaces the previous drug's state.
    pipeline.fit(X, y)

    # Hyperparameters selected by the internal cross-validation
    chosen_alpha = pipeline.named_steps["model"].alpha_
    chosen_l1 = pipeline.named_steps["model"].l1_ratio_
    print(f"Drug {drug_id}: alpha={chosen_alpha:.4g}, l1_ratio={chosen_l1:.3g}")

    # Save the whole pipeline + feature order. joblib serialises the current
    # state immediately, so re-fitting `pipeline` on the next drug does not
    # affect files already written.
    outfile = os.path.join(OUTDIR, f"elasticnet_drug{drug_id}_log1p.joblib")
    joblib.dump({"pipeline": pipeline, "gene_cols": gene_cols}, outfile)
    print(f"💾 Saved model for drug {drug_id} to {outfile}")

    fit_records.append({
        "DRUG_ID": drug_id,
        "n_samples": df.shape[0],
        "alpha": chosen_alpha,
        "l1_ratio": chosen_l1,
        "model_path": outfile,
    })

# Summary of what was fitted; empty if every drug was skipped
enet_fit_summary = pd.DataFrame(fit_records)
enet_fit_summary


ValueError: Passing extra keyword arguments to ElasticNetCV.fit is only supported if enable_metadata_routing=True, which you can set using `sklearn.set_config`. See the User Guide <https://scikit-learn.org/stable/metadata_routing.html> for more details. Extra parameters passed are: {'groups'}