In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Cell 2: Load Data
data = pd.read_parquet("data/gdsc_single_cell_aligned.parquet")
gene_emb = pd.read_parquet("data/gene_embeddings.parquet")

# If "gene" is a column instead of index, fix it
if "gene" in gene_emb.columns:
    gene_emb.set_index("gene", inplace=True)

print("Expression data shape:", data.shape)
print("Gene embedding matrix shape:", gene_emb.shape)

Expression data shape: (575197, 2003)
Gene embedding matrix shape: (37307, 512)


In [None]:
# Cell 2: Train per-drug MLP models with gene embeddings
os.makedirs("models_mlp", exist_ok=True)

gene_cols = [col for col in data.columns if col not in ["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"]]
embed_dim = gene_emb.shape[1]
all_results = []

for drug_id in sorted(data["DRUG_ID"].unique())[:20]:  # First 20 drugs
    print(f"\n🧠 Drug ID: {drug_id}")
    
    df = data[data["DRUG_ID"] == drug_id].dropna(subset=["LN_IC50"])
    
    if df.shape[0] < 10:
        print("❗ Skipped (not enough samples)")
        continue
    
    # Expression matrix
    X_expr = df[gene_cols]
    y = df["LN_IC50"].values
    
    # Filter HVGs that exist in the embedding file
    valid_genes = [g for g in gene_cols if g in gene_emb.index]
    if len(valid_genes) == 0:
        print("⚠️ No valid genes with embeddings found.")
        continue
    
    expr_matrix = X_expr[valid_genes].values
    emb_matrix = gene_emb.loc[valid_genes].values  # (genes, 512)
    
    # Weighted avg of gene embeddings
    emb_features = expr_matrix @ emb_matrix  # (samples, 512)

    # Final input = expression + embedding
    X_final = np.hstack([expr_matrix, emb_features])

    # Drop any rows with NaNs
    valid_rows = ~np.isnan(X_final).any(axis=1)
    X_final = X_final[valid_rows]
    y = y[valid_rows]
    
    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_final, y, test_size=0.2, random_state=42
    )
    
    # Train MLP
    model = MLPRegressor(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
    model.fit(X_train, y_train)
    
    # Eval
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    
    print(f"✅ RMSE: {rmse:.4f} | R²: {r2:.4f}")
    
    # Save
    path = f"models_mlp/mlp_drug_{drug_id}.pkl"
    joblib.dump(model, path)
    
    all_results.append({
        "DRUG_ID": drug_id,
        "num_samples": len(df),
        "rmse": rmse,
        "r2": r2,
        "model_path": path
    })



🧠 Drug ID: 1
✅ RMSE: 6.2404 | R²: -25.5605

🧠 Drug ID: 3
✅ RMSE: 7.2750 | R²: -10.2046

🧠 Drug ID: 5
✅ RMSE: 5.5731 | R²: -10.4041

🧠 Drug ID: 6
✅ RMSE: 7.0630 | R²: -67.3452

🧠 Drug ID: 9
✅ RMSE: 8.2615 | R²: -30.1136

🧠 Drug ID: 11
✅ RMSE: 6.6312 | R²: -9.6132

🧠 Drug ID: 17




✅ RMSE: 7.2452 | R²: -59.2025

🧠 Drug ID: 29
✅ RMSE: 5.9654 | R²: -9.4958

🧠 Drug ID: 30


In [None]:
# Cell 4: Save performance summary
results_df = pd.DataFrame(all_results)
results_df.to_csv("models_mlp/model_performance_summary.csv", index=False)
results_df.head()