In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import os
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Cell 2: Load Data
# Read the expression/response table and the gene embedding table from parquet.
data = pd.read_parquet("data/gdsc_single_cell_aligned.parquet")
gene_emb = pd.read_parquet("data/gene_embeddings.parquet")

# Embedding rows are addressed through the frame's index downstream, so when
# the gene identifier arrives as an ordinary "gene" column, promote it to the
# index (rebinding the name rather than mutating the frame in place).
if "gene" in gene_emb.columns:
    gene_emb = gene_emb.set_index("gene")

# Quick sanity check of what was loaded.
print("Expression data shape:", data.shape)
print("Gene embedding matrix shape:", gene_emb.shape)


Expression data shape: (575197, 2003)
Gene embedding matrix shape: (37307, 512)


In [None]:
# Cell 3: Train per-drug models with gene embeddings
#
# For each of the first MAX_DRUGS drug IDs (in sorted order) this cell:
#   1. selects that drug's rows that have a non-missing LN_IC50,
#   2. builds features = [expression of the genes that have an embedding |
#      expression-weighted sum of those genes' embeddings],
#   3. fits an XGBoost regressor on a random train/test split,
#   4. prints RMSE / R² on the held-out rows, saves the model with joblib and
#      appends one summary record to `all_results`.

# Tunable settings, gathered here instead of being buried inside the loop.
MAX_DRUGS = 20       # only the first MAX_DRUGS drug IDs are visited
MIN_SAMPLES = 10     # drugs with fewer usable rows are skipped
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
SEED = 42            # shared by the split and the model for reproducibility
MODEL_DIR = "models_xgb"

os.makedirs(MODEL_DIR, exist_ok=True)

# Every column that is not an identifier or the target is treated as a gene.
gene_cols = [col for col in data.columns if col not in ["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"]]
embed_dim = gene_emb.shape[1]
all_results = []

# The gene <-> embedding alignment does not depend on the drug, so it is
# computed once here rather than on every loop iteration.
valid_genes = [g for g in gene_cols if g in gene_emb.index]
if len(valid_genes) == 0:
    # Without any overlapping gene no drug can be trained; warn once and
    # leave `all_results` empty instead of repeating the warning per drug.
    print("⚠️ No valid genes with embeddings found.")
    drug_ids = []
else:
    emb_matrix = gene_emb.loc[valid_genes].values  # (genes, embed_dim)
    # Duplicate labels in gene_emb.index would make .loc return extra rows and
    # break the matrix product below with an opaque shape error; fail clearly.
    if emb_matrix.shape[0] != len(valid_genes):
        raise ValueError(
            f"gene_emb returned {emb_matrix.shape[0]} rows for "
            f"{len(valid_genes)} genes; its index probably contains duplicates"
        )
    # Slicing the sorted IDs replaces the manual counter/break; drugs that
    # get skipped below still count towards MAX_DRUGS, as before.
    drug_ids = sorted(data["DRUG_ID"].unique())[:MAX_DRUGS]

for drug_id in drug_ids:
    print(f"\n🔬 Drug ID: {drug_id}")

    df = data[data["DRUG_ID"] == drug_id].dropna(subset=["LN_IC50"])

    if df.shape[0] < MIN_SAMPLES:
        print("❗ Skipped (not enough samples)")
        continue

    y = df["LN_IC50"].values

    # Expression of the genes that have an embedding: (samples, genes)
    expr_matrix = df[valid_genes].values

    # Expression-weighted SUM of gene embeddings (not normalised by the total
    # expression, so this is not an average): (samples, embed_dim)
    emb_features = expr_matrix @ emb_matrix

    # Final input = expression + embedding
    X_final = np.hstack([expr_matrix, emb_features])

    # Train/Test split
    # NOTE(review): this is a row-level random split. If the same
    # SANGER_MODEL_ID can occur in several rows of one drug, those rows can
    # land on both sides of the split and inflate the scores — confirm, and
    # consider a grouped split (e.g. GroupShuffleSplit) if so.
    X_train, X_test, y_train, y_test = train_test_split(
        X_final, y, test_size=TEST_SIZE, random_state=SEED
    )

    # Train XGBoost
    model = xgb.XGBRegressor(n_estimators=100, max_depth=5, random_state=SEED)
    model.fit(X_train, y_train)

    # Eval on the held-out rows
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    print(f"✅ RMSE: {rmse:.4f} | R²: {r2:.4f}")

    # Save
    path = f"{MODEL_DIR}/xgb_drug_{drug_id}.pkl"
    joblib.dump(model, path)

    all_results.append({
        "DRUG_ID": drug_id,
        "num_samples": len(df),
        "rmse": rmse,
        "r2": r2,
        "model_path": path
    })



🔬 Drug ID: 1
✅ RMSE: 1.1666 | R²: -0.4301

🔬 Drug ID: 3
✅ RMSE: 2.2814 | R²: -0.1874

🔬 Drug ID: 5
✅ RMSE: 1.4611 | R²: 0.0876

🔬 Drug ID: 6
✅ RMSE: 0.9872 | R²: -0.1367

🔬 Drug ID: 9
✅ RMSE: 1.5252 | R²: -0.2649

🔬 Drug ID: 11
✅ RMSE: 1.8954 | R²: 0.0157

🔬 Drug ID: 17
✅ RMSE: 0.8744 | R²: 0.0582

🔬 Drug ID: 29
✅ RMSE: 1.5486 | R²: 0.2870

🔬 Drug ID: 30
✅ RMSE: 1.3339 | R²: -0.5855

🔬 Drug ID: 32
✅ RMSE: 1.7521 | R²: 0.1972

🔬 Drug ID: 34
✅ RMSE: 0.7298 | R²: -0.2664

🔬 Drug ID: 35
✅ RMSE: 1.4396 | R²: 0.0577

🔬 Drug ID: 37
✅ RMSE: 0.9672 | R²: 0.1533

🔬 Drug ID: 38
✅ RMSE: 1.2816 | R²: 0.0035

🔬 Drug ID: 41
✅ RMSE: 1.5849 | R²: -0.0025

🔬 Drug ID: 45
✅ RMSE: 1.3262 | R²: -0.1579

🔬 Drug ID: 51
✅ RMSE: 2.0970 | R²: 0.2912

🔬 Drug ID: 52
✅ RMSE: 0.9847 | R²: -0.2507

🔬 Drug ID: 53
✅ RMSE: 1.4950 | R²: -0.1767

🔬 Drug ID: 54
✅ RMSE: 1.1124 | R²: -0.1473

🔬 Drug ID: 55
✅ RMSE: 1.8844 | R²: 0.1817

🔬 Drug ID: 56
✅ RMSE: 1.7682 | R²: -0.1003

🔬 Drug ID: 59
✅ RMSE: 1.1256 | R²: -0.0891

🔬 

KeyboardInterrupt: 

In [None]:
# Cell 4: Save performance summary
# One row per trained drug model, built from the records in `all_results`.
# The columns are named explicitly (in the same order the records use) so
# that the CSV still gets a header row — and can be read back with
# pd.read_csv — when `all_results` is empty, e.g. because every drug was
# skipped or the training cell was interrupted before the first model.
summary_columns = ["DRUG_ID", "num_samples", "rmse", "r2", "model_path"]
results_df = pd.DataFrame(all_results, columns=summary_columns)
results_df.to_csv("models_xgb/model_performance_summary.csv", index=False)
results_df.head()