In [1]:
import pandas as pd

# Paths (all relative, so they resolve against the kernel's working directory)
# Input: GDSC drug-response table; later cells require the columns
# SANGER_MODEL_ID, DRUG_ID and LN_IC50.
GDSC_PARQUET = "../../../../bulk_state_of_the_art/data/processed/gdsc_final_cleaned.parquet"
# Input: per-model bulk feature matrix, joined to the GDSC rows on SANGER_MODEL_ID.
# NOTE(review): the merged preview shows emb_### columns, so this may hold
# embeddings rather than raw gene values — confirm against the producer notebook.
BULK_PARQUET = "../../data/filtered_datasets/bulk_embeddings_voom.parquet"   # bulk matrix filtered to overlap HVGs

# Output: the merged (drug response + bulk features) table written by the save step.
OUT_PARQUET  = "../../data/gdsc_bulk_embeddings_voom.parquet"


In [2]:
# Load GDSC drug response data
gdsc_df = pd.read_parquet(GDSC_PARQUET)

# Load bulk overlap gene matrix
bulk_df = pd.read_parquet(BULK_PARQUET)

# Harmonise the model identifier with the GDSC naming, whether the bulk file
# stores it as a regular column or as the index.
if "model_id" in bulk_df.columns:
    bulk_df = bulk_df.rename(columns={"model_id": "SANGER_MODEL_ID"})
if bulk_df.index.name == "model_id":
    bulk_df.index.name = "SANGER_MODEL_ID"

# Fail fast if the join key is still absent: otherwise the problem only
# surfaces later, inside the merge, with a much less helpful error.
if (
    "SANGER_MODEL_ID" not in bulk_df.columns
    and "SANGER_MODEL_ID" not in bulk_df.index.names
):
    raise KeyError(
        "Bulk matrix has neither a 'model_id' nor a 'SANGER_MODEL_ID' "
        f"column/index; found index name {bulk_df.index.name!r} and "
        f"first columns {bulk_df.columns[:5].tolist()}"
    )

print("✅ Loaded data:")
print("GDSC shape:", gdsc_df.shape)
print("Bulk (overlap genes):", bulk_df.shape)


✅ Loaded data:
GDSC shape: (571985, 3)
Bulk (overlap genes): (1362, 3073)


In [3]:
# Preview GDSC columns
print("📌 GDSC columns:", gdsc_df.columns.tolist())

# Columns the downstream filter and merge depend on. This set is the single
# source of truth for both the presence check and the null filter below.
required_cols = {"SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"}
missing_cols = sorted(required_cols - set(gdsc_df.columns))
assert not missing_cols, f"❌ Missing required columns in GDSC! {missing_cols}"

# Drop rows with a missing key/target and normalise the model ID to str.
# Built as one chain so no column is assigned onto a frame derived from
# dropna() (which can trigger SettingWithCopyWarning in some pandas versions),
# and so re-running the cell is idempotent.
gdsc_df = (
    gdsc_df
    .dropna(subset=sorted(required_cols))
    .assign(SANGER_MODEL_ID=lambda frame: frame["SANGER_MODEL_ID"].astype(str))
)

print("✅ Cleaned GDSC data:", gdsc_df.shape)


📌 GDSC columns: ['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50']
✅ Cleaned GDSC data: (571985, 3)


In [4]:
# List of drugs you care about (GDSC DRUG_ID values)
selected_drugs = [1845, 2540, 2038, 2508, 1096, 1931, 2515, 1089, 427, 1526]

# 🔹 Filter GDSC first (only these drugs) so the merge below stays small
gdsc_df = gdsc_df[gdsc_df["DRUG_ID"].isin(selected_drugs)]

print("✅ Filtered GDSC data:", gdsc_df.shape)

# 🔹 Now merge with much smaller dataset.
# validate="many_to_one" makes pandas raise MergeError if a model ID appears
# more than once in bulk_df; without it, duplicated bulk rows would silently
# multiply the (model, drug) training rows.
merged = gdsc_df.merge(
    bulk_df,
    on="SANGER_MODEL_ID",
    how="inner",
    validate="many_to_one",
)

print("✅ Merged with bulk gene expression:")
print("Shape:", merged.shape)
print(merged.head(3))

# Persisting `merged` is left to the dedicated save cell that follows;
# writing it here as well would serialise the same frame to OUT_PARQUET twice.


✅ Filtered GDSC data: (8238, 3)
✅ Merged with bulk gene expression:
Shape: (8238, 3075)
  SANGER_MODEL_ID  DRUG_ID   LN_IC50   emb_000   emb_001   emb_002   emb_003  \
0       SIDM01132      427  5.463586 -0.727664  1.814604  0.828872  1.003094   
1       SIDM00848      427  4.186610 -0.849247  1.660583  0.726541  1.192063   
2       SIDM00269      427  4.004458 -0.833046  1.799614  0.714298  1.102611   

    emb_004   emb_005   emb_006  ...  emb_3062  emb_3063  emb_3064  emb_3065  \
0 -1.001114 -0.287234  1.069429  ...  0.694305 -0.754860 -2.229427  0.854278   
1 -1.035015 -0.455297  1.070964  ...  0.818762 -0.375695 -2.179980  0.944036   
2 -1.217057 -0.355538  0.966995  ...  0.971395 -0.387273 -2.284683  0.675352   

   emb_3066  emb_3067  emb_3068  emb_3069  emb_3070  emb_3071  
0  1.463058 -0.430300  0.447072  1.813935 -1.090608 -0.814830  
1  1.357933 -0.281219  0.410931  1.632804 -1.220379 -0.839649  
2  1.357754 -0.268246  0.324645  1.755962 -1.065003 -0.813306  

[3 rows x 3075 columns]

In [5]:


# Write the merged GDSC + bulk table to OUT_PARQUET. index=False leaves the
# DataFrame index out of the file, so only the columns are stored.
merged.to_parquet(OUT_PARQUET, index=False)
print("💾 Saved training data to:", OUT_PARQUET)


💾 Saved training data to: ../../data/gdsc_bulk_embeddings_voom.parquet
