In [5]:
import pandas as pd


In [6]:
# Input locations, given relative to the notebook's working directory.
GDSC_PATH = "../../../../bulk_state_of_the_art/data/processed/gdsc_final_cleaned.parquet"
EMBEDDINGS_PATH = "../../data/processed/pancancer_embeddings_30_pcs.csv"

# Read the GDSC drug-response table (parquet) and the embeddings table (CSV).
# The first CSV column is used as the index of `df`.
gdsc_df = pd.read_parquet(GDSC_PATH)
df = pd.read_csv(EMBEDDINGS_PATH, index_col=0)

# Report the dimensions of both tables as a quick sanity check.
print("✅ Loaded data:")
for label, table in (("GDSC shape:", gdsc_df), ("Bulk embeddings:", df)):
    print(label, table.shape)


✅ Loaded data:
GDSC shape: (571985, 3)
Bulk embeddings: (140, 30)


In [7]:
# Show which columns the GDSC table carries.
print("📌 GDSC columns:", gdsc_df.columns)

# Stop here if any of the required fields is absent.
required_cols = {"SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"}
assert required_cols <= set(gdsc_df.columns), "❌ GDSC dataset missing required columns!"

# Keep only rows where every required field is present, then store the
# model identifier as a string.
gdsc_df = (
    gdsc_df
    .dropna(subset=list(required_cols))
    .assign(SANGER_MODEL_ID=lambda frame: frame["SANGER_MODEL_ID"].astype(str))
)

print("✅ Cleaned GDSC data:", gdsc_df.shape)


📌 GDSC columns: Index(['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50'], dtype='object')
✅ Cleaned GDSC data: (571985, 3)


In [8]:
# Attach the embedding columns to every GDSC drug-response row that has a
# matching SANGER_MODEL_ID. The inner join drops GDSC rows with no embedding
# and embeddings with no GDSC measurement.
# NOTE(review): `df` was read with index_col=0, so "SANGER_MODEL_ID" is
# presumably the name of its index; pandas resolves `on=` against index level
# names as well as columns. Confirm if the CSV layout changes.
merged = gdsc_df.merge(df, on="SANGER_MODEL_ID", how="inner")

print("✅ Merged with bulk embeddings:")
print("Shape:", merged.shape)
print(merged.head(3))

# Save. The destination lives in one variable so that the confirmation
# message always reports the file that was actually written (the previous
# message named an unrelated CSV path).
output_path = "../../data/processed/gdsc_pancancer_embeddings_30_pcs.parquet"
merged.to_parquet(output_path, index=False)
print("💾 Saved to:", output_path)


✅ Merged with bulk embeddings:
Shape: (84683, 33)
  SANGER_MODEL_ID  DRUG_ID   LN_IC50       PC1       PC2        PC3       PC4  \
0       SIDM01111        1  2.033564 -5.927806  2.096332 -12.498588  6.431199   
1       SIDM00965        1  3.047965 -6.875136  2.150296 -11.770959  2.239541   
2       SIDM00921        1  3.009075 -4.583251 -3.857921  -3.565151  5.554976   

         PC5       PC6       PC7  ...      PC21      PC22      PC23      PC24  \
0 -11.229197 -0.144861 -7.687607  ... -1.172920 -3.893046  1.233242 -0.689263   
1  -4.746090 -6.712371 -4.839766  ...  0.812725 -0.839441  0.278099 -1.440266   
2   2.155770  2.658704 -5.520007  ... -0.692494  2.679631  1.143657  1.657525   

       PC25      PC26      PC27      PC28      PC29      PC30  
0 -0.280161 -0.979857  0.066761 -0.991211  2.449783 -1.100517  
1  1.127867  0.496000  0.403855 -0.737856  0.576991 -0.337974  
2  0.374371  0.350960  0.499228  0.249566 -0.499244  0.014701  

[3 rows x 33 columns]
💾 Saved to: data/trai