In [7]:
import pandas as pd


In [8]:
# Input locations, named once so the reads below stay short.
gdsc_path = "../../../../bulk_state_of_the_art/data/processed/gdsc_final_cleaned.parquet"
embeddings_path = "../../data/processed/pancancer_scran_norm_top8000_30_pcs.parquet"

# GDSC drug-response table.
gdsc_df = pd.read_parquet(gdsc_path)
# Embedding table (reported as "Bulk embeddings" in the summary below).
df = pd.read_parquet(embeddings_path)

# Report the (rows, columns) shape of each frame that was just read.
print("✅ Loaded data:")
for label, frame in (("GDSC shape:", gdsc_df), ("Bulk embeddings:", df)):
    print(label, frame.shape)


✅ Loaded data:
GDSC shape: (571985, 3)
Bulk embeddings: (140, 30)


In [9]:
# Show which columns the GDSC table actually carries.
print("📌 GDSC columns:", gdsc_df.columns)

# Columns the rest of this notebook relies on; fail early, naming whichever
# ones are absent, instead of hitting a KeyError further down.
required_cols = {"SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"}
missing_cols = required_cols - set(gdsc_df.columns)
assert not missing_cols, (
    f"❌ GDSC dataset missing required columns! Missing: {sorted(missing_cols)}"
)

# Drop rows with a missing value in any required column, then cast the ID
# column to str. Doing both in one chain builds a new frame via `.assign`
# rather than writing a column into the result of a row filter, which pandas
# may flag with SettingWithCopyWarning.
gdsc_df = (
    gdsc_df
    .dropna(subset=sorted(required_cols))
    .assign(SANGER_MODEL_ID=lambda frame: frame["SANGER_MODEL_ID"].astype(str))
)

print("✅ Cleaned GDSC data:", gdsc_df.shape)


📌 GDSC columns: Index(['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50'], dtype='object')
✅ Cleaned GDSC data: (571985, 3)


In [10]:
# Join the drug-response rows to the embedding table on the shared
# SANGER_MODEL_ID key; the inner join keeps only IDs present in both frames.
merged = gdsc_df.merge(df, on="SANGER_MODEL_ID", how="inner")

# An inner join with no overlapping IDs returns zero rows without raising;
# stop here rather than writing an empty file.
assert not merged.empty, "❌ Merge produced no rows — check SANGER_MODEL_ID values in both tables!"

print("✅ Merged with bulk embeddings:")
print("Shape:", merged.shape)
print(merged.head(3))

# Save, and report the path that was actually written (the previous message
# named an unrelated CSV file).
output_path = "../../data/processed/gdsc_pancancer_scran_norm_top8000_30_pcs.parquet"
merged.to_parquet(output_path, index=False)
print("💾 Saved to:", output_path)


✅ Merged with bulk embeddings:
Shape: (84683, 33)
  SANGER_MODEL_ID  DRUG_ID   LN_IC50        PC1        PC2        PC3  \
0       SIDM01111        1  2.033564  14.349047 -15.438894 -16.920568   
1       SIDM00965        1  3.047965 -26.882206 -15.254739   0.895702   
2       SIDM00921        1  3.009075  -1.564049  11.030488 -13.788393   

         PC4        PC5        PC6        PC7  ...       PC21       PC22  \
0  -6.622880 -45.079546  -4.714353  16.416442  ...   4.665252   4.232623   
1   3.618226 -35.574114 -10.524117  22.682785  ... -15.790137 -10.618848   
2  28.695032 -28.532984   6.682863  -0.432798  ...   8.170706  -0.438329   

        PC23       PC24       PC25      PC26       PC27      PC28      PC29  \
0  23.387031  -2.679159   3.743278 -0.291716  -9.571731  9.949305  6.658633   
1   5.793209  -8.035947 -14.255610 -3.835953  12.236363  2.128316 -1.957364   
2   4.870733 -12.646082  -9.495665 -4.453162   6.780867 -1.465400  2.992637   

        PC30  
0  -2.719322  
1  -4