In [1]:
import pandas as pd


In [2]:
# Input locations, given relative to the notebook's working directory.
gdsc_path = "../../../../bulk_state_of_the_art/data/processed/gdsc_final_cleaned.parquet"
embeddings_path = "../../data/processed/pancancer_embeddings_30_pcs.parquet"

# GDSC drug-response table.
gdsc_df = pd.read_parquet(gdsc_path)
# Embeddings table that is joined to the GDSC rows in a later cell.
df = pd.read_parquet(embeddings_path)

# Report the shape of each loaded frame as a quick sanity check.
print("✅ Loaded data:")
for label, frame in (("GDSC shape:", gdsc_df), ("Bulk embeddings:", df)):
    print(label, frame.shape)


✅ Loaded data:
GDSC shape: (571985, 3)
Bulk embeddings: (140, 31)


In [3]:
# Columns the downstream cells depend on; listed once and reused below.
key_columns = ["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"]
required_cols = set(key_columns)

# Show which columns the loaded table actually has before validating it.
print("📌 GDSC columns:", gdsc_df.columns)

# Fail fast when any required column is absent from the table.
missing_cols = required_cols - set(gdsc_df.columns)
assert not missing_cols, "❌ GDSC dataset missing required columns!"

# Discard rows with a null in any key column, then store the model
# identifier as str.
gdsc_df = (
    gdsc_df
    .dropna(subset=key_columns)
    .assign(SANGER_MODEL_ID=lambda frame: frame["SANGER_MODEL_ID"].astype(str))
)

print("✅ Cleaned GDSC data:", gdsc_df.shape)


📌 GDSC columns: Index(['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50'], dtype='object')
✅ Cleaned GDSC data: (571985, 3)


In [4]:
# Attach the embedding columns to every GDSC row that shares a
# SANGER_MODEL_ID with the embeddings table. The inner join drops GDSC rows
# whose model has no embedding, and embeddings with no GDSC measurements.
merged = gdsc_df.merge(df, on="SANGER_MODEL_ID", how="inner")

print("✅ Merged with bulk embeddings:")
print("Shape:", merged.shape)
print(merged.head(3))

# Persist the merged table. The path is held in one variable so the
# confirmation message always names the file that was actually written
# (the previous message pointed at an unrelated CSV path).
output_path = "../../data/processed/gdsc_pancancer_embeddings_30_pcs.parquet"
merged.to_parquet(output_path, index=False)
print(f"💾 Saved to: {output_path}")


✅ Merged with bulk embeddings:
Shape: (84683, 33)
  SANGER_MODEL_ID  DRUG_ID   LN_IC50         0         1         2         3  \
0       SIDM01111        1  2.033564 -2.465400  1.202431 -0.944438 -9.463418   
1       SIDM00965        1  3.047965 -3.944899  0.260618 -0.084734 -5.278831   
2       SIDM00921        1  3.009075 -2.428348  3.057883 -6.248041  0.675703   

           4         5         6  ...        20        21        22        23  \
0   7.780813 -1.961875 -1.021320  ...  1.132296  4.170240 -2.783202  1.540082   
1  11.148554 -5.640492  2.199688  ...  1.640393  0.575384 -0.514569  2.981020   
2   0.930572 -1.085604  2.428111  ... -0.584436  0.425164  1.251663  0.168285   

         24        25        26       27        28        29  
0  2.382949  0.333869  0.671884  2.00359 -0.020226  0.290388  
1 -0.428230 -0.465656  0.599798  0.71557 -1.602822 -0.792969  
2  0.872158 -0.321614 -0.077460  0.06201  0.284773  0.079346  

[3 rows x 33 columns]
💾 Saved to: data/training_dat