# 🔗 Merge PCA Embeddings with Drug Response Dataset

In this notebook, we merge the PCA-transformed scFoundation embeddings with our drug response bulk dataset using the shared `SANGER_MODEL_ID`. This prepares the data for downstream modeling.


In [1]:
import polars as pl
import pandas as pd


## 📥 Load Datasets

We load both:
- `bulk_voom_pca.parquet`: the original drug response dataset with PCA features.
- `scfoundation_bulk_pca_top{N}.parquet`: the selected PCA-transformed scFoundation embeddings.


In [3]:
# Input locations, relative to this notebook's directory.
bulk_path = "../../data/processed/bulk_voom_pca.parquet"
scf_pca_path = "../../data/embeddings/scfoundation_bulk_pca_top30.parquet"  # ← adjust as needed

# Read both Parquet files into polars DataFrames.
bulk_df, scf_pcs_df = pl.read_parquet(bulk_path), pl.read_parquet(scf_pca_path)

# Report (rows, columns) for each table as a quick sanity check.
for _label, _frame in (
    ("Bulk shape:", bulk_df),
    ("scFoundation PCs shape:", scf_pcs_df),
):
    print(_label, _frame.shape)


Bulk shape: (571985, 33)
scFoundation PCs shape: (1362, 31)


## 🔗 Merge on SANGER_MODEL_ID

We merge the two datasets using their shared cell line identifier.


In [4]:
# Attach the scFoundation PCs to every drug-response row via the shared cell
# line identifier. The join is declared many-to-one ("m:1") so polars raises if
# SANGER_MODEL_ID is duplicated in scf_pcs_df, instead of silently multiplying
# drug-response rows.
merged_df = bulk_df.join(
    scf_pcs_df,
    on="SANGER_MODEL_ID",
    how="inner",
    validate="m:1",
)

# An inner join discards drug-response rows whose cell line has no embedding;
# surface that count so the loss is never silent.
n_dropped = bulk_df.height - merged_df.height
if n_dropped:
    print(f"⚠️ {n_dropped} bulk rows had no matching scFoundation embedding and were dropped")

print("✅ Merged shape:", merged_df.shape)
merged_df.head()


✅ Merged shape: (571985, 63)


SANGER_MODEL_ID,DRUG_ID,LN_IC50,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,SCF_PC1,SCF_PC2,SCF_PC3,SCF_PC4,SCF_PC5,SCF_PC6,SCF_PC7,SCF_PC8,SCF_PC9,SCF_PC10,SCF_PC11,SCF_PC12,SCF_PC13,SCF_PC14,SCF_PC15,SCF_PC16,SCF_PC17,SCF_PC18,SCF_PC19,SCF_PC20,SCF_PC21,SCF_PC22,SCF_PC23,SCF_PC24,SCF_PC25,SCF_PC26,SCF_PC27,SCF_PC28,SCF_PC29,SCF_PC30
str,i32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00263""",1,3.966813,-143.778935,22.048178,4.280085,19.168067,50.931954,23.76384,35.105506,-7.557743,-25.872173,-2.265482,15.551653,28.878574,0.729251,-2.310477,35.078013,6.821901,0.43975,24.288151,38.4536,3.593022,-13.151688,-18.905431,0.039227,-15.913891,13.568836,1.8595,-5.765744,0.822856,12.114901,-6.122686,-28.846116,197.069926,-19.870734,44.943251,120.252984,-11.736488,68.092467,-29.633144,91.464971,41.620078,52.285631,-35.3301,-32.147308,6.214343,29.959419,-7.097029,38.830909,-74.465648,52.187677,-78.937371,-6.240742,-7.679112,25.018095,19.223719,43.85122,12.267507,-27.04944,0.123455,-35.590873,25.689015
"""SIDM00269""",1,2.69209,-111.73255,14.364681,-0.653152,25.571131,23.467472,31.98018,16.000345,-7.928904,-22.619169,2.675483,17.459836,8.891877,23.599511,5.672281,33.762119,0.189518,-5.222018,20.134318,40.333982,2.107782,-14.876624,-7.578567,-11.483378,5.171668,6.457112,-0.230956,-5.694266,10.885021,11.310926,-9.606058,-32.939312,178.0382,-36.558014,76.119477,61.616892,-41.88993,48.596424,-37.190455,82.099426,46.702968,27.989684,31.634995,-29.200714,31.125367,13.706163,23.301741,55.576769,-62.513778,71.743581,-95.87785,-11.069128,-19.595294,6.265651,17.007056,-6.088063,-8.697047,-40.467749,-24.798641,-56.610817,17.283725
"""SIDM00203""",1,2.47799,-148.006587,19.783259,8.130793,25.784234,40.346497,31.19686,1.732747,-9.666782,-25.915788,0.8522,24.626296,12.840916,4.213448,1.068404,29.450339,0.9981,-15.276629,9.182074,53.597773,15.451569,-21.942138,7.481636,1.204341,1.737601,10.989763,3.207945,-0.843249,0.481544,4.961943,-1.700099,-50.438404,224.057089,-11.252632,68.074396,118.967199,-35.606544,15.422671,-40.264347,75.286824,31.04061,-0.754466,-17.128703,-32.427248,18.577229,10.784739,29.71613,58.01021,-71.174012,81.13907,-113.359174,-15.710645,6.542916,-22.992477,27.231422,19.003388,-9.602187,-16.024635,12.005549,-22.627371,16.511505
"""SIDM01111""",1,2.033564,15.118393,12.841382,15.732082,41.373588,37.787587,0.417079,-4.834522,-6.587268,-27.756335,-1.28612,11.462113,10.154481,13.119134,-6.688721,25.686997,7.416178,-22.239027,8.614505,43.671845,0.069962,-11.825064,-1.911688,-6.976662,1.358369,7.157779,-2.333678,1.894014,8.626621,5.491428,6.344718,29.76666,21.063254,40.867959,121.35719,114.116486,0.979702,-9.181698,-47.983472,68.70998,23.675246,3.372241,10.180311,-17.352855,21.749337,2.267221,19.96751,41.28069,-99.118645,71.601726,-98.911898,-10.254515,1.635299,-0.909957,19.593254,-1.49993,3.332152,-8.211007,-15.952711,-30.330544,0.927991
"""SIDM00909""",1,2.966007,78.295151,13.048277,-6.142006,30.198731,-12.473088,20.035509,-11.306615,34.484778,12.678192,-12.416095,23.642698,1.597616,-4.026455,12.684305,-6.056986,23.436657,7.150125,-1.144317,4.565311,-2.37132,5.29878,-5.207006,1.686584,-1.352605,6.404186,-0.311345,4.75044,8.890706,7.572978,-3.559355,82.63295,-44.422875,-0.794799,107.392113,-21.778333,-63.576354,-26.693674,129.934174,50.581084,6.868829,-33.734528,-16.218644,5.256505,-36.967547,-39.589633,0.016937,4.041995,-12.705802,-3.540488,3.325514,19.854883,-12.721371,-3.329058,6.16778,-2.854354,9.516999,4.914386,-12.083485,-21.174966,9.79956


## 💾 Save Merged Dataset

We'll save the merged dataset to a new Parquet file for downstream training and evaluation.


In [6]:
# Destination for the merged table. The "30"s in the file name are written by
# hand, so they must be kept in sync manually with the inputs that were merged.
# (Plain string: the previous f-prefix had no placeholders to interpolate.)
output_path = "../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs.parquet"  # adjust number if needed

# Write the merged DataFrame to Parquet and confirm where it was written.
merged_df.write_parquet(output_path)
print(f"✅ Saved to {output_path}")


✅ Saved to ../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs.parquet
