# 🔗 Merge PCA Embeddings with Drug Response Dataset

In this notebook, we merge the PCA-transformed scFoundation embeddings with our drug response bulk dataset using the shared `SANGER_MODEL_ID`. This prepares the data for downstream modeling.


In [1]:
import polars as pl
import pandas as pd


## 📥 Load Datasets

We load both:
- `bulk_with_pca.parquet`: the drug response dataset (`SANGER_MODEL_ID`, `DRUG_ID`, `LN_IC50`) together with its 20 bulk PCA features (`PC1`–`PC20`).
- `scfoundation_bulk_pca_top{N}.parquet`: the selected PCA-transformed scFoundation embeddings.


In [2]:
# Input locations, relative to this notebook's directory.
bulk_path = "../../data/bulk/bulk_with_pca.parquet"
scf_pca_path = "../../data/embeddings/scfoundation_bulk_pca_top30.parquet"  # ← adjust as needed

# Read both Parquet files in one pass: drug response table first, embeddings second.
bulk_df, scf_pcs_df = (pl.read_parquet(path) for path in (bulk_path, scf_pca_path))

# Report (rows, columns) of each frame as a quick sanity check on the loads.
for label, frame in (("Bulk shape:", bulk_df), ("scFoundation PCs shape:", scf_pcs_df)):
    print(label, frame.shape)


Bulk shape: (571985, 23)
scFoundation PCs shape: (1431, 31)


## 🔗 Merge on SANGER_MODEL_ID

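We merge the two datasets on their shared cell line identifier, `SANGER_MODEL_ID`. Because this is an inner join, any drug response row whose cell line has no scFoundation embedding is dropped; here all 571,985 rows are retained.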


In [3]:
# Each drug-response row should pick up exactly one embedding row, so the
# embedding table must hold one row per SANGER_MODEL_ID. Duplicate keys would
# silently multiply drug-response rows in the join, so fail loudly instead.
n_scf_rows = scf_pcs_df.height
n_scf_keys = scf_pcs_df["SANGER_MODEL_ID"].n_unique()
if n_scf_keys != n_scf_rows:
    raise ValueError(
        f"scf_pcs_df has {n_scf_rows} rows but only {n_scf_keys} unique SANGER_MODEL_ID values"
    )

# Inner join: keeps only drug-response rows whose cell line has an embedding.
merged_df = bulk_df.join(scf_pcs_df, on="SANGER_MODEL_ID", how="inner")

# Surface rows lost to the inner join rather than dropping them silently.
n_dropped = bulk_df.height - merged_df.height
if n_dropped:
    print(f"⚠️ Dropped {n_dropped} bulk rows with no matching scFoundation embedding")

print("✅ Merged shape:", merged_df.shape)
merged_df.head()


✅ Merged shape: (571985, 53)


SANGER_MODEL_ID,DRUG_ID,LN_IC50,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,SCF_PC1,SCF_PC2,SCF_PC3,SCF_PC4,SCF_PC5,SCF_PC6,SCF_PC7,SCF_PC8,SCF_PC9,SCF_PC10,SCF_PC11,SCF_PC12,SCF_PC13,SCF_PC14,SCF_PC15,SCF_PC16,SCF_PC17,SCF_PC18,SCF_PC19,SCF_PC20,SCF_PC21,SCF_PC22,SCF_PC23,SCF_PC24,SCF_PC25,SCF_PC26,SCF_PC27,SCF_PC28,SCF_PC29,SCF_PC30
str,i32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""SIDM00374""",1009,4.13448,-28.505416,15.954326,-15.403949,77.513751,-0.946082,-6.623787,-0.623556,-10.833356,38.380781,20.099759,2.175345,-16.464194,-10.640596,20.195702,9.752242,14.056471,53.575456,15.126258,3.522584,7.078355,11.742195,5.695343,-0.980334,1.374318,1.677247,-0.751574,2.265758,0.109941,1.176413,-1.322149,-0.528211,-0.990986,0.875835,-1.111073,0.924814,-0.413847,0.053407,-0.414999,0.509911,-1.369889,0.06681,-0.710609,2.056372,-0.384537,-2.237147,-0.324567,0.040354,-0.083504,0.197229,-0.108249
"""SIDM00255""",268,-2.236015,-72.539809,70.288927,47.149003,-35.875317,-14.800827,-61.821066,-22.695374,9.278978,-14.422315,11.052211,16.081292,-8.262185,6.421972,55.240793,-0.754531,-31.702417,11.489405,-33.517716,14.455342,24.926537,3.332758,-9.180953,5.904081,-2.008322,-2.860039,-1.119856,0.428514,1.568974,1.41685,0.53232,1.367796,0.047008,1.239683,-1.417038,0.879389,0.679442,0.825101,0.434853,0.623342,0.910059,0.424401,1.252568,0.261637,-1.331824,-1.260733,0.094333,-0.012018,0.273703,0.18038,0.306647
"""SIDM01182""",1012,1.321538,-11.762381,-15.510162,-13.228404,-12.442003,29.096062,2.482356,1.617503,27.191232,-0.356424,-9.37755,1.051495,6.730509,0.671253,15.4325,-1.134384,4.364938,-5.846964,14.106168,-7.001153,-7.485627,0.536576,-5.31786,0.857577,-1.52326,0.952468,-1.815738,-0.326957,-2.981958,-0.331892,0.252378,-1.03911,-2.24098,0.318295,0.711528,0.414811,0.843296,-1.001374,0.66648,0.639443,-0.639352,0.024757,0.703827,-0.793418,0.607032,0.52742,-0.343146,-0.18588,-0.493611,-0.034714,0.672391
"""SIDM01160""",1023,3.875126,0.441912,-3.967296,31.176757,19.107893,-30.850983,25.624592,-11.967721,-9.186016,-5.26644,-4.365537,0.739011,-0.654428,-5.116252,-5.523746,5.747823,-10.201446,1.273372,-11.593635,-0.940643,9.067345,-2.219563,3.435795,-0.59132,-6.10627,-1.400329,3.127556,2.096093,0.052474,1.649546,-0.801234,-0.326037,0.456042,1.090816,1.426403,0.393986,1.265444,1.079708,-0.139628,-0.667466,-1.38687,0.605876,0.063188,-0.164596,0.310941,0.279967,0.357943,0.244326,-0.219723,1.020183,-0.20317
"""SIDM00547""",1197,4.457386,4.734531,-15.370917,-26.561868,-8.833916,6.84145,17.041301,-7.185372,6.106195,-9.474964,12.378004,1.889727,-4.694882,7.752625,3.787118,-6.225207,9.836332,-1.443811,1.359039,-9.641303,2.451467,-4.084301,3.194926,-1.897031,2.301683,-2.020154,-1.465294,-0.101874,-0.808305,-1.192596,-0.198156,-1.752649,-0.036972,-2.315455,0.615455,-0.142696,0.112447,-0.815359,0.011182,-0.59948,0.7406,-1.416897,0.318786,-1.222698,-2.190403,0.320682,-0.069942,0.374796,-0.667876,-0.497415,0.16118


## 💾 Save Merged Dataset

We'll save the merged dataset to a new Parquet file for downstream training and evaluation.


In [5]:
# Derive the "top{N}" tag from the scFoundation PC columns actually present in
# merged_df, so the output filename cannot drift from the embeddings that were
# merged when the input path is changed.
n_scf_pcs = sum(col.startswith("SCF_PC") for col in merged_df.columns)
if n_scf_pcs == 0:
    raise ValueError("merged_df has no SCF_PC* columns; refusing to save an unmerged dataset")

output_path = f"../../data/bulk/bulk_with_scfoundation_pca_top{n_scf_pcs}.parquet"
merged_df.write_parquet(output_path)
print(f"✅ Saved to {output_path}")


✅ Saved to ../../data/bulk/bulk_with_scfoundation_pca_top30.parquet
