In [1]:
import pandas as pd


In [7]:
# Load both PCA feature tables from the processed-data directory and preview them.
# NOTE(review): `gene_pca_df` is read from the "*embeddings*" file and `emb_pca_df`
# from the "*voom*" file, while the rename below targets "VOOM_PC" on `gene_pca_df`.
# Confirm the paths / variable names are not swapped.
gene_pca_df = pd.read_parquet("../../data/processed/gdsc_pancancer_embeddings_30_pcs.parquet")
print(gene_pca_df.head())

emb_pca_df = pd.read_parquet("../../data/processed/gdsc_pseudobulk_voom_30_pcs.parquet")
print(emb_pca_df.head())

# Rename PCA columns in gene_pca_df: every column whose name starts with "SCF_PC"
# gets that prefix replaced by "VOOM_PC". Columns that do not match are left as-is.
gene_pca_cols = [col for col in gene_pca_df.columns if col.startswith("SCF_PC")]
voom_renaming = {col: col.replace("SCF_PC", "VOOM_PC") for col in gene_pca_cols}
gene_pca_df = gene_pca_df.rename(columns=voom_renaming)

# Confirm — but only claim success when something was actually renamed.
# In the recorded run the feature columns were named '0', '1', ... so the filter
# above matched nothing and the rename was a silent no-op.
if voom_renaming:
    print("✅ Renamed columns in gene_pca_df:", list(voom_renaming.values())[:5], "...")
else:
    print(
        "⚠️ No columns starting with 'SCF_PC' found in gene_pca_df; nothing was renamed.",
        "Columns preview:", gene_pca_df.columns[:10].tolist(), "...",
    )


  SANGER_MODEL_ID  DRUG_ID   LN_IC50         0         1         2          3  \
0       SIDM01111        1  2.033564 -2.465400  1.202431 -0.944438  -9.463418   
1       SIDM00965        1  3.047965 -3.944899  0.260618 -0.084734  -5.278831   
2       SIDM00921        1  3.009075 -2.428348  3.057883 -6.248041   0.675703   
3       SIDM00395        1  3.381088 -2.083047  9.835143  3.641217 -11.338659   
4       SIDM00369        1 -1.395483  7.560895  4.969285  0.769833   4.234478   

           4         5         6  ...        20        21        22        23  \
0   7.780813 -1.961875 -1.021320  ...  1.132296  4.170240 -2.783202  1.540082   
1  11.148554 -5.640492  2.199688  ...  1.640393  0.575384 -0.514569  2.981020   
2   0.930572 -1.085604  2.428111  ... -0.584436  0.425164  1.251663  0.168285   
3   4.221368 -9.624701 -8.043923  ...  1.982270  1.046272  2.171780 -1.251443   
4  -0.730354 -3.121336  1.328352  ...  1.308641 -1.104988  0.215362  1.251073   

         24        25     

In [8]:
# Columns that both tables are joined on.
merge_keys = ["SANGER_MODEL_ID", "DRUG_ID"]

# Collect every column of the embeddings table whose name begins with "PC".
emb_pca_cols = list(filter(lambda name: name.startswith("PC"), emb_pca_df.columns))

print("✅ Embeddings PCA columns:", emb_pca_cols[:5], "...")


✅ Embeddings PCA columns: ['PC1', 'PC2', 'PC3', 'PC4', 'PC5'] ...


In [9]:
# Trim the embeddings table down to the join keys plus its PC columns.
# The result is an independent copy, so later edits do not touch emb_pca_df.
emb_pca_keep = emb_pca_df.loc[:, [*merge_keys, *emb_pca_cols]].copy()

print("✅ Prepared embeddings DataFrame for merging:", emb_pca_keep.shape)


✅ Prepared embeddings DataFrame for merging: (84683, 32)


In [10]:
# Inner-join the gene PCA table with the trimmed embeddings table on the merge keys;
# only key combinations present in both tables survive.
merged_df = pd.merge(gene_pca_df, emb_pca_keep, how="inner", on=merge_keys)

# Show the resulting shape, the first ten column names, and the first rows.
print("✅ Merged shape:", merged_df.shape)
print("✅ Columns preview:", merged_df.columns.tolist()[:10], "...")
print(merged_df.head())


✅ Merged shape: (84683, 63)
✅ Columns preview: ['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50', '0', '1', '2', '3', '4', '5', '6'] ...
  SANGER_MODEL_ID  DRUG_ID   LN_IC50         0         1         2          3  \
0       SIDM01111        1  2.033564 -2.465400  1.202431 -0.944438  -9.463418   
1       SIDM00965        1  3.047965 -3.944899  0.260618 -0.084734  -5.278831   
2       SIDM00921        1  3.009075 -2.428348  3.057883 -6.248041   0.675703   
3       SIDM00395        1  3.381088 -2.083047  9.835143  3.641217 -11.338659   
4       SIDM00369        1 -1.395483  7.560895  4.969285  0.769833   4.234478   

           4         5         6  ...       PC21       PC22       PC23  \
0   7.780813 -1.961875 -1.021320  ...  -5.437439 -24.638963  -5.188729   
1  11.148554 -5.640492  2.199688  ...   2.719752  -3.145686   6.335053   
2   0.930572 -1.085604  2.428111  ...   1.018955 -37.570435   6.468278   
3   4.221368 -9.624701 -8.043923  ...  14.913421 -10.509331 -52.265645   
4  -0.730354 -3

In [1]:
# Save to parquet
# Depends on `merged_df` produced by the merge cell in the same kernel session.
# The recorded NameError came from running this cell with execution count 1,
# i.e. on a kernel where `merged_df` had not been defined yet.
output_path = "../../data/processed/gdsc_pancancer_conc_30_pcs.parquet"

# The path is defined once so the confirmation message always names the file
# that was actually written. The DataFrame index is not stored (index=False).
merged_df.to_parquet(output_path, index=False)
print(f"✅ Saved concatenated PCA feature table to '{output_path}'.")


NameError: name 'merged_df' is not defined