In [8]:
import pandas as pd


In [None]:
# Load both DataFrames
gene_pca_df = pd.read_parquet("../../data/processed/gdsc_pancancer_embeddings_30_pcs.parquet")
emb_pca_df = pd.read_parquet("../../data/processed/gdsc_pancancer_pseudobulk_30_pcs.parquet")

# Rename PCA columns in gene_pca_df
gene_pca_cols = [col for col in gene_pca_df.columns if col.startswith("SCF_PC")]
voom_renaming = {col: col.replace("SCF_PC", "VOOM_PC") for col in gene_pca_cols}
gene_pca_df = gene_pca_df.rename(columns=voom_renaming)

# Confirm
print("✅ Renamed columns in gene_pca_df:", list(voom_renaming.values())[:5], "...")


✅ Renamed columns in gene_pca_df: [] ...


In [14]:
# Identify merge keys
merge_keys = ["SANGER_MODEL_ID", "DRUG_ID"]

# Identify PCA columns in embeddings DataFrame
emb_pca_cols = [col for col in emb_pca_df.columns if col.startswith("SC_PC")]

print("✅ Embeddings PCA columns:", emb_pca_cols[:5], "...")


✅ Embeddings PCA columns: ['SC_PC1', 'SC_PC2', 'SC_PC3', 'SC_PC4', 'SC_PC5'] ...


In [15]:
# Keep only merge keys + PCA columns for merging
emb_pca_keep = emb_pca_df[merge_keys + emb_pca_cols].copy()

print("✅ Prepared embeddings DataFrame for merging:", emb_pca_keep.shape)


✅ Prepared embeddings DataFrame for merging: (84683, 32)


In [16]:
# Merge gene PCA (renamed) with embeddings PCA
merged_df = gene_pca_df.merge(
    emb_pca_keep,
    on=merge_keys,
    how="inner"
)

# Confirm structure
print("✅ Merged shape:", merged_df.shape)
print("✅ Columns preview:", merged_df.columns[:10].tolist(), "...")
print(merged_df.head())


✅ Merged shape: (84683, 63)
✅ Columns preview: ['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7'] ...
  SANGER_MODEL_ID  DRUG_ID   LN_IC50       PC1        PC2        PC3  \
0       SIDM01111        1  2.033564 -5.927806   2.096332 -12.498588   
1       SIDM00965        1  3.047965 -6.875136   2.150296 -11.770959   
2       SIDM00921        1  3.009075 -4.583251  -3.857921  -3.565151   
3       SIDM00395        1  3.381088 -2.114197  10.887882 -12.880151   
4       SIDM00369        1 -1.395483  7.081503  -0.773096   0.600023   

         PC4        PC5       PC6       PC7  ...   SC_PC21   SC_PC22  \
0   6.431199 -11.229197 -0.144861 -7.687607  ...  1.387883 -0.821981   
1   2.239541  -4.746090 -6.712371 -4.839766  ...  1.058641  3.336473   
2   5.554976   2.155770  2.658704 -5.520007  ...  1.501859 -1.195208   
3  13.894074 -13.900619 -2.299033 -2.982558  ... -3.784482  1.070083   
4  -1.613353   5.353582  1.364210 -0.744943  ...  3.252580  1.156

In [17]:
# Save to parquet
merged_df.to_parquet("../../data/processed/gdsc_pancancer_embeddings_expression_30_pcs.parquet", index=False)
print("✅ Saved concatenated PCA feature table to '../../data/processed/gdsc_pancancer_embeddings_30_pcs.parquet'.")


✅ Saved concatenated PCA feature table to '../../data/processed/gdsc_pancancer_embeddings_30_pcs.parquet'.
