# 🧬 PCA + Merge: Pseudo-Bulk Datasets

This notebook loads the pan-cancer and breast cancer pseudo-bulk expression matrices, applies PCA to the combined data, and merges the resulting principal components into the main GDSC dataset (joined on `SANGER_MODEL_ID`) for downstream modeling.


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl


## 📥 Load Pancancer and Breast Cancer Pseudo-Bulk Expression Data


In [None]:
# Read each pseudo-bulk expression matrix (the first CSV column becomes the
# row index) and tag every row with the dataset it came from.
pan_df = pd.read_csv(
    "pancancer_pseudo_bulk_expression.csv", index_col=0
).assign(pseudo_source="pancancer")
breast_df = pd.read_csv(
    "breast_cancer_pseudo_bulk_expression.csv", index_col=0
).assign(pseudo_source="breast")

# Report the (rows, columns) of each frame, tag column included.
print("✅ Loaded pancancer:", pan_df.shape)
print("✅ Loaded breast:", breast_df.shape)


## 🧹 Combine and Apply PCA
We combine both datasets and apply PCA to the expression values only, excluding label/identifier columns.


In [None]:
NUM_PCS = 30

# Stack the two sources row-wise. join="inner" keeps only the columns present
# in BOTH matrices: the default outer join would pad any non-shared column
# with NaN, and PCA cannot be fitted on NaN values.
combined = pd.concat([pan_df, breast_df], join="inner")
n_unshared = len(pan_df.columns.symmetric_difference(breast_df.columns))
if n_unshared:
    print(f"⚠️ Ignoring {n_unshared} columns not shared by both datasets")

# PCA runs on the expression values only, so drop the source tag.
expr_data = combined.drop(columns=["pseudo_source"])

# Fit PCA and project every row onto the first NUM_PCS components
# (fixed random_state for reproducibility).
pca = PCA(n_components=NUM_PCS, random_state=42)
pcs = pca.fit_transform(expr_data)

# Rebuild a DataFrame: one PSEUDO_PC<k> column per component, plus the row
# identifier and source tag copied positionally from `combined`.
# NOTE(review): this assumes the CSV index holds SANGER_MODEL_ID values
# (rows = models, columns = genes) — confirm against the input files.
pcs_df = pd.DataFrame(pcs, columns=[f"PSEUDO_PC{i+1}" for i in range(NUM_PCS)])
pcs_df["SANGER_MODEL_ID"] = combined.index
pcs_df["pseudo_source"] = combined["pseudo_source"].values

print("✅ PCA complete. Shape:", pcs_df.shape)


## 📊 Explained Variance from PCA


In [None]:
# Percentage of variance captured by each fitted component. The bar positions
# are derived from the fitted model itself rather than from the NUM_PCS
# constant, so the plot cannot disagree with `pca` if that constant is edited
# without re-fitting.
variance_pct = pca.explained_variance_ratio_ * 100
component_numbers = np.arange(1, len(variance_pct) + 1)

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=component_numbers, y=variance_pct, color="steelblue", ax=ax)
ax.set_title("Explained Variance by Pseudo-Bulk PCA Components")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Explained (%)")
ax.grid(True)
fig.tight_layout()
plt.show()


## 🔗 Merge PCA with Drug Response Labels (GDSC)


In [None]:
gdsc_path = "../../../data/bulk/bulk_with_pca.parquet"
gdsc_df = pl.read_parquet(gdsc_path).to_pandas()

# A SANGER_MODEL_ID that occurs more than once in pcs_df makes the join repeat
# every matching GDSC row once per occurrence. Surface that instead of letting
# the row count grow silently.
n_repeated_ids = int(pcs_df["SANGER_MODEL_ID"].duplicated().sum())
if n_repeated_ids:
    print(
        f"⚠️ {n_repeated_ids} repeated SANGER_MODEL_ID rows in pcs_df; "
        "matching GDSC rows will be duplicated"
    )

# Inner join: only GDSC rows whose model has pseudo-bulk PCs are kept.
merged_df = gdsc_df.merge(pcs_df, on="SANGER_MODEL_ID", how="inner")

# An empty result means no identifier matched at all; stop here rather than
# letting a later step write an empty dataset to disk.
if merged_df.empty:
    raise ValueError(
        "Merge on SANGER_MODEL_ID produced no rows; check that the pseudo-bulk "
        "index contains the same model IDs as the GDSC table."
    )

# Report how many distinct GDSC models survived the inner join.
n_models_total = gdsc_df["SANGER_MODEL_ID"].nunique()
n_models_kept = merged_df["SANGER_MODEL_ID"].nunique()
print(f"ℹ️ Kept {n_models_kept}/{n_models_total} GDSC models with pseudo-bulk PCs")

print("✅ Merged dataset shape:", merged_df.shape)
merged_df.head()


## 💾 Save Merged Dataset


In [None]:
# Destination for the GDSC table enriched with the pseudo-bulk PCs.
output_path = "../../../data/bulk/bulk_with_pseudobulk_pca.parquet"

# Convert the pandas frame to polars, then write it out as Parquet.
merged_pl = pl.DataFrame(merged_df)
merged_pl.write_parquet(output_path)

print(f"📁 Saved to {output_path}")
