In [3]:
import pandas as pd

# 📥 Load the scran-normalized expression matrix from parquet (cells × genes)
umi_df = pd.read_parquet("../../data/processed/pancancer_scran_norm_top8000.parquet")
print("✅ Loaded scran-normalized data:", umi_df.shape)

# 📥 Load the mapping table (cell line name ↔ SANGER_MODEL_ID); exact duplicate rows are removed.
# NOTE(review): the columns are renamed purely by position, so the CSV is assumed to
# contain exactly two columns with the model ID first and the cell line name second —
# confirm against the file.
mapping_df = pd.read_csv("../../data/cell_sanger_map.csv").drop_duplicates()
mapping_df.columns = ['SANGER_MODEL_ID', 'CELL_LINE_NAME']
# Normalized join key: hyphens removed, upper-cased.
mapping_df['CELL_LINE_NAME_NORM'] = (
    mapping_df['CELL_LINE_NAME'].str.replace('-', '', regex=False).str.upper()
)

# 🔍 Extract the cell line name from each cell barcode (the text before the first "_")
# and normalize it the same way as the mapping table.
umi_df.index = umi_df.index.astype(str)
cell_line = umi_df.index.str.split('_').str[0]
cell_line_norm = cell_line.str.replace('-', '', regex=False).str.upper()

# 🗺️ Map normalized names to SANGER_MODEL_ID.
# dict(zip(...)) keeps the LAST entry when a normalized name occurs more than once.
name_to_sidm = dict(zip(mapping_df['CELL_LINE_NAME_NORM'], mapping_df['SANGER_MODEL_ID']))
sidm = cell_line_norm.map(name_to_sidm)

# ⚠️ Stripping hyphens / upper-casing can collapse distinct cell line names onto one key.
# Surface every such key that actually occurs in the data instead of resolving it silently;
# the resolution itself (last entry wins) is left unchanged.
ids_per_name = mapping_df.groupby('CELL_LINE_NAME_NORM')['SANGER_MODEL_ID'].nunique()
ambiguous_names = sorted(set(ids_per_name[ids_per_name > 1].index) & set(cell_line_norm))
if ambiguous_names:
    print(
        f"⚠️ {len(ambiguous_names)} normalized cell line name(s) match more than one "
        "SANGER_MODEL_ID (last mapping entry is used):",
        ambiguous_names,
    )

# 🧹 Keep only mapped cells, reporting anything that gets dropped.
mask = sidm.notna()
n_unmapped = int((~mask).sum())
if n_unmapped:
    print(
        f"⚠️ Dropping {n_unmapped} cell(s) whose cell line has no SANGER_MODEL_ID:",
        sorted(set(cell_line[~mask])),
    )
umi_df = umi_df.loc[mask].copy()
umi_df["SANGER_MODEL_ID"] = sidm[mask].values

# The shape printed here includes the SANGER_MODEL_ID column that was just added.
print("✅ Filtered to mapped cell lines:", umi_df.shape)

# 📊 Aggregate into pseudobulk (mean expression per SANGER_MODEL_ID)
pseudobulk_df = umi_df.groupby("SANGER_MODEL_ID").mean()

print("✅ Aggregated pseudobulk (mean, scran-normalized):", pseudobulk_df.shape)

✅ Loaded scran-normalized data: (39726, 8000)
✅ Filtered to mapped cell lines: (39726, 8001)
✅ Aggregated pseudobulk (mean, scran-normalized): (140, 8000)


In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Drop every column that contains at least one missing value before scaling.
pseudobulk_df = pseudobulk_df.dropna(axis=1, how="any")

# 🎯 Standardize the columns, then project onto 30 principal components
scaler = StandardScaler()
scaler.fit(pseudobulk_df)
X_scaled = scaler.transform(pseudobulk_df)

pca = PCA(n_components=30, random_state=42)
pcs = pca.fit_transform(X_scaled)

# Label the component columns PC1 … PC30 and keep the row index of pseudobulk_df.
pc_labels = [f"PC{k}" for k in range(1, 31)]
pcs_df = pd.DataFrame(pcs, index=pseudobulk_df.index, columns=pc_labels)

print("✅ PCA applied: shape =", pcs_df.shape)

# 💾 Write the PCA-reduced pseudobulk table to parquet
parquet_path = "../../data/processed/pancancer_scran_norm_top8000_30_pcs.parquet"
pcs_df.to_parquet(parquet_path, engine="pyarrow", compression="snappy")

print(f"💾 Saved pseudobulk scran-normalized PCA(30) to:\n- {parquet_path}")

✅ PCA applied: shape = (140, 30)
💾 Saved pseudobulk scran-normalized PCA(30) to:
- ../../data/processed/pancancer_scran_norm_top8000_30_pcs.parquet
