# Step 11: Generate Pseudo-Bulk Expression Profiles

This step aggregates single-cell gene expression into pseudo-bulk profiles by averaging each gene across all cells that belong to the same cell line (SIDM). The resulting profiles can then be used for model training or downstream analysis.


## 11.1 Load Processed Data


In [2]:
import pandas as pd
import numpy as np
import scanpy as sc

# Read the processed AnnData object (post-dimred + SIDM annotated) from disk.
# The resulting `adata` is what the cells below operate on.
processed_h5ad = "../../data/processed/pancancer_dimred.h5ad"
adata = sc.read(processed_h5ad)


## 11.2 Create Pseudo-Bulk Expression Matrix


In [3]:
# Build a per-cell expression table labelled by cell line, then average each
# gene over all cells that share the same SIDM label.

# adata.X may be a sparse matrix; densify it so it can back a DataFrame.
# NOTE(review): this materialises the full cells x genes matrix in memory.
X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X

# One row per cell, indexed by that cell's SIDM label (labels repeat across
# rows); one column per gene in adata.var_names.
expr_df = pd.DataFrame(X, columns=adata.var_names, index=adata.obs['SIDM'])

# Mean expression per gene within each SIDM group.
# `observed=True` is passed explicitly: when the SIDM labels are categorical,
# relying on the default triggers a pandas FutureWarning, and categories that
# have no cells would otherwise appear as all-NaN rows. It has no effect when
# the labels are not categorical.
pseudobulk_df = expr_df.groupby(expr_df.index, observed=True).mean()

# Show result
print("✅ Pseudo-bulk matrix shape:", pseudobulk_df.shape)
pseudobulk_df.head()


  pseudobulk_df = expr_df.groupby(expr_df.index).mean()


✅ Pseudo-bulk matrix shape: (140, 30314)


Cell_line,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,A2ML1-AS2,A3GALT2,A4GALT,...,PRICKLE4,RABL6,RAET1E-AS1,RGS5,SERPINA3,SPATA13,TBC1D26,TIMM10B,TMBIM4,TMEM256-PLSCR3
SIDM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00078,0.178898,0.07112,0.0,0.0,0.004585,0.01297,0.0,0.0,0.0,0.183968,...,0.014445,0.773556,0.0,0.0,0.0,0.017994,0.0,0.511412,0.0,0.007003
SIDM00080,0.55511,0.060693,0.0,0.316258,0.0,0.0,0.0,0.0,0.0,0.025178,...,0.033993,1.152639,0.0,0.002405,0.0,0.194795,0.0,0.412414,0.0,0.005125
SIDM00082,0.552159,0.045216,0.0,0.071382,0.011844,0.0,0.0,0.0,0.0,0.25347,...,0.024424,0.541629,0.0,0.041777,0.0,0.052144,0.0,0.37841,0.0,0.002469
SIDM00088,0.093321,0.04605,0.007379,0.003401,0.007016,0.0,0.0,0.0,0.0,0.0,...,0.007488,0.759662,0.0,0.027415,0.0,0.043423,0.0,0.31217,0.0,0.005103
SIDM00092,0.463124,0.075481,0.0,0.002935,0.0,0.0,0.0,0.0,0.0,0.135651,...,0.028437,0.868323,0.0,0.0017,0.0,0.042845,0.0,0.215714,0.0,0.01111


## 11.3 Save Output


In [4]:
# Write the pseudo-bulk matrix to CSV.
# The path is held in one variable so the confirmation message always reports
# the location that was actually written (the previous message omitted the
# "processed/" directory).
pseudobulk_csv_path = "../../data/processed/pseudobulk_expression.csv"
pseudobulk_df.to_csv(pseudobulk_csv_path)
print(f"✅ Saved pseudo-bulk dataset to '{pseudobulk_csv_path}'")


✅ Saved pseudo-bulk dataset to '../../data/pseudobulk_expression.csv'


In [5]:
# Save the un-aggregated single-cell expression table (expr_df) as Parquet.
# Rows are labelled by SIDM only, so the index written here repeats per cell.
# NOTE(review): the earlier comment/message referred to "PCs" and a
# "pancancer_singlecell_pcs.parquet" file, but this cell writes gene
# expression, not principal components — confirm that expression is the
# intended output.
sc_expression_path = "../../data/processed/pancancer_sc_expression.parquet"
expr_df.to_parquet(sc_expression_path, index=True)
print(f"✅ Saved single-cell expression dataset to '{sc_expression_path}'")


✅ Saved single-cell dataset with PCs to '../../data/pancancer_singlecell_pcs.parquet'
