In [1]:
import pandas as pd
import numpy as np
import scanpy as sc

# Build a per-cell-line "pseudo-bulk" expression matrix: load the single-cell
# AnnData object, average every gene's expression over the cells that share
# an SIDM identifier, preview the result, and write it to CSV.

# Input AnnData (expected to already carry the SIDM mapping in .obs) and the
# CSV destination; the output path is named once so the write and the
# confirmation message cannot drift apart.
adata_path = "../../data/breast_cancer_dimred.h5ad"
pseudobulk_csv_path = "../../data/pseudobulk_expression_mean.csv"

adata = sc.read(adata_path)

# Convert adata.X to a dense array if it's sparse (anything exposing
# .toarray()); otherwise use it as-is.
X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X

# Cells as rows (indexed by their SIDM label), genes as columns.
expr_df = pd.DataFrame(X, columns=adata.var_names, index=adata.obs['SIDM'])

# Group by SIDM (cell line) and take the mean expression per gene.
# observed=True is passed explicitly: when the grouping key is categorical,
# relying on the pandas default emits a FutureWarning (the default is changing
# from False to True), and observed=False would also emit an all-NaN row for
# every category that has no cells. For a non-categorical key the argument
# has no effect.
# NOTE(review): the recorded cell output echoes this groupby line as a warning
# source, presumably that FutureWarning — confirm adata.obs['SIDM'] is
# categorical.
# NOTE(review): groupby drops rows whose key is missing (dropna=True default),
# so cells without an SIDM label do not contribute to any row.
pseudobulk_df = expr_df.groupby(expr_df.index, observed=True).mean()

# Preview the result
print("✅ Pseudo-bulk matrix shape:", pseudobulk_df.shape)
print(pseudobulk_df.head())

# Save to CSV file
pseudobulk_df.to_csv(pseudobulk_csv_path)

print(f"✅ Saved pseudo-bulk dataset to '{pseudobulk_csv_path}'")


  pseudobulk_df = expr_df.groupby(expr_df.index).mean()


✅ Pseudo-bulk matrix shape: (28, 47096)
           ENSG00000000003  ENSG00000000419  ENSG00000000457  ENSG00000000460  \
SIDM                                                                            
SIDM00097         0.159171         0.821657         0.037504         0.023015   
SIDM00122         0.111349         0.780787         0.014637         0.035092   
SIDM00135         0.052900         0.693550         0.015469         0.008199   
SIDM00148         0.088177         0.819970         0.042099         0.012329   
SIDM00272         0.002693         0.677463         0.094412         0.021891   

           ENSG00000001036  ENSG00000001084  ENSG00000001167  ENSG00000001460  \
SIDM                                                                            
SIDM00097         0.209755         0.081563         0.030173         0.042859   
SIDM00122         0.708178         0.052642         0.076766         0.008354   
SIDM00135         0.685742         0.152940         0.093721        