In [2]:
import pandas as pd
import scanpy as sc

# 📥 Load cleaned UMI matrix
# Tab-separated text; the first column becomes the row index.
umi_path = "../../data/SCP542/other/UMIcount_data.txt"  # Update to cleaned file
umi_df = pd.read_csv(umi_path, sep="\t", index_col=0)

# 🔁 Transpose to (cells × genes) for AnnData
# After this, rows of the file are columns here and vice versa.
umi_df = umi_df.transpose()

# 🔢 Ensure numeric data
# Anything that cannot be parsed as a number becomes NaN first, so that the
# amount of replaced data can be reported instead of being zeroed silently.
umi_df = umi_df.apply(pd.to_numeric, errors="coerce")
nan_per_gene = umi_df.isna().sum()
n_replaced = int(nan_per_gene.sum())
if n_replaced:
    # A whole column of NaNs usually means a non-count row (e.g. metadata)
    # is still present in the input file — worth checking before analysis.
    affected = nan_per_gene[nan_per_gene > 0]
    print(
        f"⚠️ {n_replaced} non-numeric or missing entries in "
        f"{len(affected)} column(s) will be set to 0, "
        f"e.g. {list(affected.index[:5])}"
    )
umi_df = umi_df.fillna(0)

# 🧬 Build AnnData object
# X is the dense value array; labels are attached from the frame's axes.
adata = sc.AnnData(X=umi_df.values)
adata.obs_names = umi_df.index  # Cell barcodes
adata.var_names = umi_df.columns  # Gene names

print("✅ AnnData created with shape:", adata.shape)

# 💾 Save AnnData
adata.write("../../data/pancancer_raw_umi.h5ad")
print("✅ Saved raw UMI data to 'pancancer_raw_umi.h5ad'")


✅ AnnData created with shape: (56982, 30314)
✅ Saved raw UMI data to 'pancancer_raw_umi.h5ad'
