In [1]:
import warnings
from pathlib import Path

import pandas as pd

# Load the UMI count matrix (tab-separated, first column used as the index).
# NOTE: this path still points at the *original* download, not a cleaned file.
umi_path = "../../data/original/SCP542/other/UMIcount_data.txt"  # Update to cleaned file
umi_df = pd.read_csv(umi_path, sep="\t", index_col=0)

# Transpose to (cells × genes)
umi_df = umi_df.transpose()

# Coerce every column to numeric. Anything that cannot be parsed becomes NaN
# here, so count those entries *before* zero-filling: otherwise parse problems
# (e.g. non-numeric rows in the input) would silently turn into zero counts.
umi_df = umi_df.apply(pd.to_numeric, errors="coerce")
missing_per_gene = umi_df.isna().sum()
n_missing = int(missing_per_gene.sum())
if n_missing > 0:
    affected = missing_per_gene[missing_per_gene > 0]
    warnings.warn(
        f"{n_missing} entries in {len(affected)} column(s) were missing or "
        f"non-numeric and are being set to 0 "
        f"(first affected columns: {affected.index[:5].tolist()})"
    )
umi_df = umi_df.fillna(0)

print("✅ Matrix ready with shape:", umi_df.shape)

# Save as Parquet (compressed, columnar format); make sure the target
# directory exists so the write does not fail on a fresh checkout.
parquet_path = "../../data/processed/pancancer_raw_umi.parquet"
Path(parquet_path).parent.mkdir(parents=True, exist_ok=True)
umi_df.to_parquet(parquet_path, engine="pyarrow", compression="snappy")

print(f"✅ Saved raw UMI data to '{parquet_path}'")


✅ Matrix ready with shape: (56982, 30314)
✅ Saved raw UMI data to '../../data/processed/pancancer_raw_umi.parquet'


In [4]:
import scanpy as sc

# Wrap the count table in an AnnData container: the raw values become X,
# the frame's row labels name the observations and its column labels name
# the variables.
count_matrix = umi_df.values
adata = sc.AnnData(X=count_matrix)
adata.var_names = umi_df.columns  # column labels of umi_df
adata.obs_names = umi_df.index  # row labels of umi_df

print("✅ AnnData created with shape:", adata.shape)

# Persist the object as a gzip-compressed .h5ad file.
h5ad_path = "../../data/pancancer_raw_umi.h5ad"
adata.write(h5ad_path, compression="gzip")
print("✅ Saved raw UMI data to 'pancancer_raw_umi.h5ad'")


✅ AnnData created with shape: (56982, 30314)
✅ Saved raw UMI data to 'pancancer_raw_umi.h5ad'


In [5]:
import warnings

import pandas as pd
import scanpy as sc

# Reload the raw AnnData from disk so this cell can be re-run on its own.
adata = sc.read("../../data/pancancer_raw_umi.h5ad")

# Load the model-ID / cell-line-name mapping file.
# NOTE(review): the columns are renamed by position, which assumes the CSV
# holds exactly (model id, cell line name) in that order — confirm.
mapping_df = pd.read_csv("../../data/cell_sanger_map.csv").drop_duplicates()
mapping_df.columns = ['SANGER_MODEL_ID', 'CELL_LINE_NAME']
mapping_df['CELL_LINE_NAME_NORM'] = (
    mapping_df['CELL_LINE_NAME']
    .str.replace('-', '', regex=False)
    .str.upper()
)

# Rows without a name or an ID cannot contribute a usable mapping.
mapping_df = mapping_df.dropna(subset=['SANGER_MODEL_ID', 'CELL_LINE_NAME_NORM'])

# Removing hyphens and upper-casing can collapse different cell-line names
# onto the same key. A plain dict(zip(...)) would then silently keep whichever
# row happens to come last, labelling cells with a possibly wrong model ID.
# Detect such keys, report them, and leave them unmapped instead.
sidm_per_name = mapping_df.groupby('CELL_LINE_NAME_NORM')['SANGER_MODEL_ID'].nunique()
ambiguous_names = sidm_per_name.index[sidm_per_name > 1]
if len(ambiguous_names) > 0:
    warnings.warn(
        f"{len(ambiguous_names)} normalized cell-line name(s) map to more than "
        f"one SANGER_MODEL_ID and will be left unmapped "
        f"(e.g. {list(ambiguous_names[:5])})"
    )
unambiguous_df = mapping_df[~mapping_df['CELL_LINE_NAME_NORM'].isin(ambiguous_names)]

# Extract the cell line from each barcode (text before the first underscore)
# and normalize it the same way as the mapping names.
adata.obs['cell_line'] = adata.obs_names.str.split('_').str[0]
adata.obs['cell_line_norm'] = adata.obs['cell_line'].str.replace('-', '', regex=False).str.upper()

# Map to SANGER_MODEL_ID; names with no (unambiguous) match become NaN.
name_to_sidm = dict(zip(unambiguous_df['CELL_LINE_NAME_NORM'], unambiguous_df['SANGER_MODEL_ID']))
adata.obs['SIDM'] = adata.obs['cell_line_norm'].map(name_to_sidm)

# Keep only the cells that received a model ID.
adata = adata[~adata.obs['SIDM'].isna()].copy()

print("✅ Filtered AnnData object to mapped cell lines:", adata.shape)
print("Unique cell lines (SANGER_MODEL_ID):", adata.obs['SIDM'].nunique())

# Save filtered dataset
adata.write("../../data/pancancer_raw_umi_filtered.h5ad", compression = "gzip")
print("✅ Saved filtered UMI AnnData to 'pancancer_raw_umi_filtered.h5ad'")


✅ Filtered AnnData object to mapped cell lines: (39726, 30314)
Unique cell lines (SANGER_MODEL_ID): 140
✅ Saved filtered UMI AnnData to 'pancancer_raw_umi_filtered.h5ad'
