In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import scipy.sparse as sp

# Input / output locations. All paths are relative, so they resolve against
# the kernel's current working directory.
PROCESSED_DIR = "../../data/processed"
MAPS_DIR = "../../../../maps"

# AnnData file loaded at the start of the notebook
H5AD = f"{PROCESSED_DIR}/breast_cancer_raw_annotated.h5ad"
# Gene table read for its 'feature_id' / 'feature_name' columns
MAP_CSV = f"{MAPS_DIR}/gene_info.csv"
# Two-column table linking Sanger model IDs and cell-line names
CELL_MAP = f"{MAPS_DIR}/cell_sanger_map.csv"
# Destination of the final per-cell expression table
OUT_PARQUET = f"{PROCESSED_DIR}/breast_sc_log1pCP10k__symbols_sidm_mapped.parquet"

In [2]:
# Read the AnnData file at H5AD and echo its summary repr.
adata = sc.read_h5ad(H5AD)
print(f"Loaded AnnData: {adata}")


Loaded AnnData: AnnData object with n_obs × n_vars = 35276 × 47096
    var: 'gene_symbol'


In [3]:
# Scale every cell to a total of 10,000 counts, then apply log1p.
# Both scanpy calls modify `adata` in place, so running this cell a second
# time would normalize and log-transform values that are already log-scaled.
# scanpy's log1p leaves a 'log1p' entry in adata.uns; use it as the marker
# that makes this cell safe to re-run.
if "log1p" in adata.uns:
    print("ℹ️ adata.uns['log1p'] already present; skipping to avoid a double log-transform")
else:
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    print("✅ Normalized and log-transformed data")

✅ Normalized and log-transformed data


In [4]:
# Materialise the expression matrix as a dense array: a scipy sparse matrix
# is converted with .toarray(); anything else is used as-is.
if sp.issparse(adata.X):
    X = adata.X.toarray()
else:
    X = adata.X
print("Dense matrix shape:", X.shape)

Dense matrix shape: (35276, 47096)


In [5]:
# Gene IDs from the AnnData with any ".<version>" suffix removed
ens_novers = pd.Index(adata.var_names.astype(str)).str.split(".").str[0]

# Gene table: 'feature_id' holds the gene ID, 'feature_name' the symbol
gmap = pd.read_csv(MAP_CSV)
gmap["ensembl_novers"] = gmap["feature_id"].astype(str).str.split(".").str[0]
gmap["symbol"] = gmap["feature_name"].astype(str)

# Build the lookup only from rows that actually carry a symbol. astype(str)
# turns a missing feature_name into the literal string "nan", which would
# otherwise be used as a gene name instead of the gene being treated as
# unmapped (and dropped below).
has_symbol = gmap["feature_name"].notna()
ens2sym = dict(zip(gmap.loc[has_symbol, "ensembl_novers"], gmap.loc[has_symbol, "symbol"]))

# Genes absent from the lookup keep their (unversioned) ID as column name
mapped_symbols = pd.Series(ens_novers).map(ens2sym)
colnames = mapped_symbols.fillna(pd.Series(ens_novers)).values

df = pd.DataFrame(X, index=adata.obs_names.astype(str), columns=colnames)
print("Initial DataFrame shape:", df.shape)

# Drop unmapped genes: any column whose name still looks like "ENSG<digits>"
is_ensembl_like = pd.Index(df.columns).str.match(r"ENSG\d+$")
print("Unmapped genes to drop:", int(is_ensembl_like.sum()))
df = df.loc[:, ~is_ensembl_like]

# Collapse duplicates by mean (several IDs can map to the same symbol)
if not df.columns.is_unique:
    df = df.T.groupby(level=0).mean().T
    print("Collapsed duplicate symbols by mean.")
print("After symbol mapping:", df.shape)

Initial DataFrame shape: (35276, 47096)
Unmapped genes to drop: 74
After symbol mapping: (35276, 47022)


In [6]:
# Lookup table linking Sanger model IDs and cell-line names
# (exact duplicate rows removed, columns renamed positionally)
mapping_df = pd.read_csv(CELL_MAP).drop_duplicates()
mapping_df.columns = ['SANGER_MODEL_ID', 'CELL_LINE_NAME']


def _normalize_cell_line(names):
    """Return the given string Series with '-' removed and upper-cased."""
    return names.str.replace('-', '', regex=False).str.upper()


# The cell-line label is the part of each obs index entry before the first '_'
adata.obs['cell_line'] = adata.obs.index.str.split('_').str[0]

# Apply the same name normalization on both sides of the join
adata.obs['cell_line_norm'] = _normalize_cell_line(adata.obs['cell_line'])
mapping_df['CELL_LINE_NAME_NORM'] = _normalize_cell_line(mapping_df['CELL_LINE_NAME'])

# Normalized name -> Sanger model ID, then annotate every cell
name_to_sidm = dict(zip(mapping_df['CELL_LINE_NAME_NORM'], mapping_df['SANGER_MODEL_ID']))
adata.obs['SANGER_MODEL_ID'] = adata.obs['cell_line_norm'].map(name_to_sidm)

# Report the cell lines without a model ID, then keep only mapped cells
has_sidm = adata.obs['SANGER_MODEL_ID'].notna()
unmapped = adata.obs.loc[~has_sidm, 'cell_line'].unique()
print("❗ Unmapped cell lines:", unmapped)
print("Total unmapped:", len(unmapped))
adata = adata[has_sidm].copy()
print("✅ Filtered to mapped cell lines. Final n_cells:", adata.n_obs)

❗ Unmapped cell lines: ['MCF12A' 'KPL1' 'ZR751' 'MX1']
Total unmapped: 4
✅ Filtered to mapped cell lines. Final n_cells: 30827


In [12]:
# %%
# Diagnostic cell: summarise the filtered AnnData and verify the
# SANGER_MODEL_ID annotation. Reads `adata` only; nothing is modified.
print("🧠 AnnData summary:")
print(adata)
print("\nobs keys:", list(adata.obs.keys()))
print("\nvar shape:", adata.var.shape)
print("obs shape:", adata.obs.shape)

print("\nFirst few obs rows:")
display(adata.obs.head())

print("\nIndex sample (first 5):")
print(adata.obs.index[:5].tolist())

# Check if SANGER_MODEL_ID exists and how many are missing
if "SANGER_MODEL_ID" in adata.obs.columns:
    sidm = adata.obs["SANGER_MODEL_ID"]
    n_missing = sidm.isna().sum()
    n_unique = sidm.nunique()
    print("\n✅ 'SANGER_MODEL_ID' column found.")
    print(f"Unique SANGER_MODEL_IDs: {n_unique}")
    print(f"Missing values: {n_missing}")
    print("\nTop 10 value counts:")
    print(sidm.value_counts().head(10))
else:
    print("\n❌ 'SANGER_MODEL_ID' column not found in adata.obs")


🧠 AnnData summary:
AnnData object with n_obs × n_vars = 30827 × 47096
    obs: 'cell_line', 'cell_line_norm', 'SANGER_MODEL_ID'
    var: 'gene_symbol'
    uns: 'log1p'

obs keys: ['cell_line', 'cell_line_norm', 'SANGER_MODEL_ID']

var shape: (47096, 1)
obs shape: (30827, 3)

First few obs rows:


Unnamed: 0,cell_line,cell_line_norm,SANGER_MODEL_ID
AU565_AAACCAGTTTGG,AU565,AU565,SIDM00898
AU565_AAACGTGCAGCG,AU565,AU565,SIDM00898
AU565_AAAGCCACATGC,AU565,AU565,SIDM00898
AU565_AAAGTCGGCTGG,AU565,AU565,SIDM00898
AU565_AAAGTGCCTAAA,AU565,AU565,SIDM00898



Index sample (first 5):
['AU565_AAACCAGTTTGG', 'AU565_AAACGTGCAGCG', 'AU565_AAAGCCACATGC', 'AU565_AAAGTCGGCTGG', 'AU565_AAAGTGCCTAAA']

✅ 'SANGER_MODEL_ID' column found.
Unique SANGER_MODEL_IDs: 28
Missing values: 0

Top 10 value counts:
SANGER_MODEL_ID
SIDM00629    2821
SIDM00673    2818
SIDM00122    2296
SIDM00963    2158
SIDM00872    1623
SIDM00628    1573
SIDM01056    1316
SIDM00866    1280
SIDM00933    1260
SIDM00885    1147
Name: count, dtype: int64


In [13]:
# %%
# Align df and adata based on shared barcodes
common_barcodes = df.index.intersection(adata.obs.index)
print(f"✅ Shared barcodes between df and adata: {len(common_barcodes)}")

# This cell overwrites `df` and re-indexes it by SANGER_MODEL_ID, so a second
# run would find no barcodes in df.index, empty the table and overwrite the
# saved parquet with it. Stop before anything is mutated or written.
if len(common_barcodes) == 0:
    raise RuntimeError(
        "No shared barcodes between df and adata.obs. If this cell was already "
        "run, df is now indexed by SANGER_MODEL_ID; re-run the notebook from "
        "the top instead of re-running this cell."
    )

# Subset both to the intersection (same order on both sides)
df = df.loc[common_barcodes].copy()
meta = adata.obs.loc[common_barcodes, ["SANGER_MODEL_ID"]]

# Attach SANGER_MODEL_ID as the first column; .values is positional, which is
# valid because df and meta were both selected with common_barcodes
df.insert(0, "SANGER_MODEL_ID", meta["SANGER_MODEL_ID"].values)

# Verification
print("✅ All cells have valid SANGER_MODEL_IDs:", df["SANGER_MODEL_ID"].isna().sum() == 0)
summary = df["SANGER_MODEL_ID"].value_counts().sort_index()
print("\n📊 Cells per SANGER_MODEL_ID (top 10):")
print(summary.head(10))
print(f"\nTotal mapped cell lines: {summary.shape[0]}")
print(f"Total mapped cells: {df.shape[0]}")

# Set index to SANGER_MODEL_ID for downstream analyses
# (this replaces the barcode index, so the index is no longer unique per cell)
df = df.set_index("SANGER_MODEL_ID")
print("\nIndex set to SANGER_MODEL_ID.")
print("Final shape:", df.shape)

# Save
df.to_parquet(OUT_PARQUET)
print(f"\n💾 Saved mapped single-cell matrix to: {OUT_PARQUET}")


✅ Shared barcodes between df and adata: 30827
✅ All cells have valid SANGER_MODEL_IDs: True

📊 Cells per SANGER_MODEL_ID (top 10):
SANGER_MODEL_ID
SIDM00097     825
SIDM00122    2296
SIDM00135     879
SIDM00148     839
SIDM00272    1119
SIDM00528     753
SIDM00628    1573
SIDM00629    2821
SIDM00630     877
SIDM00673    2818
Name: count, dtype: int64

Total mapped cell lines: 28
Total mapped cells: 30827

Index set to SANGER_MODEL_ID.
Final shape: (30827, 47024)

💾 Saved mapped single-cell matrix to: ../../data/processed/breast_sc_log1pCP10k__symbols_sidm_mapped.parquet
