In [1]:
from pathlib import Path

import pandas as pd
import scanpy as sc

# Read the single-cell dataset from disk
adata = sc.read("../../data/breast_cancer_dimred.h5ad")

# Read the lookup table, discard exact repeat rows, then relabel its columns
# positionally (this assumes the CSV holds exactly two columns, in this order).
mapping_raw = pd.read_csv("../../data/cell_sanger_map.csv")
mapping_df = mapping_raw.drop_duplicates()
mapping_df.columns = ['SANGER_MODEL_ID', 'CELL_LINE_NAME']

# Quick sanity report of what was loaded
print("✅ Loaded mapping_df and adata")
print("Mapping sample:", mapping_df.head(), sep="\n")
print("AnnData shape:", adata.shape)


✅ Loaded mapping_df and adata
Mapping sample:
  SANGER_MODEL_ID CELL_LINE_NAME
0       SIDM00853            GCT
1       SIDM00567         ONS-76
2       SIDM00042            PL4
3       SIDM00455     PA-TU-8902
4       SIDM00881        HCC1428
AnnData shape: (34945, 47096)


In [2]:
# Observation names look like "<cell line>_<barcode>"; keep the part before the first underscore
obs_name_parts = adata.obs.index.str.split('_')
adata.obs['cell_line'] = obs_name_parts.str[0]

# Build a comparison key: uppercase with every dash stripped
cell_line_upper = adata.obs['cell_line'].str.upper()
adata.obs['cell_line_norm'] = cell_line_upper.str.replace('-', '', regex=False)

print("✅ Extracted and normalized cell line names from adata.obs.index")
print(
    "Unique normalized cell lines (first 10):",
    adata.obs['cell_line_norm'].unique()[:10],
)


✅ Extracted and normalized cell line names from adata.obs.index
Unique normalized cell lines (first 10): ['AU565' 'HCC1937' 'HCC38' 'MDAMB468' 'EFM19' 'HCC1187' 'JIMT1' 'MDAMB361'
 'HCC1500' 'HCC70']


In [3]:
# Apply the same key normalization to the lookup table: uppercase, dashes stripped
catalogue_upper = mapping_df['CELL_LINE_NAME'].str.upper()
mapping_df['CELL_LINE_NAME_NORM'] = catalogue_upper.str.replace('-', '', regex=False)

# Show original vs. normalized names side by side as a spot check
print("✅ Normalized cell line names in mapping_df")
name_pairs = mapping_df[['CELL_LINE_NAME', 'CELL_LINE_NAME_NORM']].drop_duplicates()
print(name_pairs.head())


✅ Normalized cell line names in mapping_df
  CELL_LINE_NAME CELL_LINE_NAME_NORM
0            GCT                 GCT
1         ONS-76               ONS76
2            PL4                 PL4
3     PA-TU-8902            PATU8902
4        HCC1428             HCC1428


In [4]:
# Create a mapping from normalized name to SIDM.
# Start from the distinct (normalized name, model ID) pairs; rows with a missing
# name or ID cannot contribute a usable mapping entry.
name_sidm_pairs = (
    mapping_df[['CELL_LINE_NAME_NORM', 'SANGER_MODEL_ID']]
    .dropna()
    .drop_duplicates()
)

# Stripping dashes and upper-casing can collapse distinct catalogue names onto the
# same key. A plain dict(zip(...)) would silently keep whichever row comes last, so
# cells could be labelled with the wrong model ID. Keys that point to more than one
# model ID are therefore left out of the mapping (they become NaN in 'SIDM').
is_ambiguous = name_sidm_pairs['CELL_LINE_NAME_NORM'].duplicated(keep=False)
ambiguous_names = set(name_sidm_pairs.loc[is_ambiguous, 'CELL_LINE_NAME_NORM'])
unambiguous_pairs = name_sidm_pairs.loc[~is_ambiguous]
name_to_sidm = dict(
    zip(unambiguous_pairs['CELL_LINE_NAME_NORM'], unambiguous_pairs['SANGER_MODEL_ID'])
)

# Only report collisions that actually affect cell lines present in this dataset.
ambiguous_in_adata = sorted(ambiguous_names & set(adata.obs['cell_line_norm']))
if ambiguous_in_adata:
    print("⚠️ Ambiguous normalized names left unmapped:", ambiguous_in_adata)

# Apply the mapping (names without an entry become NaN)
adata.obs['SIDM'] = adata.obs['cell_line_norm'].map(name_to_sidm)

# Preview mapped values
print("✅ Mapped cell lines to SIDM codes")
print(adata.obs[['cell_line', 'cell_line_norm', 'SIDM']].drop_duplicates().head(10))


✅ Mapped cell lines to SIDM codes
                      cell_line cell_line_norm       SIDM
AU565_AAACCAGTTTGG        AU565          AU565  SIDM00898
HCC1937_AAAACAACTGTT    HCC1937        HCC1937  SIDM00874
HCC38_AAAAGGCTGCGC        HCC38          HCC38  SIDM00675
MDAMB468_AAAAACATCCGA  MDAMB468       MDAMB468  SIDM00628
EFM19_AAAATAATTCGG        EFM19          EFM19  SIDM01056
HCC1187_AAAATGCGAAAC    HCC1187        HCC1187  SIDM00885
JIMT1_AAAAATGGCCAG        JIMT1          JIMT1  SIDM01037
MDAMB361_AAAATGTCCCAG  MDAMB361       MDAMB361  SIDM00528
HCC1500_AAAATCAGGACC    HCC1500        HCC1500  SIDM00879
HCC70_AAAAGACTATAG        HCC70          HCC70  SIDM00673


In [5]:
# Collect the distinct cell line names whose lookup produced no SIDM code
lacks_sidm = adata.obs['SIDM'].isna()
unmapped = adata.obs.loc[lacks_sidm, 'cell_line'].unique()

print("❗ Unmapped cell lines:", unmapped)
print("Total unmapped:", len(unmapped))


❗ Unmapped cell lines: ['MCF12A' 'KPL1' 'ZR751' 'MX1']
Total unmapped: 4


In [6]:
# Drop every cell without a SIDM code; .copy() turns the resulting view into a
# standalone AnnData object.
has_sidm = adata.obs['SIDM'].notna()
adata = adata[has_sidm].copy()
print("✅ Filtered adata to only include mapped cell lines")

✅ Filtered adata to only include mapped cell lines


In [7]:
# Overwrite the original file with updated AnnData object.
# NOTE: this replaces the input file, so the unmapped cells dropped from `adata`
# are no longer on disk afterwards.
# The data is first written to a sibling temporary file and only then moved over
# the original, so a failed or interrupted write cannot leave the only copy of the
# dataset truncated.
final_path = Path("../../data/breast_cancer_dimred.h5ad")
staging_path = final_path.with_name(final_path.stem + ".tmp" + final_path.suffix)
try:
    adata.write(staging_path)
    # Same directory, so this is a rename rather than a second copy of the data.
    staging_path.replace(final_path)
finally:
    # Nothing to do after a successful replace; removes the partial file on failure.
    if staging_path.exists():
        staging_path.unlink()

print("✅ Overwritten the original AnnData file with SIDM-enhanced metadata.")


✅ Overwritten the original AnnData file with SIDM-enhanced metadata.
