## Step 1: Map Pan-Cancer CPM Data to GDSC Cell Lines
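We take each cell's cell line label from the `Cell_line` column of the SCP542 metadata, normalize it (drop the tissue suffix after the first `_`, remove hyphens, upper-case), and map it to a Sanger model ID (SIDM) via the GDSC mapping table. The SIDM labels are then attached to the CPM matrix by cell barcode, and cells without a match are discarded.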


In [31]:
import pandas as pd
import numpy as np

# === Load metadata ===
meta = pd.read_csv(
    "../../data/SCP542/metadata/Metadata.txt",
    sep="\t",
    index_col=0,
    skiprows=[1],  # Skip the row with data types
    dtype=str,
    low_memory=False
)
meta.index = meta.index.str.strip()
print("✅ Metadata loaded:", meta.shape)

# === Load CPM expression matrix ===
cpm_df = pd.read_csv("../../data/SCP542/expression/CPM_data.txt", sep="\t")

# Rename the first column to "GENE" just in case, then set it as index
cpm_df.rename(columns={cpm_df.columns[0]: "GENE"}, inplace=True)
cpm_df.set_index("GENE", inplace=True)

# Transpose to have cells as rows, genes as columns
cpm_t = cpm_df.T
cpm_t.index = cpm_t.index.str.strip()

print("✅ CPM matrix loaded and transposed:", cpm_t.shape)


# === Load and normalize GDSC mapping ===
mapping_df = pd.read_csv("../../data/cell_sanger_map.csv").drop_duplicates()
mapping_df.columns = ['SANGER_MODEL_ID', 'CELL_LINE_NAME']
mapping_df["CELL_LINE_NAME_NORM"] = mapping_df["CELL_LINE_NAME"].str.replace("-", "", regex=False).str.upper()
print("✅ GDSC mapping loaded:", mapping_df.shape)


✅ Metadata loaded: (53513, 20)
✅ CPM matrix loaded and transposed: (53513, 22722)
✅ GDSC mapping loaded: (978, 3)


In [32]:
# Normalize metadata cell line names
meta["cell_line_clean"] = meta["Cell_line"].str.split("_").str[0]
meta["cell_line_norm"] = meta["cell_line_clean"].str.replace("-", "", regex=False).str.upper()

# Map to SIDM using GDSC mapping
name_to_sidm = dict(zip(mapping_df["CELL_LINE_NAME_NORM"], mapping_df["SANGER_MODEL_ID"]))
meta["SIDM"] = meta["cell_line_norm"].map(name_to_sidm)

# Print mapping results
mapped = meta["SIDM"].notna().sum()
print(f"✅ Successfully mapped {mapped} cells")
print("❗ Unmapped cell lines:", meta[meta["SIDM"].isna()]["Cell_line"].unique())


✅ Successfully mapped 36871 cells
❗ Unmapped cell lines: ['NCIH2126_LUNG' 'SW579_THYROID' 'HEC251_ENDOMETRIUM' 'COLO741_SKIN'
 'WM88_SKIN' 'SNU899_UPPER_AERODIGESTIVE_TRACT' 'HEC108_ENDOMETRIUM'
 'SNU308_BILIARY_TRACT' 'TM31_CENTRAL_NERVOUS_SYSTEM'
 'KPNSI9S_AUTONOMIC_GANGLIA' 'SQ1_LUNG' 'BICR6_UPPER_AERODIGESTIVE_TRACT'
 'SH10TC_STOMACH' 'UMUC1_URINARY_TRACT' 'CCFSTTG1_CENTRAL_NERVOUS_SYSTEM'
 'TEN_ENDOMETRIUM' 'RERFLCAD1_LUNG' 'COV434_OVARY' 'SNU1079_BILIARY_TRACT'
 'YD38_UPPER_AERODIGESTIVE_TRACT' 'PANC1_PANCREAS' 'VMCUB1_URINARY_TRACT'
 'JHOC5_OVARY' 'SNU1077_ENDOMETRIUM' 'LI7_LIVER' 'ACCMESO1_PLEURA'
 'HMC18_BREAST' 'EFE184_ENDOMETRIUM' 'PECAPJ49_UPPER_AERODIGESTIVE_TRACT'
 'BICR56_UPPER_AERODIGESTIVE_TRACT' 'PK59_PANCREAS' 'HUH6_LIVER'
 'HS852T_SKIN' 'LMSU_STOMACH' 'SNUC4_LARGE_INTESTINE' 'OVSAHO_OVARY'
 'GOS3_CENTRAL_NERVOUS_SYSTEM' 'SNU738_CENTRAL_NERVOUS_SYSTEM'
 'PATU8988S_PANCREAS' 'HEC59_ENDOMETRIUM' 'HS729_SOFT_TISSUE'
 'KPL1_BREAST' 'NCIH2077_LUNG' 'KMRC3_KIDNEY' 'ZR751_B

In [33]:
# Check overlap between CPM and metadata barcodes
common_cells = set(cpm_t.index).intersection(set(meta.index))
print(f"🔗 Cells in common between CPM and metadata: {len(common_cells)}")

# Join CPM with SIDM labels
cpm_t = cpm_t.join(meta[["SIDM"]])
print(f"✅ Joined CPM with metadata: {cpm_t.shape}")

# Keep only rows with a valid SIDM
cpm_t = cpm_t[cpm_t["SIDM"].notna()]
print(f"✅ Filtered CPM with SIDM-mapped cells: {cpm_t.shape}")


🔗 Cells in common between CPM and metadata: 53513
✅ Joined CPM with metadata: (53513, 22723)
✅ Filtered CPM with SIDM-mapped cells: (36871, 22723)


## Step 2: Generate Pseudo-Bulk Expression Profiles from CPM
Average the CPM-normalized expression of all cells that share a `SIDM` (i.e. belong to the same cell line) to obtain one pseudo-bulk expression profile per cell line, approximating bulk RNA-seq.


In [34]:
# Group by SIDM and compute mean expression
pseudobulk_df = cpm_t.groupby("SIDM").mean()

# Preview and save
print("✅ Final pseudo-bulk shape:", pseudobulk_df.shape)
print(pseudobulk_df.head())

pseudobulk_df.to_csv("../../data/pseudobulk_SCP542_mean.csv")
print("✅ Saved pseudo-bulk dataset to '../../data/pseudobulk_SCP542_mean.csv'")


✅ Final pseudo-bulk shape: (133, 22722)
           RP11-34P13.7  AP006222.2  RP4-669L17.10  RP4-669L17.2  \
SIDM                                                               
SIDM00078      0.000000   16.509676       0.000000           0.0   
SIDM00080      0.280949   16.679514       0.000000           0.0   
SIDM00082      0.000000   12.877808       0.972180           0.0   
SIDM00088      0.161733   29.311156       0.000000           0.0   
SIDM00092      0.236158   15.777871       0.489434           0.0   

           RP5-857K21.2  RP5-857K21.4  RP11-206L10.9    FAM87B  LINC00115  \
SIDM                                                                        
SIDM00078           0.0           0.0       0.627544  0.000000   1.783035   
SIDM00080           0.0           0.0       0.342141  2.095253   0.861588   
SIDM00082           0.0           0.0       1.472128  0.000000   0.953661   
SIDM00088           0.0           0.0       1.017598  0.000000   2.129605   
SIDM00092           0