# 🧬 Generate Pseudo-Bulk Embeddings from Single-Cell Embeddings

This notebook aggregates per-cell embeddings into per-cell-line (pseudo-bulk) embeddings by averaging all embeddings for each `SANGER_MODEL_ID`.


In [1]:
import numpy as np
import pandas as pd


In [2]:
# Input files: the per-cell embedding matrix and the metadata table that goes with it
embedding_path = "embeddings/pancancer_embeddings_scfoundation.npy"
metadata_path = "data/pancancer_filtered_for_scfoundation.csv"

# Read the metadata table (row 0 of the CSV is the header), then the embedding matrix
metadata = pd.read_csv(metadata_path, header=0)
embeddings = np.load(embedding_path)

# Both inputs must have the same number of rows, otherwise they cannot be paired up row by row
n_metadata_rows, n_embedding_rows = len(metadata), embeddings.shape[0]
assert n_metadata_rows == n_embedding_rows, "Mismatch between metadata and embeddings!"

print(f"✅ Metadata shape: {metadata.shape}")
print(f"✅ Embeddings shape: {embeddings.shape}")


✅ Metadata shape: (39715, 19265)
✅ Embeddings shape: (39715, 3072)


In [17]:
# The first metadata column is treated as the per-cell identifier; force it to text
cell_ids = metadata.iloc[:, 0].astype(str)

# Base cell line name = leading run of characters before the first '_', '.' or '-'.
# Identifiers that are empty or start with one of those delimiters yield NaN.
base_name_pattern = r"^([^_\.\-]+)"
cell_lines = cell_ids.str.extract(base_name_pattern)[0]

# Normalize: uppercase and strip dashes (same rule as applied to the mapping table names).
# NOTE(review): the pattern above already stops at the first '-', so the dash removal
# looks like a no-op here — confirm identifiers never carry a dash inside the cell line name.
cell_lines_norm = cell_lines.str.upper().str.replace('-', '', regex=False)

print("📌 Sample extracted cell lines:", cell_lines.unique()[:5])
print("📌 Sample normalized cell lines:", cell_lines_norm.unique()[:5])


📌 Sample extracted cell lines: ['C32' 'NCIH446' 'MFE319' 'SKNAS' 'NCIH2452']
📌 Sample normalized cell lines: ['C32' 'NCIH446' 'MFE319' 'SKNAS' 'NCIH2452']


In [18]:
# Wrap the embedding matrix in a DataFrame: one row per input row, integer-labelled columns
embedding_df = pd.DataFrame(embeddings)

# Append the normalized cell line name as the last column (values are aligned on the row index)
embedding_df.insert(embedding_df.shape[1], "CELL_LINE_NAME_NORM", cell_lines_norm)

print("✅ Added CELL_LINE_NAME_NORM to embedding_df")
print("📌 Sample rows:\n", embedding_df.head())


✅ Added CELL_LINE_NAME_NORM to embedding_df
📌 Sample rows:
           0         1         2         3         4         5         6  \
0 -3.488941  1.394992  0.397312  0.215693 -0.962930 -0.749498 -0.785672   
1 -1.825061  0.575112 -0.896027 -0.297870 -0.880810  0.054988  0.284411   
2 -2.441955  0.972837 -0.828298 -0.115014  0.506222 -0.652835 -0.173292   
3 -2.680915  0.816473 -0.711007 -0.689018  0.233623 -0.389738  0.635181   
4 -2.624959  1.031120 -0.383724  0.208368 -0.058529 -0.733210 -0.343108   

          7         8         9  ...      3063      3064      3065      3066  \
0  0.801274  0.457623  0.195104  ...  0.516343 -0.878057  0.264915  0.206443   
1  0.127365  0.111809  1.150152  ... -0.057807 -1.709587  0.067718 -0.725656   
2 -0.166358  0.409530  0.693187  ...  0.915426 -1.490793 -0.031970  1.141678   
3  0.362828 -0.482695  0.726599  ...  0.606620 -2.069781  0.637838  0.752461   
4  0.048420  0.366079  0.505155  ...  1.252378 -1.838147  0.541138  0.013296   

       3

In [19]:
mapping_path = "cell_sanger_map.csv"

# Read the lookup table and discard exact duplicate rows, then label its two
# columns by position (whatever header names the file carries are overwritten)
mapping_df = pd.read_csv(mapping_path).drop_duplicates()
mapping_df.columns = ['SANGER_MODEL_ID', 'CELL_LINE_NAME']

# Normalized name: uppercase with dashes removed
mapping_df = mapping_df.assign(
    CELL_LINE_NAME_NORM=lambda table: (
        table['CELL_LINE_NAME'].str.upper().str.replace('-', '', regex=False)
    )
)

print("✅ Loaded mapping_df")
print("📌 Mapping sample:\n", mapping_df.head())
print("📌 Sample normalized mapping names:\n", mapping_df['CELL_LINE_NAME_NORM'].unique()[:5])


✅ Loaded mapping_df
📌 Mapping sample:
   SANGER_MODEL_ID CELL_LINE_NAME CELL_LINE_NAME_NORM
0       SIDM00853            GCT                 GCT
1       SIDM00567         ONS-76               ONS76
2       SIDM00042            PL4                 PL4
3       SIDM00455     PA-TU-8902            PATU8902
4       SIDM00881        HCC1428             HCC1428
📌 Sample normalized mapping names:
 ['GCT' 'ONS76' 'PL4' 'PATU8902' 'HCC1428']


In [20]:
# Attach a SANGER_MODEL_ID to every cell via the normalized cell line name.
# Only the two lookup columns are needed. They are de-duplicated first because
# mapping_df is de-duplicated on its raw names only: spelling variants that
# normalize to the same (ID, name) pair would otherwise repeat every matching cell.
sanger_lookup = mapping_df[['SANGER_MODEL_ID', 'CELL_LINE_NAME_NORM']].drop_duplicates()
merged_df = embedding_df.merge(sanger_lookup, on="CELL_LINE_NAME_NORM", how="left")

# A left merge must return exactly one row per cell. More rows means some
# normalized name matched several lookup rows, which would duplicate cells and
# assign them to more than one group in the per-ID average computed later.
assert len(merged_df) == len(embedding_df), (
    "Merge changed the number of rows: at least one normalized cell line name "
    "matches multiple rows of mapping_df."
)

# Show a sample of the merge (one row per distinct name/ID pair)
print("📌 Sample merged rows:\n", merged_df[['CELL_LINE_NAME_NORM', 'SANGER_MODEL_ID']].drop_duplicates().head(10))

# Count mapped and unmapped cells; a cell is unmapped when the merge left its ID missing
is_unmapped = merged_df['SANGER_MODEL_ID'].isna()
num_total = len(merged_df)
num_unmapped = is_unmapped.sum()
num_mapped = num_total - num_unmapped

print(f"✅ Total rows: {num_total}")
print(f"✅ Mapped rows: {num_mapped}")
print(f"❗ Unmapped rows: {num_unmapped}")

if num_unmapped > 0:
    print("❗ Example unmapped names:", merged_df.loc[is_unmapped, 'CELL_LINE_NAME_NORM'].unique()[:5])


📌 Sample merged rows:
    CELL_LINE_NAME_NORM SANGER_MODEL_ID
0                  C32       SIDM00890
1              NCIH446       SIDM00965
2               MFE319       SIDM00333
3                SKNAS       SIDM01101
4             NCIH2452       SIDM00722
5                 JHH7       SIDM00614
6                KNS42       SIDM00607
8                 MCF7       SIDM00148
9               HT1197       SIDM00676
10               HCC38       SIDM00675
✅ Total rows: 39715
✅ Mapped rows: 39715
❗ Unmapped rows: 0


In [21]:
# Keep only the cells that received a SANGER_MODEL_ID
has_sanger_id = merged_df['SANGER_MODEL_ID'].notna()
mapped_df = merged_df[has_sanger_id].copy()

# Average per SANGER_MODEL_ID; the text name column is removed first so that
# only the embedding columns are aggregated
embedding_columns = mapped_df.drop(columns=["CELL_LINE_NAME_NORM"])
pseudo_bulk_embeddings = embedding_columns.groupby("SANGER_MODEL_ID").mean()

print("✅ Grouped by SANGER_MODEL_ID")
print("📌 Pseudo-bulk shape:", pseudo_bulk_embeddings.shape)


✅ Grouped by SANGER_MODEL_ID
📌 Pseudo-bulk shape: (140, 3072)


In [23]:
# Destination CSV; the frame's index is written out as the first column
output_path = "data/pseudobulk_sc_embeddings_aligned.csv"
pseudo_bulk_embeddings.to_csv(output_path, index=True)

# Report where the table went, its dimensions, and a preview of the first rows
print(f"✅ Pseudo-bulk embeddings saved to: {output_path}")
print(f"Final shape: {pseudo_bulk_embeddings.shape}")
print(pseudo_bulk_embeddings.head())


✅ Pseudo-bulk embeddings saved to: data/pseudobulk_sc_embeddings_aligned.csv
Final shape: (140, 3072)
                     0         1         2         3         4         5     \
SANGER_MODEL_ID                                                               
SIDM00078       -2.195639  0.357319 -0.862047 -0.790186  0.284951  0.375345   
SIDM00080       -2.959613  1.425849  0.573694  0.053191 -0.848321 -0.502302   
SIDM00082       -3.326083  1.041083  0.139121 -0.128704 -0.890527 -0.332816   
SIDM00088       -2.943385  1.145860 -0.228418  0.207987 -0.040858 -0.138065   
SIDM00092       -2.059386  1.363968 -0.845383 -0.103630  0.807727 -0.527988   

                     6         7         8         9     ...      3062  \
SANGER_MODEL_ID                                          ...             
SIDM00078       -0.467491  0.113661  0.304275  0.899682  ... -0.499465   
SIDM00080       -0.681997  1.324688  0.123628  0.169735  ... -0.374767   
SIDM00082       -0.323405  0.875065  0.159454  0