In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
print("Loading pseudo-bulk expression data and gene embeddings...")

# Expression table. The printed head shows SANGER_MODEL_ID, DRUG_ID and LN_IC50
# ahead of the per-gene (SIDG...) columns, so rows appear to be
# (cell line, drug) records rather than unique cell lines -- TODO confirm.
# If so, the SANGER_MODEL_ID index set below is not unique.
expr_df = pd.read_parquet("data/gdsc_single_cell_aligned.parquet")
print(f"Expression matrix shape: {expr_df.shape}")
print(expr_df.head())
expr_df = expr_df.set_index("SANGER_MODEL_ID")

# Embeddings: genes x 512 (rows are looked up by gene ID via .loc below).
gene_embeds = pd.read_parquet("data/gene_embeddings.parquet")
print(f"Gene embedding matrix shape: {gene_embeds.shape}")
print(gene_embeds.head())

# Keep only common genes.
# Walk the expression columns in their stored order instead of materialising a
# set intersection: set iteration order for strings changes between kernel
# sessions (hash randomisation), which would make the column order -- and the
# floating-point summation order of the later projection -- non-reproducible.
# Columns absent from the embedding index (presumably the DRUG_ID / LN_IC50
# metadata columns) are dropped by this filter.
embedded_genes = set(gene_embeds.index)
common_genes = [gene for gene in expr_df.columns.unique() if gene in embedded_genes]
if not common_genes:
    # An empty overlap would otherwise flow through silently and yield an
    # all-zero projection downstream.
    raise ValueError(
        "No genes are shared between the expression columns and the "
        "gene embedding index; check that both use the same gene identifiers."
    )
expr_df = expr_df[common_genes]
gene_embeds = gene_embeds.loc[common_genes]

print(f"Expression matrix shape: {expr_df.shape}")
print(f"Gene embedding matrix shape: {gene_embeds.shape}")


Loading pseudo-bulk expression data and gene embeddings...
Expression matrix shape: (575197, 2003)
  SANGER_MODEL_ID  DRUG_ID   LN_IC50  SIDG22565  SIDG00978  SIDG03466  \
0       SIDM00374     1009  4.134480   0.000000   0.095310   1.232560   
1       SIDM00255      268 -2.236015   3.141995   0.019803   1.905088   
2       SIDM01182     1012  1.321538   1.249902   3.724005   0.239017   
3       SIDM01160     1023  3.875126   2.380472   0.000000   0.104360   
4       SIDM00547     1197  4.457386   1.075002   0.737164   0.148420   

   SIDG40295  SIDG03584  SIDG09724  SIDG16884  ...  SIDG09225  SIDG33597  \
0   0.000000   1.860975   0.095310   0.000000  ...   0.173953   1.470176   
1   0.000000   3.955657   2.503892   1.860975  ...   2.885917   0.009950   
2   0.029559   1.442202   0.000000   0.009950  ...   3.348500   0.165514   
3   2.385086   2.057963   0.457425   0.371564  ...   2.254445   5.023091   
4   0.000000   1.515127   0.048790   0.207014  ...   0.307485   3.288402   

   SI

In [4]:
print("Projecting expression values into gene embedding space...")

# The matrix product below is purely positional: column j of the expression
# matrix is multiplied with row j of the embedding matrix. Verify that both
# gene axes list the same identifiers in the same order, so that stale or
# out-of-order kernel state fails loudly instead of producing wrong embeddings.
expr_gene_order = list(expr_df.columns)
embed_gene_order = list(gene_embeds.index)
if expr_gene_order != embed_gene_order:
    raise ValueError(
        "Gene axes are not aligned: expr_df.columns and gene_embeds.index "
        "must contain the same genes in the same order before projecting."
    )

expr_matrix = expr_df.values                # shape: [rows of expr_df x genes]
embed_matrix = gene_embeds.values          # shape: [genes x embedding dim]

# Each output row is the expression-weighted sum of the gene embedding vectors.
cell_embeddings = expr_matrix @ embed_matrix  # shape: [rows of expr_df x embedding dim]

# Create output DataFrame; rows keep the index of expr_df, columns are the
# default integer positions 0..embedding_dim-1.
projected_df = pd.DataFrame(cell_embeddings, index=expr_df.index)
projected_df.index.name = "SANGER_MODEL_ID"

Projecting expression values into gene embedding space...


In [5]:
# Make sure the output directory exists, then persist the projected table.
output_dir = "data/"
os.makedirs(output_dir, exist_ok=True)

projected_df.to_parquet(
    "data/projected_cell_embeddings.parquet",
)
print("✅ Projected cell embeddings saved to data/projected_cell_embeddings.parquet")

✅ Projected cell embeddings saved to data/projected_cell_embeddings.parquet


In [None]:
# Quick sanity check: preview the first rows, report the table shape, and mark
# the end of the run. One item per output line.
preview = projected_df.head()
table_shape = projected_df.shape
print(preview, table_shape, "Done!", sep="\n")

                        0           1           2           3           4    \
SANGER_MODEL_ID                                                               
SIDM00374        -24.609437  177.888497 -349.928097  170.851536  -88.744320   
SIDM00255        293.870301  217.897788 -130.545389 -743.235378  -74.628916   
SIDM01182        129.950107   17.931049 -137.786495 -234.101681  189.414975   
SIDM01160        123.714384  234.402464   92.232606   99.429767  -62.015606   
SIDM00547        -69.518765   70.101059  -53.708288   62.750489  -71.456198   

                        5           6           7           8           9    \
SANGER_MODEL_ID                                                               
SIDM00374        193.092302 -344.860870   -4.115366  617.891574   58.882229   
SIDM00255        442.328807  122.458742  323.813067  191.203231  -63.319645   
SIDM01182        206.345454  251.811201  160.569900  719.796956  -10.040541   
SIDM01160        429.530658 -188.272192  216.461961

: 