In [1]:
import polars as pl
import numpy as np
import os

# Ensure output directory exists
os.makedirs("pseudo_bulk", exist_ok=True)


In [2]:
# Load and optimize GDSC bulk drug response data
gdsc_bulk = pl.read_parquet("gdsc/gdsc_final_cleaned.parquet").select([
    pl.col("SANGER_MODEL_ID").cast(pl.Utf8),
    pl.col("DRUG_ID").cast(pl.Int32),
    pl.col("LN_IC50").cast(pl.Float32)
])

print("GDSC bulk data loaded successfully.")
print(f"Shape: {gdsc_bulk.shape}")
gdsc_bulk.head()


GDSC bulk data loaded successfully.
Shape: (575197, 3)


SANGER_MODEL_ID,DRUG_ID,LN_IC50
str,i32,f32
"""SIDM00374""",1009,4.13448
"""SIDM00255""",268,-2.236015
"""SIDM01182""",1012,1.321538
"""SIDM01160""",1023,3.875126
"""SIDM00547""",1197,4.457386


In [3]:
# Load single-cell gene expression data
cell_gene_matrix = pl.read_parquet("sc_data/rnaseq_fpkm.parquet")

# Transpose so that rows = cell lines, columns = genes
# Polars doesn't support transpose directly, so we use pandas just for this step
import pandas as pd

cell_gene_df = pd.read_parquet("sc_data/rnaseq_fpkm.parquet")
transposed_df = cell_gene_df.set_index(cell_gene_df.columns[0]).transpose()

# Reset index to turn cell line names into a column
transposed_df.index.name = "SANGER_MODEL_ID"
transposed_df.reset_index(inplace=True)

# Convert back to Polars
cell_gene_matrix = pl.from_pandas(transposed_df)

print("Transposed single-cell data to shape: rows = cell lines, cols = genes")
print(f"Shape: {cell_gene_matrix.shape}")
cell_gene_matrix.head()


Transposed single-cell data to shape: rows = cell lines, cols = genes
Shape: (1431, 37607)


SANGER_MODEL_ID,model_name,dataset_name,data_source,gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00001""",,,,,0.16,0.73,0.01,0.1,0.0,0.0,0.0,0.92,0.0,0.0,0.16,0.0,7.28,4.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.49,20.05,1.53,2.48,25.7,0.89,11.79,0.75,203.63,10.6,…,0.0,5.53,3.35,7.32,4.41,0.75,1.41,4.8,0.0,2.16,3.21,0.1,0.0,0.0,0.0,2.24,1.05,7.2,0.21,5.03,4.34,10.32,11.04,72.98,0.58,2.32,2.7,0.86,0.0,2.73,8.44,0.0,3.53,4.27,2.5,1.81,0.62
"""SIDM00002""",,,,,0.22,1.69,0.0,0.21,0.2,0.16,0.01,0.03,0.0,0.29,5.4,0.0,25.13,2.51,0.11,0.02,0.0,0.03,0.0,6.18,0.11,2.05,0.67,9.66,1.22,4.59,31.14,0.55,22.66,0.12,53.64,8.52,…,1.55,4.29,7.68,5.48,2.58,0.75,5.24,5.82,0.03,2.5,7.11,2.55,0.0,0.0,0.0,21.82,3.23,10.65,0.15,4.06,8.3,24.31,9.22,30.5,1.34,3.38,15.36,0.58,0.0,8.85,15.77,0.0,6.67,12.7,3.24,1.49,0.16
"""SIDM00003""",,,,,0.14,3.33,0.0,8.41,0.24,0.0,0.0,0.0,0.01,0.09,0.0,0.0,18.38,3.74,0.51,0.0,0.0,0.01,0.0,0.0,0.0,4.51,8.32,21.28,2.81,2.51,35.65,0.06,16.7,0.12,20.04,10.36,…,0.01,3.84,4.06,3.19,1.19,0.34,2.8,4.76,0.0,1.73,1.46,0.03,0.0,0.0,0.0,3.64,5.83,6.88,0.11,2.45,2.53,21.46,23.29,79.94,0.67,2.01,4.92,3.83,0.09,8.62,51.84,0.0,7.28,12.9,2.19,1.7,0.56
"""SIDM00005""",,,,,0.87,2.04,0.0,0.23,1.48,0.05,0.15,2.42,0.02,0.14,0.75,0.15,22.81,8.78,0.08,0.0,0.0,0.0,0.0,0.0,0.0,8.7,2.01,31.56,1.1,101.2,34.33,0.18,17.1,0.56,19.34,6.19,…,0.0,3.34,12.35,7.07,6.6,2.79,1.07,4.81,0.0,1.75,2.32,1.39,0.01,0.05,0.01,5.8,5.12,25.7,0.26,1.74,7.89,4.28,10.93,72.11,1.63,2.62,7.53,1.01,0.0,3.96,6.87,0.0,3.37,7.11,7.52,5.02,0.33
"""SIDM00006""",,,,,0.15,2.03,0.01,23.22,0.28,0.0,0.01,0.0,0.01,0.16,3.86,0.0,7.87,1.81,0.12,0.0,0.0,0.02,0.0,0.07,0.02,3.49,5.16,16.44,4.01,3.11,32.19,0.08,18.61,1.53,67.8,7.13,…,0.71,4.09,3.72,4.28,2.41,3.21,4.52,5.17,0.12,1.75,3.39,2.79,0.0,0.0,0.0,8.07,3.21,7.11,0.1,2.53,5.38,15.49,11.14,29.36,0.85,2.23,4.29,4.49,0.0,7.51,141.7,0.0,3.48,8.75,2.94,1.29,0.11


In [4]:
# All gene columns (exclude SANGER_MODEL_ID)
gene_cols = [col for col in cell_gene_matrix.columns if col != "SANGER_MODEL_ID"]

# Normalize before aggregation (log1p on gene columns)
normalized_before = cell_gene_matrix.with_columns([
    pl.col(col).cast(pl.Float64).log1p().alias(col) for col in gene_cols
])

# Group by cell line (if needed), then normalize after aggregation
# In this setup, there's no need to aggregate — already pseudo-bulked
# But if duplicates exist, you could do:
pseudo_bulk = normalized_before.group_by("SANGER_MODEL_ID").agg([
    pl.col(col).mean().alias(col) for col in gene_cols
])

normalized_after = pseudo_bulk  # already normalized above

print("Normalization complete.")
print(f"Normalized Before Shape: {normalized_before.shape}")


Normalization complete.
Normalized Before Shape: (1431, 37607)


In [5]:
# Compute variance on gene columns
def top_variable_genes(df, cols, top_n=2000):
    data = df.select(cols).to_pandas().to_numpy()
    var = np.var(data, axis=0)
    top_indices = np.argsort(var)[-top_n:]
    selected = [cols[i] for i in top_indices]
    return ["SANGER_MODEL_ID"] + selected

top_genes = top_variable_genes(normalized_after, gene_cols)

filtered_after = normalized_after.select(top_genes)

print("Filtered to top 2000 highly variable genes.")
print(f"Filtered Shape: {filtered_after.shape}")


Filtered to top 2000 highly variable genes.
Filtered Shape: (1431, 2001)


In [6]:
# Merge with GDSC
merged = gdsc_bulk.join(filtered_after, on="SANGER_MODEL_ID", how="left")

# Save
merged.write_parquet("pseudo_bulk/gdsc_single_cell_aligned.parquet")

print("Merged GDSC with pseudo-bulk expression.")
print(f"Final shape: {merged.shape}")
merged.head()


Merged GDSC with pseudo-bulk expression.
Final shape: (575197, 2003)


SANGER_MODEL_ID,DRUG_ID,LN_IC50,SIDG25786,SIDG14622,SIDG00004,SIDG42247,SIDG37202,SIDG36305,SIDG04653,SIDG37321,SIDG02118,SIDG17735,SIDG00811,SIDG24309,SIDG09871,SIDG08545,SIDG23953,SIDG38190,SIDG01729,SIDG13511,SIDG04729,SIDG08553,SIDG04194,SIDG06584,SIDG42472,SIDG39049,SIDG37306,SIDG17963,SIDG21777,SIDG40391,SIDG01263,SIDG11425,SIDG17569,SIDG10150,SIDG10408,SIDG09705,…,SIDG16508,SIDG16496,SIDG16659,SIDG38483,SIDG16474,SIDG16470,SIDG16375,SIDG16368,SIDG16366,SIDG16359,SIDG16205,SIDG16201,SIDG16125,SIDG40918,SIDG28470,SIDG17240,SIDG28096,SIDG17129,SIDG28264,SIDG28269,SIDG16994,SIDG16958,SIDG05323,SIDG28468,SIDG28469,SIDG03587,SIDG14433,SIDG28471,SIDG28472,SIDG28473,SIDG28475,SIDG06942,SIDG05279,SIDG03212,SIDG40984,SIDG35827,SIDG28514
str,i32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00374""",1009,4.13448,1.541159,2.737609,0.307485,2.309561,0.357674,0.0,2.676903,0.00995,0.067659,1.925707,0.0,0.0,0.0,4.206333,0.019803,0.076961,5.793044,0.00995,1.631199,0.357674,2.421257,0.04879,3.14329,0.067659,1.545433,0.0,1.879465,0.067659,0.067659,3.617383,1.07841,2.190536,0.04879,0.0,…,0.0,,,,,,,0.0,0.0,,0.0,,0.00995,,,,0.0,0.0,0.0,0.0,,,,0.0,,0.783902,,,,,0.0,,0.239017,,,,0.0
"""SIDM00255""",268,-2.236015,3.214868,2.676903,0.0,3.4797,0.559616,0.019803,2.280339,0.113329,0.231112,0.019803,2.726545,4.130033,2.313525,3.609295,3.251149,0.0,1.717395,2.660959,2.840831,1.519513,2.985177,2.393339,2.68239,2.451867,0.0,0.0,2.151762,0.173953,0.00995,0.067659,1.706565,1.637053,3.810876,0.231112,…,0.0,,,,,,,0.0,0.182322,,0.0,,0.0,,,,0.336472,0.09531,0.875469,0.0,,,,1.238374,,1.05779,,,,,0.0,,0.04879,,,,0.0
"""SIDM01182""",1012,1.321538,1.456287,1.99606,0.019803,1.329724,1.153732,0.039221,2.777576,0.157004,0.0,2.016235,2.865624,2.906901,0.518794,0.631272,1.773256,0.215111,2.451005,0.00995,2.683758,1.826161,3.123246,1.717395,4.065259,1.474763,0.00995,0.0,2.969388,0.182322,0.039221,0.536493,0.122218,1.350667,3.367296,0.500775,…,0.0,,,,,,,0.0,0.0,,0.0,,0.0,,,,0.0,0.0,0.231112,0.0,,,,0.0,,0.41211,,,,,0.182322,,0.0,,,,0.0
"""SIDM01160""",1023,3.875126,0.463734,4.172231,0.0,1.244155,2.76001,0.609766,1.144223,0.131028,0.019803,1.134623,0.058269,0.500775,0.139762,2.791778,1.722767,0.364643,2.446685,0.223144,2.876386,1.368639,1.623341,0.039221,3.501043,2.048982,2.193886,2.237513,3.151025,3.046901,1.675226,0.788457,0.157004,1.627278,2.942331,1.791759,…,0.00995,,,,,,,0.0,0.0,,0.0,,0.0,,,,0.019803,0.0,0.0,0.0,,,,0.0,,0.746688,,,,,0.0,,0.019803,,,,0.0
"""SIDM00547""",1197,4.457386,0.039221,1.682688,0.039221,0.10436,1.401183,0.039221,0.500775,3.296947,0.0,2.398804,2.618125,0.058269,0.067659,3.569533,1.83737,0.14842,2.851862,0.00995,2.973487,2.276241,2.640485,0.039221,3.373484,0.693147,3.165897,0.0,2.921009,0.198851,0.019803,2.910174,1.418277,2.75557,4.174233,1.790091,…,0.0,,,,,,,0.0,0.039221,,0.0,,0.0,,,,0.0,0.0,0.29267,0.0,,,,0.604316,,0.500775,,,,,0.0,,0.0,,,,0.0
