In [2]:
# Cell 1: Imports and setup
import polars as pl
import numpy as np
import os
import pandas as pd

# Ensure output directory exists
os.makedirs("pseudo_bulk", exist_ok=True)


In [3]:
# Cell 2: Load and optimize GDSC bulk drug response data
gdsc_bulk = pl.read_parquet("gdsc/gdsc_final_cleaned.parquet").select([
    pl.col("SANGER_MODEL_ID").cast(pl.Utf8),
    pl.col("DRUG_ID").cast(pl.Int32),
    pl.col("LN_IC50").cast(pl.Float32)
])

print("GDSC bulk data loaded successfully.")
print(f"Shape: {gdsc_bulk.shape}")
gdsc_bulk.head()


GDSC bulk data loaded successfully.
Shape: (575197, 3)


SANGER_MODEL_ID,DRUG_ID,LN_IC50
str,i32,f32
"""SIDM00374""",1009,4.13448
"""SIDM00255""",268,-2.236015
"""SIDM01182""",1012,1.321538
"""SIDM01160""",1023,3.875126
"""SIDM00547""",1197,4.457386


In [4]:
# Cell 3: Load and transpose single-cell gene expression data
cell_gene_df = pd.read_parquet("sc_data/rnaseq_fpkm.parquet")
transposed_df = cell_gene_df.set_index(cell_gene_df.columns[0]).transpose()

# Ensure all values are numeric and fill NAs with zeros or a small value
transposed_df = transposed_df.apply(pd.to_numeric, errors='coerce').fillna(0.0)

# Reset index to turn cell line names into a column
transposed_df.index.name = "SANGER_MODEL_ID"
transposed_df.reset_index(inplace=True)

# Convert back to Polars
cell_gene_matrix = pl.from_pandas(transposed_df)

print("Transposed single-cell data to shape: rows = cell lines, cols = genes")
print(f"Shape: {cell_gene_matrix.shape}")
cell_gene_matrix.head()


Transposed single-cell data to shape: rows = cell lines, cols = genes
Shape: (1431, 37607)


SANGER_MODEL_ID,model_name,dataset_name,data_source,gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00001""",0.0,0.0,0.0,0.0,0.16,0.73,0.01,0.1,0.0,0.0,0.0,0.92,0.0,0.0,0.16,0.0,7.28,4.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.49,20.05,1.53,2.48,25.7,0.89,11.79,0.75,203.63,10.6,…,0.0,5.53,3.35,7.32,4.41,0.75,1.41,4.8,0.0,2.16,3.21,0.1,0.0,0.0,0.0,2.24,1.05,7.2,0.21,5.03,4.34,10.32,11.04,72.98,0.58,2.32,2.7,0.86,0.0,2.73,8.44,0.0,3.53,4.27,2.5,1.81,0.62
"""SIDM00002""",0.0,0.0,0.0,0.0,0.22,1.69,0.0,0.21,0.2,0.16,0.01,0.03,0.0,0.29,5.4,0.0,25.13,2.51,0.11,0.02,0.0,0.03,0.0,6.18,0.11,2.05,0.67,9.66,1.22,4.59,31.14,0.55,22.66,0.12,53.64,8.52,…,1.55,4.29,7.68,5.48,2.58,0.75,5.24,5.82,0.03,2.5,7.11,2.55,0.0,0.0,0.0,21.82,3.23,10.65,0.15,4.06,8.3,24.31,9.22,30.5,1.34,3.38,15.36,0.58,0.0,8.85,15.77,0.0,6.67,12.7,3.24,1.49,0.16
"""SIDM00003""",0.0,0.0,0.0,0.0,0.14,3.33,0.0,8.41,0.24,0.0,0.0,0.0,0.01,0.09,0.0,0.0,18.38,3.74,0.51,0.0,0.0,0.01,0.0,0.0,0.0,4.51,8.32,21.28,2.81,2.51,35.65,0.06,16.7,0.12,20.04,10.36,…,0.01,3.84,4.06,3.19,1.19,0.34,2.8,4.76,0.0,1.73,1.46,0.03,0.0,0.0,0.0,3.64,5.83,6.88,0.11,2.45,2.53,21.46,23.29,79.94,0.67,2.01,4.92,3.83,0.09,8.62,51.84,0.0,7.28,12.9,2.19,1.7,0.56
"""SIDM00005""",0.0,0.0,0.0,0.0,0.87,2.04,0.0,0.23,1.48,0.05,0.15,2.42,0.02,0.14,0.75,0.15,22.81,8.78,0.08,0.0,0.0,0.0,0.0,0.0,0.0,8.7,2.01,31.56,1.1,101.2,34.33,0.18,17.1,0.56,19.34,6.19,…,0.0,3.34,12.35,7.07,6.6,2.79,1.07,4.81,0.0,1.75,2.32,1.39,0.01,0.05,0.01,5.8,5.12,25.7,0.26,1.74,7.89,4.28,10.93,72.11,1.63,2.62,7.53,1.01,0.0,3.96,6.87,0.0,3.37,7.11,7.52,5.02,0.33
"""SIDM00006""",0.0,0.0,0.0,0.0,0.15,2.03,0.01,23.22,0.28,0.0,0.01,0.0,0.01,0.16,3.86,0.0,7.87,1.81,0.12,0.0,0.0,0.02,0.0,0.07,0.02,3.49,5.16,16.44,4.01,3.11,32.19,0.08,18.61,1.53,67.8,7.13,…,0.71,4.09,3.72,4.28,2.41,3.21,4.52,5.17,0.12,1.75,3.39,2.79,0.0,0.0,0.0,8.07,3.21,7.11,0.1,2.53,5.38,15.49,11.14,29.36,0.85,2.23,4.29,4.49,0.0,7.51,141.7,0.0,3.48,8.75,2.94,1.29,0.11


In [5]:
# Cell 4: Normalize using log1p (safe for 0 and small values)
gene_cols = [col for col in cell_gene_matrix.columns if col != "SANGER_MODEL_ID"]

normalized_before = cell_gene_matrix.with_columns([
    pl.col(col).cast(pl.Float64).log1p().alias(col) for col in gene_cols
])

# Group by SANGER_MODEL_ID (if duplicates exist)
pseudo_bulk = normalized_before.group_by("SANGER_MODEL_ID").agg([
    pl.col(col).mean().alias(col) for col in gene_cols
])

print("Normalization complete.")
print(f"Normalized Shape: {pseudo_bulk.shape}")


Normalization complete.
Normalized Shape: (1431, 37607)


In [6]:
# Cell 5: Select top 2000 highly variable genes
def top_variable_genes(df, cols, top_n=2000):
    data = df.select(cols).to_pandas().to_numpy()
    var = np.var(data, axis=0)
    top_indices = np.argsort(var)[-top_n:]
    selected = [cols[i] for i in top_indices]
    return ["SANGER_MODEL_ID"] + selected

top_genes = top_variable_genes(pseudo_bulk, gene_cols)
filtered_after = pseudo_bulk.select(top_genes)

print("Filtered to top 2000 highly variable genes.")
print(f"Filtered Shape: {filtered_after.shape}")


Filtered to top 2000 highly variable genes.
Filtered Shape: (1431, 2001)


In [7]:
# Cell 6: Merge with GDSC and save
merged = gdsc_bulk.join(filtered_after, on="SANGER_MODEL_ID", how="left")

print("Merged GDSC with pseudo-bulk expression.")
print(f"Final shape: {merged.shape}")
merged.head()


Merged GDSC with pseudo-bulk expression.
Final shape: (575197, 2003)


SANGER_MODEL_ID,DRUG_ID,LN_IC50,SIDG22565,SIDG00978,SIDG03466,SIDG40295,SIDG03584,SIDG09724,SIDG16884,SIDG07256,SIDG38226,SIDG04097,SIDG26742,SIDG40031,SIDG22627,SIDG39064,SIDG21177,SIDG08780,SIDG14602,SIDG17435,SIDG37000,SIDG03613,SIDG37326,SIDG36219,SIDG21980,SIDG24980,SIDG34491,SIDG03352,SIDG17539,SIDG06860,SIDG22683,SIDG02388,SIDG08619,SIDG15056,SIDG11319,SIDG40701,…,SIDG14055,SIDG00795,SIDG37633,SIDG33598,SIDG14546,SIDG03949,SIDG39685,SIDG03546,SIDG36432,SIDG35053,SIDG12427,SIDG05997,SIDG12423,SIDG03980,SIDG11411,SIDG34262,SIDG37575,SIDG17142,SIDG32797,SIDG33581,SIDG13982,SIDG10609,SIDG14466,SIDG37069,SIDG40258,SIDG34183,SIDG14064,SIDG09225,SIDG33597,SIDG00640,SIDG33601,SIDG07872,SIDG36158,SIDG40707,SIDG13984,SIDG14133,SIDG19416
str,i32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00374""",1009,4.13448,0.0,0.09531,1.23256,0.0,1.860975,0.09531,0.0,5.8172,2.516082,2.321407,0.198851,4.454464,1.702928,0.678034,0.113329,0.00995,0.00995,4.65396,0.0,0.198851,2.508786,1.656321,0.019803,2.442347,0.0,0.00995,4.051089,0.029559,0.924259,0.0,1.342865,0.00995,0.908259,0.0,…,0.139762,0.00995,0.0,0.019803,5.354414,0.067659,0.0,1.948763,7.332592,0.029559,0.00995,0.0,0.989541,6.37139,6.484574,0.41211,0.0,0.00995,0.0,0.039221,1.446919,0.0,0.0,0.076961,0.0,0.385262,0.019803,0.173953,1.470176,0.00995,0.113329,0.039221,1.393766,5.47185,0.058269,0.00995,8.201465
"""SIDM00255""",268,-2.236015,3.141995,0.019803,1.905088,0.0,3.955657,2.503892,1.860975,0.00995,0.029559,3.347093,0.506818,4.51437,1.373716,1.131402,0.029559,2.468947,2.315501,0.019803,1.975469,1.238374,3.161247,0.963174,0.0,2.410542,0.0,0.14842,0.067659,0.940007,0.029559,0.329304,2.875258,0.00995,2.596746,0.157004,…,0.00995,0.0,0.322083,0.0,0.832909,5.260823,0.0,0.039221,0.019803,0.0,0.444686,0.157004,2.473171,0.00995,0.0,0.10436,0.00995,1.121678,0.0,0.0,0.0,0.019803,0.0,0.029559,5.709201,0.0,0.019803,2.885917,0.00995,0.0,0.029559,0.357674,2.512035,4.614427,0.058269,0.157004,7.795054
"""SIDM01182""",1012,1.321538,1.249902,3.724005,0.239017,0.029559,1.442202,0.0,0.00995,0.058269,2.992728,3.629129,0.19062,4.265493,2.026832,1.627278,2.335052,2.336987,0.845868,0.542324,0.683097,1.715598,3.941582,2.258633,0.019803,4.493344,0.0,0.916291,0.039221,2.982647,0.0,0.00995,0.966984,1.803359,1.860975,0.336472,…,0.139762,2.620311,0.322083,4.828554,6.391079,2.294553,0.029559,4.184794,4.696381,0.215111,0.215111,5.248286,3.339677,0.165514,0.364643,0.113329,2.872434,0.19062,0.076961,2.759377,0.518794,0.392042,0.231112,0.086178,3.837515,6.43146,3.892024,3.3485,0.165514,0.086178,0.04879,0.24686,3.275634,5.75092,3.068983,0.270027,0.0
"""SIDM01160""",1023,3.875126,2.380472,0.0,0.10436,2.385086,2.057963,0.457425,0.371564,0.173953,2.593761,1.671473,2.577182,3.222469,0.8671,1.913977,2.612273,1.406097,0.029559,0.951658,0.970779,0.207014,3.440739,2.767576,0.09531,4.493009,0.0,0.113329,0.00995,2.338917,0.00995,0.10436,0.04879,1.391282,1.702928,1.82777,…,5.901813,0.058269,3.353057,4.990637,2.286456,4.581697,0.00995,4.73145,1.108563,3.093766,0.09531,4.907125,3.903789,2.228939,0.00995,4.547541,1.61542,4.777189,0.019803,6.286854,6.774921,4.869149,2.513656,5.995307,0.04879,4.351954,5.932484,2.254445,5.023091,3.135059,5.982575,4.023207,0.00995,0.00995,5.367703,6.523768,2.87976
"""SIDM00547""",1197,4.457386,1.075002,0.737164,0.14842,0.0,1.515127,0.04879,0.207014,0.04879,3.058707,3.447126,2.349469,3.592644,1.10194,1.054312,3.755135,0.765468,0.00995,0.00995,1.226712,0.00995,1.759581,2.099244,1.223775,3.736717,0.0,0.336472,0.029559,1.731656,0.708036,0.0,1.560248,1.308333,2.231089,0.0,…,0.598837,1.015231,1.495149,6.052959,2.073172,1.818077,3.042139,1.570697,3.255786,4.944211,0.122218,2.109,0.157004,6.534109,7.319951,4.170688,1.574846,5.53252,0.086178,7.222646,4.381151,0.29267,2.392426,1.986504,0.19062,1.418277,5.042005,0.307485,3.288402,5.493514,3.757706,3.898735,0.239017,4.798184,5.294209,4.818829,0.0


In [8]:
# Cell 7: Save final dataset
merged.write_parquet("pseudo_bulk/gdsc_single_cell_aligned.parquet")
print("Saved aligned data to pseudo_bulk/gdsc_single_cell_aligned.parquet")


Saved aligned data to pseudo_bulk/gdsc_single_cell_aligned.parquet
