# Align Gene Expression and GDSC Data into Pseudo-Bulk Format

This notebook performs the first preprocessing step in our pipeline:
- Align GDSC drug response data with gene expression data.
- Normalize expression values using `log1p`.




In [1]:
# Cell 1: Imports and setup
import polars as pl
import numpy as np
import os
import pandas as pd


## 1. Load GDSC Drug Response Data

We load the cleaned GDSC data and select only the relevant columns:
- `SANGER_MODEL_ID` for cell line
- `DRUG_ID` for compound identity
- `LN_IC50` for log-transformed drug sensitivity


In [2]:
# Columns kept from the cleaned GDSC table, each pinned to an explicit dtype.
gdsc_dtypes = {
    "SANGER_MODEL_ID": pl.Utf8,   # cell line
    "DRUG_ID": pl.Int32,          # compound identity
    "LN_IC50": pl.Float32,        # log-transformed drug sensitivity
}

# Read the parquet file and project/cast in one pass; dict order fixes the
# output column order (SANGER_MODEL_ID, DRUG_ID, LN_IC50).
gdsc_bulk = pl.read_parquet("../../data/bulk/gdsc_final_cleaned.parquet").select(
    [pl.col(name).cast(dtype) for name, dtype in gdsc_dtypes.items()]
)

print("✅ GDSC bulk data loaded.")
print(f"Shape: {gdsc_bulk.shape}")
gdsc_bulk.head()


✅ GDSC bulk data loaded.
Shape: (571985, 3)


SANGER_MODEL_ID,DRUG_ID,LN_IC50
str,i32,f32
"""SIDM00374""",1009,4.13448
"""SIDM00255""",268,-2.236015
"""SIDM01182""",1012,1.321538
"""SIDM01160""",1023,3.875126
"""SIDM00547""",1197,4.457386


## 2. Load and Transpose Gene Expression

We transpose the gene expression matrix to get a format where:
- Each row = a cell line
- Each column = a gene

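We also coerce all values to numeric and fill missing values with `0.0`. Non-numeric entries are coerced the same way, so the metadata fields (`model_name`, `dataset_name`, `data_source`, `gene_id`) show up as all-zero columns until they are dropped in the next cell.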


In [3]:
# Cell 3: Load and transpose gene expression data
cell_gene_df = pd.read_parquet("../../data/bulk/rnaseq_fpkm.parquet")

# The first column holds the row labels. After transposing, each row is one
# column of the source file (a cell line) and each column is one source row.
transposed_df = cell_gene_df.set_index(cell_gene_df.columns[0]).transpose()

# Coerce everything to numeric; entries that cannot be parsed become NaN.
transposed_df = transposed_df.apply(pd.to_numeric, errors="coerce")

# Count what is about to be zero-filled. Once filled, these entries are
# indistinguishable from genuine zeros, so no later null check can see them.
missing_per_col = transposed_df.isna().sum()
n_filled = int(missing_per_col.sum())
n_filled_cols = int((missing_per_col > 0).sum())
print(f"Zero-filling {n_filled} missing/non-numeric values across {n_filled_cols} columns")

# Fill the missing / non-numeric entries with 0.0
transposed_df = transposed_df.fillna(0.0)

# Turn the cell line labels into a regular column (chained, not in-place)
transposed_df = transposed_df.rename_axis("SANGER_MODEL_ID").reset_index()

# Convert back to Polars
cell_gene_matrix = pl.from_pandas(transposed_df)

print("Transposed gene expression data to shape: rows = cell lines, cols = genes")
print(f"Shape: {cell_gene_matrix.shape}")
cell_gene_matrix.head()


Transposed gene expression data to shape: rows = cell lines, cols = genes
Shape: (1431, 37607)


SANGER_MODEL_ID,model_name,dataset_name,data_source,gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00001""",0.0,0.0,0.0,0.0,0.16,0.73,0.01,0.1,0.0,0.0,0.0,0.92,0.0,0.0,0.16,0.0,7.28,4.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.49,20.05,1.53,2.48,25.7,0.89,11.79,0.75,203.63,10.6,…,0.0,5.53,3.35,7.32,4.41,0.75,1.41,4.8,0.0,2.16,3.21,0.1,0.0,0.0,0.0,2.24,1.05,7.2,0.21,5.03,4.34,10.32,11.04,72.98,0.58,2.32,2.7,0.86,0.0,2.73,8.44,0.0,3.53,4.27,2.5,1.81,0.62
"""SIDM00002""",0.0,0.0,0.0,0.0,0.22,1.69,0.0,0.21,0.2,0.16,0.01,0.03,0.0,0.29,5.4,0.0,25.13,2.51,0.11,0.02,0.0,0.03,0.0,6.18,0.11,2.05,0.67,9.66,1.22,4.59,31.14,0.55,22.66,0.12,53.64,8.52,…,1.55,4.29,7.68,5.48,2.58,0.75,5.24,5.82,0.03,2.5,7.11,2.55,0.0,0.0,0.0,21.82,3.23,10.65,0.15,4.06,8.3,24.31,9.22,30.5,1.34,3.38,15.36,0.58,0.0,8.85,15.77,0.0,6.67,12.7,3.24,1.49,0.16
"""SIDM00003""",0.0,0.0,0.0,0.0,0.14,3.33,0.0,8.41,0.24,0.0,0.0,0.0,0.01,0.09,0.0,0.0,18.38,3.74,0.51,0.0,0.0,0.01,0.0,0.0,0.0,4.51,8.32,21.28,2.81,2.51,35.65,0.06,16.7,0.12,20.04,10.36,…,0.01,3.84,4.06,3.19,1.19,0.34,2.8,4.76,0.0,1.73,1.46,0.03,0.0,0.0,0.0,3.64,5.83,6.88,0.11,2.45,2.53,21.46,23.29,79.94,0.67,2.01,4.92,3.83,0.09,8.62,51.84,0.0,7.28,12.9,2.19,1.7,0.56
"""SIDM00005""",0.0,0.0,0.0,0.0,0.87,2.04,0.0,0.23,1.48,0.05,0.15,2.42,0.02,0.14,0.75,0.15,22.81,8.78,0.08,0.0,0.0,0.0,0.0,0.0,0.0,8.7,2.01,31.56,1.1,101.2,34.33,0.18,17.1,0.56,19.34,6.19,…,0.0,3.34,12.35,7.07,6.6,2.79,1.07,4.81,0.0,1.75,2.32,1.39,0.01,0.05,0.01,5.8,5.12,25.7,0.26,1.74,7.89,4.28,10.93,72.11,1.63,2.62,7.53,1.01,0.0,3.96,6.87,0.0,3.37,7.11,7.52,5.02,0.33
"""SIDM00006""",0.0,0.0,0.0,0.0,0.15,2.03,0.01,23.22,0.28,0.0,0.01,0.0,0.01,0.16,3.86,0.0,7.87,1.81,0.12,0.0,0.0,0.02,0.0,0.07,0.02,3.49,5.16,16.44,4.01,3.11,32.19,0.08,18.61,1.53,67.8,7.13,…,0.71,4.09,3.72,4.28,2.41,3.21,4.52,5.17,0.12,1.75,3.39,2.79,0.0,0.0,0.0,8.07,3.21,7.11,0.1,2.53,5.38,15.49,11.14,29.36,0.85,2.23,4.29,4.49,0.0,7.51,141.7,0.0,3.48,8.75,2.94,1.29,0.11


In [4]:
# Drop the non-gene metadata columns. Only columns that are still present are
# dropped, so re-running this cell does not fail on an already-removed column.
metadata_cols = ["model_name", "dataset_name", "data_source", "gene_id"]
cell_gene_matrix = cell_gene_matrix.drop(
    [col for col in metadata_cols if col in cell_gene_matrix.columns]
)

# Polars tracks nulls and floating-point NaNs separately, so count both.
nulls_series = cell_gene_matrix.null_count()

# is_nan() is only defined for float columns; every other column counts as 0.
float_cols = [
    col for col, dtype in cell_gene_matrix.schema.items()
    if dtype in (pl.Float32, pl.Float64)
]
nan_counts = dict.fromkeys(cell_gene_matrix.columns, 0)
if float_cols:
    nan_row = cell_gene_matrix.select(pl.col(float_cols).is_nan().sum()).row(0)
    nan_counts.update(
        (col, int(count or 0)) for col, count in zip(float_cols, nan_row)
    )

# One row per column: name, null count, NaN count
nulls_df = pl.DataFrame({
    "column": nulls_series.columns,
    "null_count": [int(count) for count in nulls_series.row(0)],
    "nan_count": [nan_counts[col] for col in nulls_series.columns],
})

# Filter to only show columns with any nulls or NaNs
nan_summary = nulls_df.filter(
    (pl.col("null_count") > 0) | (pl.col("nan_count") > 0)
)

# Print summary
if nan_summary.height == 0:
    print("✅ No null or NaN values found in the dataset.")
else:
    print("⚠️ Null or NaN values found in the following columns:")
    print(nan_summary)

cell_gene_matrix.head()


✅ No NaN values found in the dataset.


SANGER_MODEL_ID,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,SIDG00038,SIDG00039,SIDG00040,SIDG00041,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00001""",0.16,0.73,0.01,0.1,0.0,0.0,0.0,0.92,0.0,0.0,0.16,0.0,7.28,4.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.49,20.05,1.53,2.48,25.7,0.89,11.79,0.75,203.63,10.6,4.7,0.0,4.28,10.57,…,0.0,5.53,3.35,7.32,4.41,0.75,1.41,4.8,0.0,2.16,3.21,0.1,0.0,0.0,0.0,2.24,1.05,7.2,0.21,5.03,4.34,10.32,11.04,72.98,0.58,2.32,2.7,0.86,0.0,2.73,8.44,0.0,3.53,4.27,2.5,1.81,0.62
"""SIDM00002""",0.22,1.69,0.0,0.21,0.2,0.16,0.01,0.03,0.0,0.29,5.4,0.0,25.13,2.51,0.11,0.02,0.0,0.03,0.0,6.18,0.11,2.05,0.67,9.66,1.22,4.59,31.14,0.55,22.66,0.12,53.64,8.52,2.11,0.0,3.24,15.78,…,1.55,4.29,7.68,5.48,2.58,0.75,5.24,5.82,0.03,2.5,7.11,2.55,0.0,0.0,0.0,21.82,3.23,10.65,0.15,4.06,8.3,24.31,9.22,30.5,1.34,3.38,15.36,0.58,0.0,8.85,15.77,0.0,6.67,12.7,3.24,1.49,0.16
"""SIDM00003""",0.14,3.33,0.0,8.41,0.24,0.0,0.0,0.0,0.01,0.09,0.0,0.0,18.38,3.74,0.51,0.0,0.0,0.01,0.0,0.0,0.0,4.51,8.32,21.28,2.81,2.51,35.65,0.06,16.7,0.12,20.04,10.36,1.14,0.0,2.29,11.33,…,0.01,3.84,4.06,3.19,1.19,0.34,2.8,4.76,0.0,1.73,1.46,0.03,0.0,0.0,0.0,3.64,5.83,6.88,0.11,2.45,2.53,21.46,23.29,79.94,0.67,2.01,4.92,3.83,0.09,8.62,51.84,0.0,7.28,12.9,2.19,1.7,0.56
"""SIDM00005""",0.87,2.04,0.0,0.23,1.48,0.05,0.15,2.42,0.02,0.14,0.75,0.15,22.81,8.78,0.08,0.0,0.0,0.0,0.0,0.0,0.0,8.7,2.01,31.56,1.1,101.2,34.33,0.18,17.1,0.56,19.34,6.19,4.45,0.0,4.35,5.18,…,0.0,3.34,12.35,7.07,6.6,2.79,1.07,4.81,0.0,1.75,2.32,1.39,0.01,0.05,0.01,5.8,5.12,25.7,0.26,1.74,7.89,4.28,10.93,72.11,1.63,2.62,7.53,1.01,0.0,3.96,6.87,0.0,3.37,7.11,7.52,5.02,0.33
"""SIDM00006""",0.15,2.03,0.01,23.22,0.28,0.0,0.01,0.0,0.01,0.16,3.86,0.0,7.87,1.81,0.12,0.0,0.0,0.02,0.0,0.07,0.02,3.49,5.16,16.44,4.01,3.11,32.19,0.08,18.61,1.53,67.8,7.13,1.25,0.0,2.11,8.32,…,0.71,4.09,3.72,4.28,2.41,3.21,4.52,5.17,0.12,1.75,3.39,2.79,0.0,0.0,0.0,8.07,3.21,7.11,0.1,2.53,5.38,15.49,11.14,29.36,0.85,2.23,4.29,4.49,0.0,7.51,141.7,0.0,3.48,8.75,2.94,1.29,0.11


## 3. Normalize and Aggregate (Pseudo-Bulk)

We apply the `log1p` transformation (`log(1 + x)`) to expression values for normalization,
then take the per-gene mean within each `SANGER_MODEL_ID` to get a single profile per cell line (pseudo-bulk).


In [5]:
# Every column except the cell line identifier is treated as a gene column.
gene_cols = [col for col in cell_gene_matrix.columns if col != "SANGER_MODEL_ID"]

# Normalize using log1p, i.e. log(1 + x), so zero expression stays at zero.
# A single multi-column expression replaces one expression object per gene.
normalized = cell_gene_matrix.with_columns(
    pl.col(gene_cols).cast(pl.Float64).log1p()
)

# Aggregate expression by cell line: per-gene mean of the log1p values.
# maintain_order=True keeps cell lines in first-appearance order; without it
# the row order of a Polars group_by is not guaranteed to be stable across runs.
pseudo_bulk = normalized.group_by("SANGER_MODEL_ID", maintain_order=True).agg(
    pl.col(gene_cols).mean()
)

print("✅ Normalization and aggregation complete.")
print(f"Shape: {pseudo_bulk.shape}")


✅ Normalization and aggregation complete.
Shape: (1431, 37603)


## 4. Merge Pseudo-Bulk Expression with Drug Response

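We align the normalized expression matrix with drug response data by left-joining on `SANGER_MODEL_ID`: every GDSC row is kept, and rows whose cell line has no expression profile get null values in all gene columns.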


In [6]:
# Left join: every GDSC row is kept. Rows whose cell line has no expression
# profile end up with nulls in all gene columns.
merged = gdsc_bulk.join(pseudo_bulk, on="SANGER_MODEL_ID", how="left")

# A left join against unique keys must not change the number of drug-response
# rows; more rows would mean duplicated SANGER_MODEL_IDs in pseudo_bulk.
assert merged.height == gdsc_bulk.height, (
    "Join changed the row count: pseudo_bulk has duplicate SANGER_MODEL_ID values"
)

# Surface the rows that found no expression profile instead of leaving them
# as silent all-null rows for downstream steps to trip over.
unmatched = gdsc_bulk.join(
    pseudo_bulk.select("SANGER_MODEL_ID"), on="SANGER_MODEL_ID", how="anti"
)

print("✅ Merged GDSC with pseudo-bulk expression.")
print(f"Final shape: {merged.shape}")
if unmatched.height > 0:
    n_unmatched_lines = unmatched["SANGER_MODEL_ID"].n_unique()
    print(
        f"⚠️ {unmatched.height} drug-response rows ({n_unmatched_lines} cell lines) "
        "have no expression profile; their gene columns are null."
    )
merged.head()


✅ Merged GDSC with pseudo-bulk expression.
Final shape: (571985, 37605)


SANGER_MODEL_ID,DRUG_ID,LN_IC50,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,SIDG00038,SIDG00039,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,i32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SIDM00374""",1009,4.13448,0.086178,1.373716,0.0,0.307485,0.029559,0.0,0.0,0.039221,0.0,0.019803,0.00995,0.0,2.493205,1.22083,0.131028,0.0,0.0,0.019803,0.0,0.0,0.0,0.0,1.871802,2.727199,1.521699,0.565314,3.236716,0.019803,2.537657,0.09531,4.564765,2.317474,1.160021,0.0,…,0.182322,1.510722,1.342865,1.947338,0.86289,0.182322,2.247072,1.591274,0.00995,0.978326,2.291524,0.04879,0.0,0.0,0.0,0.788457,0.518794,2.884801,0.371564,1.229641,1.60543,2.398804,1.7492,3.769307,0.0,1.088562,1.791759,1.532557,0.0,1.545433,3.14329,0.0,2.143589,1.7613,1.713798,0.932164,0.425268
"""SIDM00255""",268,-2.236015,0.254642,0.732368,0.0,0.0,0.113329,0.0,0.00995,0.067659,0.0,0.09531,0.277632,0.0,2.74084,1.798404,0.039221,0.0,0.0,0.019803,0.0,0.00995,0.0,2.406044,1.137833,2.213754,1.671473,1.449269,3.249211,0.09531,3.12016,0.215111,3.894877,2.327278,1.495149,0.0,…,0.908259,1.981001,2.165619,1.951608,2.21047,0.993252,1.449269,1.829376,0.019803,1.205971,2.181547,1.510722,0.0,0.0,0.0,3.017983,1.693779,2.123458,0.157004,1.061257,1.90806,2.155245,2.400619,3.800868,0.173953,1.410987,2.450143,1.572774,0.0,2.276241,2.68239,0.0,1.860975,2.109,2.418589,1.196948,0.470004
"""SIDM01182""",1012,1.321538,0.19062,0.329304,0.0,0.019803,0.0,0.0,0.0,0.463734,0.00995,0.076961,1.196948,0.00995,3.17346,0.845868,0.0,0.076961,0.0,0.165514,0.0,0.0,0.598837,1.958685,1.795087,3.709417,1.156881,1.809927,3.424263,0.00995,2.433613,0.00995,4.218772,1.888584,1.223775,0.0,…,0.951658,1.442202,2.187174,1.576915,0.947789,0.530628,0.71784,2.004179,0.0,1.18479,1.640937,0.086178,0.0,0.0,0.0,1.408545,1.902108,2.282382,0.113329,1.534714,2.302585,2.890372,3.572907,3.708437,0.405465,1.015231,1.517323,1.731656,0.0,2.042518,4.065259,0.0,2.236445,2.291524,1.517323,0.854415,0.157004
"""SIDM01160""",1023,3.875126,0.039221,0.609766,0.0,0.0,0.10436,0.0,0.0,0.0,0.0,0.165514,2.247072,0.0,2.454447,1.34025,0.139762,4.280132,0.0,0.019803,0.0,0.0,2.008214,2.13061,1.166271,3.15359,1.934416,1.193922,3.716738,0.039221,3.261552,0.0,4.455509,2.346602,1.095273,0.0,…,0.476234,1.470176,2.119863,2.572612,1.015231,0.683097,1.967112,2.213754,0.0,1.366092,1.766442,0.559616,0.0,0.0,0.0,1.729884,1.928619,2.395164,0.10436,2.09433,1.629241,1.965713,2.048982,2.083185,0.883768,1.430311,2.372111,0.00995,0.0,2.109,3.501043,0.0,2.182675,2.131797,2.228939,0.970779,0.329304
"""SIDM00547""",1197,4.457386,0.058269,0.157004,0.067659,0.039221,0.04879,0.0,0.04879,0.506818,0.0,0.0,0.300105,0.0,2.776332,1.18479,0.00995,3.129826,0.04879,0.982078,0.0,0.0,2.858766,0.662688,0.989541,3.158276,0.65752,1.798404,3.67402,0.113329,2.444085,0.00995,4.373994,2.643334,1.747459,0.0,…,0.542324,1.583094,1.472472,1.701105,0.536493,0.924259,0.451076,1.551809,0.0,0.737164,0.815365,1.710188,0.0,0.0,0.0,1.673351,1.941615,2.703373,0.113329,1.832581,2.313525,2.554122,2.890927,4.019082,0.336472,0.86289,1.675226,0.00995,0.0,1.280934,3.373484,0.0,1.93297,1.998774,1.401183,0.936093,0.198851


## 5. Save Final Aligned Dataset

We save the aligned (but unfiltered/unprojected) dataset to be used in downstream PCA or HVG filtering steps.


In [None]:
# Persist the aligned table; the path is held once so the file written and the
# confirmation message cannot drift apart.
output_path = "../../data/bulk/bulk_final.parquet"
merged.write_parquet(output_path)
print(f"📁 Saved aligned dataset to '{output_path}'")


📁 Saved aligned dataset to '../../data/bulk/bulk_final.parquet'


: 