# Align gene expression and GDSC Data into Pseudo-Bulk Format

This notebook performs the first preprocessing step in our pipeline:
- Align GDSC drug response data with gene expression data.
- Normalize expression values using `log1p`.




In [1]:
# Cell 1: Imports and setup
import polars as pl
import numpy as np
import os
import pandas as pd


## 1. Load GDSC Drug Response Data

We load the cleaned GDSC data and select only the relevant columns:
- `SANGER_MODEL_ID` for cell line
- `DRUG_ID` for compound identity
- `LN_IC50` for log-transformed drug sensitivity


In [2]:
gdsc_bulk = pl.read_parquet("../../data/processed/gdsc_final_cleaned.parquet").select([
    pl.col("SANGER_MODEL_ID").cast(pl.Utf8),
    pl.col("DRUG_ID").cast(pl.Int32),
    pl.col("LN_IC50").cast(pl.Float32)
])

print("✅ GDSC bulk data loaded.")
print(f"Shape: {gdsc_bulk.shape}")
gdsc_bulk.head()


✅ GDSC bulk data loaded.
Shape: (571985, 3)


SANGER_MODEL_ID,DRUG_ID,LN_IC50
str,i32,f32
"""SIDM00263""",1,3.966813
"""SIDM00269""",1,2.69209
"""SIDM00203""",1,2.47799
"""SIDM01111""",1,2.033564
"""SIDM00909""",1,2.966007


## 2. Load and Transpose Gene Expression

We transpose the gene expression matrix to get a format where:
- Each row = a cell line
- Each column = a gene

We also convert all values to numeric and fill missing values with `0.0`.


In [5]:
# Load and transpose gene expression data (TPM version)
cell_gene_df = pd.read_parquet("../../data/original/rnaseq_tpm.parquet")

# Transpose: rows = cell lines, cols = genes
transposed_df = cell_gene_df.set_index(cell_gene_df.columns[0]).transpose()

# Ensure numeric and fill NAs with small value (optional)
transposed_df = transposed_df.apply(pd.to_numeric, errors='coerce').fillna(0.0)

# Apply log2(TPM + 1) transformation
transformed_df = np.log2(transposed_df + 1)

# Reset index to turn cell line names into a column
transformed_df.index.name = "SANGER_MODEL_ID"
transformed_df.reset_index(inplace=True)

# Convert to Polars
cell_gene_matrix = pl.from_pandas(transformed_df)

print("Transformed gene expression data using log2(TPM + 1):")
print(f"Shape: {cell_gene_matrix.shape}")
cell_gene_matrix.head()


Transformed gene expression data using log2(TPM + 1):
Shape: (1432, 37607)


SANGER_MODEL_ID,model_name,dataset_name,data_source,gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Unnamed: 1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SIDM00001""",0.0,0.0,0.0,0.0,0.613532,1.778209,0.042644,0.411426,0.0,0.0,0.0,2.02148,0.0,0.0,0.613532,0.0,4.657068,3.852998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.269033,6.081936,2.606442,3.211012,6.435462,1.9855,5.330917,1.807355,9.406928,5.1815,…,0.0,4.278728,3.602884,4.665052,3.970854,1.807355,2.508429,4.085765,0.0,3.033863,3.547203,0.411426,0.0,0.0,0.0,3.080658,2.169925,4.642124,0.765535,4.148934,3.949535,5.143638,5.238405,7.930383,1.550901,3.124328,3.320485,1.948601,0.0,3.334854,4.862947,0.0,3.672425,3.926948,3.22033,2.813525,1.613532
"""SIDM00002""",0.0,0.0,0.0,0.0,0.831877,2.809414,0.0,0.807355,0.773996,0.650765,0.056584,0.15056,0.0,1.02148,4.336283,0.0,6.497772,3.311794,0.475085,0.097611,0.0,0.15056,0.0,4.522307,0.475085,3.051372,1.757023,5.143638,2.41684,4.114367,6.804002,1.565597,6.350144,0.516015,7.583083,4.968091,…,2.702658,4.023255,4.82324,4.356848,3.346248,1.87578,4.294988,4.439623,0.15056,3.305971,4.715893,3.331992,0.0,0.0,0.0,6.296274,3.642702,5.280585,0.613532,3.948601,4.931683,6.450386,5.078524,6.77426,2.528571,3.702658,5.797532,1.613532,0.0,5.021035,5.834913,0.0,4.627607,5.528571,3.646163,2.655352,0.650765
"""SIDM00003""",0.0,0.0,0.0,0.0,0.575312,3.648465,0.0,4.913608,0.871844,0.0,0.0,0.0,0.042644,0.389567,0.0,0.0,6.015471,3.803227,1.469886,0.0,0.0,0.042644,0.0,0.0,0.0,4.055716,4.898692,6.223809,3.424922,3.277985,6.96035,0.275007,5.879461,0.505891,6.138323,5.205549,…,0.042644,3.838952,3.913608,3.590961,2.356144,1.124328,3.419539,4.128458,0.0,2.805292,2.599318,0.137504,0.0,0.0,0.0,3.766595,4.405992,4.634593,0.464668,3.246408,3.288359,6.235727,6.352264,8.118941,1.731183,2.992768,4.173927,3.834913,0.389567,4.948134,7.496894,0.0,4.713146,5.514122,3.102658,2.784504,1.555816
"""SIDM00005""",0.0,0.0,0.0,0.0,2.100978,3.122673,0.0,0.903038,2.720278,0.250962,0.650765,3.343408,0.111031,0.613532,1.937344,0.650765,6.446422,5.09508,0.378512,0.0,0.0,0.0,0.0,0.0,0.0,5.082362,3.104337,6.910253,2.367371,8.583008,7.030667,0.748461,6.036284,1.641546,6.211207,4.608218,…,0.0,3.767655,5.575312,4.792855,4.697107,3.528571,2.333424,4.261531,0.0,2.927896,3.288359,2.643856,0.056584,0.250962,0.056584,4.518535,4.346957,6.616622,0.9855,2.922198,4.945795,4.102658,5.402927,8.095555,2.83996,3.446256,4.880686,2.269033,0.0,3.997292,4.752749,0.0,3.78031,4.800641,4.878725,4.319762,1.169925


In [6]:
# Drop unwanted columns
cell_gene_matrix = cell_gene_matrix.drop(["model_name", "dataset_name", "data_source", "gene_id"])

# Get null count per column and convert to 2-column format manually
nulls_series = cell_gene_matrix.null_count()
nulls_df = pl.DataFrame({
    "column": nulls_series.columns,
    "null_count": [nulls_series[0, col] for col in nulls_series.columns]
})

# Filter to only show columns with any nulls
nan_summary = nulls_df.filter(pl.col("null_count") > 0)

# Print summary
if nan_summary.height == 0:
    print("✅ No NaN values found in the dataset.")
else:
    print("⚠️ NaN values found in the following columns:")
    print(nan_summary)

cell_gene_matrix.head()


✅ No NaN values found in the dataset.


SANGER_MODEL_ID,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,SIDG00011,SIDG00012,SIDG00017,SIDG00019,SIDG00020,SIDG00021,SIDG00022,SIDG00023,SIDG00024,SIDG00025,SIDG00026,SIDG00027,SIDG00028,SIDG00029,SIDG00030,SIDG00031,SIDG00032,SIDG00033,SIDG00034,SIDG00035,SIDG00036,SIDG00037,SIDG00038,SIDG00039,SIDG00040,SIDG00041,…,SIDG42441,SIDG42442,SIDG42443,SIDG42444,SIDG42445,SIDG42446,SIDG42447,SIDG42448,SIDG42449,SIDG42450,SIDG42451,SIDG42452,SIDG42453,SIDG42454,SIDG42455,SIDG42456,SIDG42457,SIDG42458,SIDG42459,SIDG42460,SIDG42461,SIDG42462,SIDG42463,SIDG42464,SIDG42466,SIDG42467,SIDG42468,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Unnamed: 1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SIDM00001""",0.613532,1.778209,0.042644,0.411426,0.0,0.0,0.0,2.02148,0.0,0.0,0.613532,0.0,4.657068,3.852998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.269033,6.081936,2.606442,3.211012,6.435462,1.9855,5.330917,1.807355,9.406928,5.1815,4.05745,0.0,3.930737,5.177519,…,0.0,4.278728,3.602884,4.665052,3.970854,1.807355,2.508429,4.085765,0.0,3.033863,3.547203,0.411426,0.0,0.0,0.0,3.080658,2.169925,4.642124,0.765535,4.148934,3.949535,5.143638,5.238405,7.930383,1.550901,3.124328,3.320485,1.948601,0.0,3.334854,4.862947,0.0,3.672425,3.926948,3.22033,2.813525,1.613532
"""SIDM00002""",0.831877,2.809414,0.0,0.807355,0.773996,0.650765,0.056584,0.15056,0.0,1.02148,4.336283,0.0,6.497772,3.311794,0.475085,0.097611,0.0,0.15056,0.0,4.522307,0.475085,3.051372,1.757023,5.143638,2.41684,4.114367,6.804002,1.565597,6.350144,0.516015,7.583083,4.968091,3.087463,0.0,3.646163,5.835924,…,2.702658,4.023255,4.82324,4.356848,3.346248,1.87578,4.294988,4.439623,0.15056,3.305971,4.715893,3.331992,0.0,0.0,0.0,6.296274,3.642702,5.280585,0.613532,3.948601,4.931683,6.450386,5.078524,6.77426,2.528571,3.702658,5.797532,1.613532,0.0,5.021035,5.834913,0.0,4.627607,5.528571,3.646163,2.655352,0.650765
"""SIDM00003""",0.575312,3.648465,0.0,4.913608,0.871844,0.0,0.0,0.0,0.042644,0.389567,0.0,0.0,6.015471,3.803227,1.469886,0.0,0.0,0.042644,0.0,0.0,0.0,4.055716,4.898692,6.223809,3.424922,3.277985,6.96035,0.275007,5.879461,0.505891,6.138323,5.205549,2.307429,0.0,3.160275,5.331275,…,0.042644,3.838952,3.913608,3.590961,2.356144,1.124328,3.419539,4.128458,0.0,2.805292,2.599318,0.137504,0.0,0.0,0.0,3.766595,4.405992,4.634593,0.464668,3.246408,3.288359,6.235727,6.352264,8.118941,1.731183,2.992768,4.173927,3.834913,0.389567,4.948134,7.496894,0.0,4.713146,5.514122,3.102658,2.784504,1.555816
"""SIDM00005""",2.100978,3.122673,0.0,0.903038,2.720278,0.250962,0.650765,3.343408,0.111031,0.613532,1.937344,0.650765,6.446422,5.09508,0.378512,0.0,0.0,0.0,0.0,0.0,0.0,5.082362,3.104337,6.910253,2.367371,8.583008,7.030667,0.748461,6.036284,1.641546,6.211207,4.608218,4.155425,0.0,4.124328,4.363171,…,0.0,3.767655,5.575312,4.792855,4.697107,3.528571,2.333424,4.261531,0.0,2.927896,3.288359,2.643856,0.056584,0.250962,0.056584,4.518535,4.346957,6.616622,0.9855,2.922198,4.945795,4.102658,5.402927,8.095555,2.83996,3.446256,4.880686,2.269033,0.0,3.997292,4.752749,0.0,3.78031,4.800641,4.878725,4.319762,1.169925


## 3. Normalize and Aggregate (Pseudo-Bulk)

We apply `log1p` transformation to expression values for normalization,
then group by `SANGER_MODEL_ID` to get a single profile per cell line (pseudo-bulk).


In [None]:
gene_cols = [col for col in cell_gene_matrix.columns if col != "SANGER_MODEL_ID"]

# Normalize using log1p
normalized = cell_gene_matrix.with_columns([
    pl.col(col).cast(pl.Float64).log1p().alias(col) for col in gene_cols
])

# Aggregate expression by cell line
pseudo_bulk = normalized.group_by("SANGER_MODEL_ID").agg([
    pl.col(col).mean().alias(col) for col in gene_cols
])

print("✅ Normalization and aggregation complete.")
print(f"Shape: {pseudo_bulk.shape}")


✅ Normalization and aggregation complete.
Shape: (1432, 37603)


: 

## 4. Merge Pseudo-Bulk Expression with Drug Response

We align the normalized expression matrix with drug response data by matching `SANGER_MODEL_ID`.


In [None]:
merged = gdsc_bulk.join(pseudo_bulk, on="SANGER_MODEL_ID", how="left")

print("✅ Merged GDSC with pseudo-bulk expression.")
print(f"Final shape: {merged.shape}")
merged.head()


## 5. Save Final Aligned Dataset

We save the aligned (but unfiltered/unprojected) dataset to be used in downstream PCA or HVG filtering steps.


In [None]:
merged.write_parquet("../../data/processed/bulk_final.parquet")
print("📁 Saved aligned dataset to '../../data/bulk/bulk_final.parquet'")


📁 Saved aligned dataset to '../../data/bulk/bulk_final.parquet'
