In [1]:
import polars as pl

In [2]:
# Read the processed GDSC table (GDSC1 and GDSC2) from its parquet file.
gdsc_path = "../../data/processed/gdsc_with_split_drug_ids.parquet"
gdsc = pl.read_parquet(gdsc_path)

# Preview the leading five rows as a quick sanity check of the loaded frame.
print(gdsc.head(5))

shape: (5, 21)
┌─────────┬─────────────┬─────────────┬───────────┬───┬──────────┬───────────┬────────┬────────────┐
│ DATASET ┆ NLME_RESULT ┆ NLME_CURVE_ ┆ COSMIC_ID ┆ … ┆ RMSE     ┆ Z_SCORE   ┆ SOURCE ┆ DRUG_ID_OR │
│ ---     ┆ _ID         ┆ ID          ┆ ---       ┆   ┆ ---      ┆ ---       ┆ ---    ┆ IGINAL     │
│ str     ┆ ---         ┆ ---         ┆ i64       ┆   ┆ f64      ┆ f64       ┆ str    ┆ ---        │
│         ┆ i64         ┆ i64         ┆           ┆   ┆          ┆           ┆        ┆ i64        │
╞═════════╪═════════════╪═════════════╪═══════════╪═══╪══════════╪═══════════╪════════╪════════════╡
│ GDSC1   ┆ 342         ┆ 15580432    ┆ 684057    ┆ … ┆ 0.026081 ┆ 1.299144  ┆ GDSC1  ┆ 1          │
│ GDSC1   ┆ 342         ┆ 15580806    ┆ 684059    ┆ … ┆ 0.110059 ┆ 0.156076  ┆ GDSC1  ┆ 1          │
│ GDSC1   ┆ 342         ┆ 15581198    ┆ 684062    ┆ … ┆ 0.087019 ┆ -0.035912 ┆ GDSC1  ┆ 1          │
│ GDSC1   ┆ 342         ┆ 15581542    ┆ 684072    ┆ … ┆ 0.01629  ┆ -0.434437

In [4]:
# Only these columns are carried forward; every other column of `gdsc`
# is left out of the cleaned frame.
columns_to_keep = ["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"]

# Project the full frame down to the retained columns (new frame; `gdsc`
# itself is not modified).
gdsc_cleaned = gdsc.select(columns_to_keep)

print(f"🔹 Cleaned dataset shape: {gdsc_cleaned.shape}")


🔹 Cleaned dataset shape: (575197, 3)


In [5]:
# Count the null entries in LN_IC50.
# NOTE(review): this counts polars nulls only; floating-point NaN values, if
# the upstream parquet can contain any, would not be counted or filled here —
# confirm against the producer of the input file.
missing_ln_ic50 = gdsc_cleaned["LN_IC50"].null_count()
print(f"\n📊 Missing Values in LN_IC50: {missing_ln_ic50}")

if missing_ln_ic50 > 0:
    # Two-stage imputation expressed as one chained expression:
    #   1. replace a null with the mean LN_IC50 of its
    #      (SANGER_MODEL_ID, DRUG_ID) group;
    #   2. anything still null after that (e.g. a group with no non-null
    #      value to average) falls back to 0.
    group_mean = pl.col("LN_IC50").mean().over(["SANGER_MODEL_ID", "DRUG_ID"])
    gdsc_cleaned = gdsc_cleaned.with_columns(
        pl.col("LN_IC50").fill_null(group_mean).fill_null(0)
    )



📊 Missing Values in LN_IC50: 0


In [6]:
# Persist the cleaned frame as parquet, then report completion and the shape
# of the frame that was written.
output_path = "../../data/processed/gdsc_final_cleaned.parquet"
gdsc_cleaned.write_parquet(output_path)

print("✅ Final merged and cleaned GDSC dataset saved!")
print(f"📌 Final dataset shape: {gdsc_cleaned.shape}")


✅ Final merged and cleaned GDSC dataset saved!
📌 Final dataset shape: (575197, 3)
