In [5]:
import polars as pl

# Load the pseudo-bulk dataset
pseudo_bulk = pl.read_parquet("../../data/pseudo_bulk/gdsc_single_cell_aligned.parquet")

# Preview columns
print("🧠 All columns:")
print(pseudo_bulk.columns[:10])  # show first few for context

# Drop metadata columns if present
columns_to_drop = ["model_name", "dataset_name", "data_source", "gene_id"]
columns_present = [col for col in columns_to_drop if col in pseudo_bulk.columns]
cleaned = pseudo_bulk.drop(columns_present)

print(f"🧼 Dropped columns: {columns_present}")
print(f"✅ Cleaned shape: {cleaned.shape}")


🧠 All columns:
['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50', 'model_name', 'dataset_name', 'data_source', 'gene_id', 'SIDG00001', 'SIDG00002', 'SIDG00003']
🧼 Dropped columns: ['model_name', 'dataset_name', 'data_source', 'gene_id']
✅ Cleaned shape: (575197, 37605)


In [None]:
# 1. Print a few rows
print("\n🔎 Sample rows:")
print(cleaned.head())

# 2. Check for NaNs (there shouldn’t be any)
null_counts = cleaned.null_count()
print("\n🧪 Null values per column (top 5):")
print(null_counts[:5])

# 3. Check that gene expression columns have numeric values
non_gene_cols = ["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"]
gene_cols = [col for col in cleaned.columns if col not in non_gene_cols]

# Check basic stats across all gene columns
stats = cleaned.select([
    pl.col(col).cast(pl.Float64).describe().alias(col) for col in gene_cols[:5]  # first 5 genes
])
print("\n📊 Example gene stats (first 5 genes):")
print(stats)



🔎 Sample rows:
shape: (5, 37_605)
┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ SANGER_MOD ┆ DRUG_ID ┆ LN_IC50   ┆ SIDG00001 ┆ … ┆ SIDG42475 ┆ SIDG42479 ┆ SIDG42480 ┆ SIDG42481 │
│ EL_ID      ┆ ---     ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ ---        ┆ i32     ┆ f32       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
│ str        ┆         ┆           ┆           ┆   ┆           ┆           ┆           ┆           │
╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ SIDM00374  ┆ 1009    ┆ 4.13448   ┆ 0.086178  ┆ … ┆ 1.7613    ┆ 1.713798  ┆ 0.932164  ┆ 0.425268  │
│ SIDM00255  ┆ 268     ┆ -2.236015 ┆ 0.254642  ┆ … ┆ 2.109     ┆ 2.418589  ┆ 1.196948  ┆ 0.470004  │
│ SIDM01182  ┆ 1012    ┆ 1.321538  ┆ 0.19062   ┆ … ┆ 2.291524  ┆ 1.517323  ┆ 0.854415  ┆ 0.157004  │
│ SIDM01160  ┆ 1023    ┆ 3.875126  ┆ 0.039221  ┆ … ┆ 2.1

AttributeError: 'Expr' object has no attribute 'describe'

: 

In [None]:
# Look at the distribution of values for one example gene
import matplotlib.pyplot as plt

sample_gene = gene_cols[0]  # pick first gene
gene_values = cleaned[sample_gene].to_numpy()

plt.hist(gene_values, bins=30)
plt.title(f"Distribution of Expression Values: {sample_gene}")
plt.xlabel("Expression (log1p)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()


🔹 Number of unique cell lines in single-cell: 37606
✅ Number of matching cell line IDs: 0
🔗 Sample matching IDs: []
❌ Cell lines in GDSC not found in single-cell: 978
