In [5]:
import polars as pl

# Load the pseudo-bulk dataset
pseudo_bulk = pl.read_parquet("../../data/pseudo_bulk/gdsc_single_cell_aligned.parquet")

# Preview columns
print("üß† All columns:")
print(pseudo_bulk.columns[:10])  # show first few for context

# Drop metadata columns if present
columns_to_drop = ["model_name", "dataset_name", "data_source", "gene_id"]
columns_present = [col for col in columns_to_drop if col in pseudo_bulk.columns]
cleaned = pseudo_bulk.drop(columns_present)

print(f"üßº Dropped columns: {columns_present}")
print(f"‚úÖ Cleaned shape: {cleaned.shape}")


üß† All columns:
['SANGER_MODEL_ID', 'DRUG_ID', 'LN_IC50', 'model_name', 'dataset_name', 'data_source', 'gene_id', 'SIDG00001', 'SIDG00002', 'SIDG00003']
üßº Dropped columns: ['model_name', 'dataset_name', 'data_source', 'gene_id']
‚úÖ Cleaned shape: (575197, 37605)


In [None]:
# 1. Print a few rows
print("\nüîé Sample rows:")
print(cleaned.head())

# 2. Check for NaNs (there shouldn‚Äôt be any)
null_counts = cleaned.null_count()
print("\nüß™ Null values per column (top 5):")
print(null_counts[:5])

# 3. Check that gene expression columns have numeric values
non_gene_cols = ["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"]
gene_cols = [col for col in cleaned.columns if col not in non_gene_cols]

# Check basic stats across all gene columns
stats = cleaned.select([
    pl.col(col).cast(pl.Float64).describe().alias(col) for col in gene_cols[:5]  # first 5 genes
])
print("\nüìä Example gene stats (first 5 genes):")
print(stats)



üîé Sample rows:
shape: (5, 37_605)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ SANGER_MOD ‚îÜ DRUG_ID ‚îÜ LN_IC50   ‚îÜ SIDG00001 ‚îÜ ‚Ä¶ ‚îÜ SIDG42475 ‚îÜ SIDG42479 ‚îÜ SIDG42480 ‚îÜ SIDG42481 ‚îÇ
‚îÇ EL_ID      ‚îÜ ---     ‚îÜ ---       ‚îÜ ---       ‚îÜ   ‚îÜ ---       ‚îÜ ---       ‚îÜ ---       ‚îÜ ---       ‚îÇ
‚îÇ ---        ‚îÜ i32     ‚îÜ f32       ‚îÜ f64       ‚îÜ   ‚îÜ f64       ‚îÜ f64       ‚îÜ f64       ‚îÜ f64       ‚îÇ
‚îÇ str        ‚îÜ         ‚îÜ           ‚îÜ           ‚îÜ   ‚îÜ           ‚îÜ           ‚îÜ           ‚îÜ           ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚

AttributeError: 'Expr' object has no attribute 'describe'

: 

In [None]:
# Look at the distribution of values for one example gene
import matplotlib.pyplot as plt

sample_gene = gene_cols[0]  # pick first gene
gene_values = cleaned[sample_gene].to_numpy()

plt.hist(gene_values, bins=30)
plt.title(f"Distribution of Expression Values: {sample_gene}")
plt.xlabel("Expression (log1p)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()


üîπ Number of unique cell lines in single-cell: 37606
‚úÖ Number of matching cell line IDs: 0
üîó Sample matching IDs: []
‚ùå Cell lines in GDSC not found in single-cell: 978
