In [None]:
import os
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA


In [2]:
# Create a directory to store statistics and plots
output_dir = "statistics"
os.makedirs(output_dir, exist_ok=True)


In [None]:
gdsc_bulk = pl.read_parquet("gdsc/gdsc_final_cleaned.parquet", usecols=["SANGER_MODEL_ID", "DRUG_ID", "LN_IC50"])
sc_data = pl.read_parquet("sc_data/rnaseq_all_data.parquet", usecols=["model_id", "gene_symbol", "fpkm"])
final_data = pl.read_parquet("pseudo_bulk/gdsc_single_cell_aligned.parquet")


In [None]:
# Statistics
stats_dict = {
    "Total Cell Lines (GDSC)": gdsc_bulk["SANGER_MODEL_ID"].nunique(),
    "Total Unique Drugs (GDSC)": gdsc_bulk["DRUG_ID"].nunique(),
    "Total (Cell Line, Drug) Pairs": gdsc_bulk.shape[0],
    "Total Unique Genes in Single-Cell": sc_data["gene_symbol"].nunique(),
    "Total Unique Cell Lines in Single-Cell": sc_data["model_id"].nunique(),
    "Total Genes After HVG Selection": final_data.shape[1] - 3,
    "Total Cell Lines in Final Dataset": final_data["SANGER_MODEL_ID"].nunique(),
    "Total Unique Drugs in Final Dataset": final_data["DRUG_ID"].nunique(),
    "Total (Cell Line, Drug) Pairs in Final Dataset": final_data.shape[0],
}

stats_df = pl.DataFrame(stats_dict.items(), columns=["Metric", "Value"])
stats_df.to_csv(f"{output_dir}/dataset_statistics.csv", index=False)
print("📂 Dataset statistics saved to 'statistics/dataset_statistics.csv' 🎉")


📂 Dataset statistics saved to 'statistics/dataset_statistics.csv' 🎉


In [5]:
plt.figure(figsize=(8, 5))
sns.histplot(final_data["LN_IC50"], bins=50, kde=True, color="blue")
plt.xlabel("Log IC50")
plt.ylabel("Frequency")
plt.title("Distribution of Log IC50 Values")
plt.grid()
plt.savefig(f"{output_dir}/ic50_distribution.png")
plt.close()


In [None]:
gene_columns = final_data.columns[3:]
correlations = final_data[gene_columns].corrwith(final_data["LN_IC50"]).sort_values()

correlations_df = pl.DataFrame({
    "Top Positively Correlated Genes": correlations.tail(10).index.tolist(),
    "Top Negatively Correlated Genes": correlations.head(10).index.tolist()
})
correlations_df.to_csv(f"{output_dir}/gene_ic50_correlations.csv", index=False)


In [7]:
drug_counts = final_data.groupby("DRUG_ID")["SANGER_MODEL_ID"].nunique().sort_values(ascending=False)

plt.figure(figsize=(10, 5))
sns.histplot(drug_counts, bins=50, kde=True, color="green")
plt.xlabel("Number of Cell Lines per Drug")
plt.ylabel("Frequency")
plt.title("Distribution of Cell Line Testing Per Drug")
plt.grid()
plt.savefig(f"{output_dir}/cell_lines_per_drug.png")
plt.close()


In [None]:
gene_std = final_data.iloc[:, 3:].std().sort_values(ascending=False)
gene_std_df = pl.DataFrame({"Gene": gene_std.index, "Standard Deviation": gene_std.values})
gene_std_df.to_csv(f"{output_dir}/most_variable_genes.csv", index=False)


In [13]:
# Extract gene expression matrix (assuming genes start at column index 3)
gene_data = final_data.iloc[:, 3:]

# Keep only numeric columns
numeric_data = gene_data.select_dtypes(include=[np.number])

# Drop columns and rows with too many NaNs
numeric_data = numeric_data.dropna(axis=1, how='all')
numeric_data = numeric_data.dropna(axis=0)

# Safety check
if numeric_data.empty:
    raise ValueError("❌ No valid numeric gene expression data found for PCA.")

# PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(numeric_data)

# Plot
plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5, color="purple")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.title("PCA of Gene Expression")
plt.grid()
plt.savefig(f"{output_dir}/pca_cell_line_clustering.png")
plt.show()


ValueError: ❌ No valid numeric gene expression data found for PCA.

print("\n🎯 Dataset analysis completed! The following files have been saved:")
print(f"- 📊 Dataset statistics: {output_dir}/dataset_statistics.csv")
print(f"- 🔬 Gene-IC50 correlations: {output_dir}/gene_ic50_correlations.csv")
print(f"- 🧬 Most variable genes: {output_dir}/most_variable_genes.csv")
print(f"- 📈 IC50 Distribution Plot: {output_dir}/ic50_distribution.png")
print(f"- 🔥 Drug Testing Frequency Plot: {output_dir}/cell_lines_per_drug.png")
print(f"- 🎨 PCA Clustering Plot: {output_dir}/pca_cell_line_clustering.png")
