In [1]:
import warnings
import scanpy as sc
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import time
import sys
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from scipy.stats import pearsonr

import matplotlib.cm as cm
import matplotlib.colors as mcolors 
import time


sys.path.append('../3_DE_analysis/')
from DE_analysis_utils import *

pd.set_option('display.max_rows', 100)
sc.set_figure_params(figsize=(20, 4))

  from anndata import __version__ as anndata_version
  if Version(anndata.__version__) >= Version("0.11.0rc2"):
  if Version(anndata.__version__) >= Version("0.11.0rc2"):


In [3]:
def assess_complex_enrichment(df, complexes, cluster_label, gene_name_label):
    """
    Assess pathway/complex enrichment
    """
    de_genes_per_cluster = {}
    clusters = df[cluster_label].unique()
    for cluster in clusters:
        cluster_genes = df[df[cluster_label]==cluster][gene_name_label]
        de_genes_per_cluster[cluster] = set(cluster_genes)

    N = len(df)
    
    best_result = []
    all_result = []
    for cluster, de_genes in de_genes_per_cluster.items():
        n = len(de_genes)
        best_result_for_cluster = None # Initialize a variable to track the best result for this cluster
        for complex_name, subunits in complexes.items():
            K = len(subunits)
            # Find the overlap between DE genes and complex subunits
            overlap = de_genes.intersection(subunits)
            k = len(overlap)
            
            # Calculate the p-value
            if (n > 0 and K > 0) and (k > 1): # Avoid division by zero or trivial cases
                pval = hypergeom.sf(k - 1, N, K, n) # sf is survival function
                
                # Adjust for multiple testing (e.g., using Benjamini-Hochberg)
                # This is a crucial step to avoid false positives
                # Store raw p-values and correct later
                current_result = {
                    'cluster': cluster,
                    'complex': complex_name,
                    'overlap_genes': list(overlap),
                    'overlap_fraction': len(overlap)/n,
                    'raw_p_value': pval,
                    'complex_size': K,
                    'overlap_size': len(overlap),
                    'cluster_size': n
                }
                
                if (best_result_for_cluster is None) or (pval < best_result_for_cluster['raw_p_value']):
                    best_result_for_cluster = current_result
                if len(overlap) > 1:
                    all_result.append(current_result)
        
        if best_result_for_cluster is not None:
            if best_result_for_cluster['raw_p_value']<1:
                best_result.append(best_result_for_cluster)
    
    # Convert results to a DataFrame for easy viewing
    best_result_df = pd.DataFrame(best_result)
    #best_result_df = best_result_df.sort_values(by='cluster').reset_index(drop=True)
    all_result_df = pd.DataFrame(all_result)
    #all_result_df = all_result_df.sort_values(by='cluster').reset_index(drop=True)

    return all_result_df, best_result_df

### Check regulator enrichment

In [None]:
adata_de = sc.read_h5ad('../../../../3_expts/processed_data/CD4i_final//DE_results_all_confounders/CD4i_final.merged_DE_results_corrected.h5ad', backed='r')
adata_final = sc.read_h5ad('../../../../3_expts/processed_data/analysis_largefiles/nde50ntotal100_varfiltered_clustering_10232025.h5ad', backed='r')
de_by_guide = pd.read_csv('../3_DE_analysis/results/DE_by_guide.correlation_results.csv', index_col=0)
donor_robustness_summary = pd.read_csv('../3_DE_analysis/results/DE_donor_robustness_correlation_summary.csv', index_col=0)
de_summary_stats = pd.read_csv('../../../../3_expts/processed_data/CD4i_final/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv', index_col=0)

In [9]:
disease_association_df = pd.read_csv('disease_gene_associations_detailed.csv', index_col=0)
disease_association_df = disease_association_df[disease_association_df.gene_symbol.isin(de_summary_stats[(de_summary_stats.n_total_de_genes>50)&(de_summary_stats.n_cells_target>100)].target_name)]

# Parse into dictionary
disease_genes = {}
for disease_name in disease_association_df.disease_name.unique():
    disease_genes[disease_name] = set(disease_association_df[disease_association_df.disease_name==disease_name].gene_symbol)

# Run enrichment
disease_enrichment_df, _ = assess_complex_enrichment(adata_final.obs[['hdbscan', 'target_contrast_gene_name_corrected']],
                                              disease_genes, 'hdbscan', 'target_contrast_gene_name_corrected')

In [10]:
disease_enrichment_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
178,89,autoimmune disease,"[ARID1A, IL21R, FLI1, ERGIC1, STAT3, KLF2, IPM...",0.294118,1e-06,277,20,68
177,89,asthma,"[STAT3, KLF2, TBX21, IL21R, TET2, SCAF8, CDK2,...",0.147059,1.8e-05,88,10,68
242,54,psoriasis,"[PRKRA, PTEN, FOXO1]",0.75,2.5e-05,57,3,4
18,20,rheumatoid arthritis,"[RASGRP1, TM2D1, SMAD3, CACHD1, IL12RB2, NFKB1...",0.179487,6.2e-05,82,7,39
179,89,inflammatory bowel disease,"[ELF4, STAT3, BATF, IL21R, IPMK, BRD1, TET2, F...",0.161765,7.7e-05,125,11,68
151,6,autoimmune disease,"[TSPYL5, UBE2L3, MED13, SUPT20H, CD274, SGF29,...",0.305556,0.000226,277,11,36
14,20,autoimmune disease,"[C8orf33, RASGRP1, SLAMF7, TM2D1, SMAD3, CACHD...",0.282051,0.000492,277,11,39
40,93,autoimmune disease,"[MAU2, ARID2, B4GALNT3, RHBDF2, SP3, KLF2, BCL...",0.189655,0.000587,277,22,116
52,111,autoimmune disease,"[DMRTA2, BCL10, IKBKB, PDCD5, CD28, MAF, RNF8,...",0.163158,0.000753,277,31,190
231,16,asthma,"[GSTCD, INTS12]",1.0,0.000831,88,2,2


### Check downstream enrichment (didn't think of best way to get background set)

In [13]:
df_downstream = pd.read_csv('../../../../3_expts/processed_data/analysis_largefiles/nde50ntotal100_varfiltered_clustering_downstream_genes.csv', index_col=0)
df_downstream_top = df_downstream[(df_downstream['zscore_rank_negative_regulation']<50)|(df_downstream['zscore_rank_positive_regulation']<50)].copy()

In [16]:
disease_association_df = pd.read_csv('disease_gene_associations_detailed.csv', index_col=0)
disease_association_df = disease_association_df[disease_association_df.gene_symbol.isin(df_downstream_top.downstream_gene.unique())]

disease_genes = {}
for disease_name in disease_association_df.disease_name.unique():
    disease_genes[disease_name] = set(disease_association_df[disease_association_df.disease_name==disease_name].gene_symbol)

disease_enrichment_downstream_df, _ = assess_complex_enrichment(df_downstream_top, disease_genes, 'hdbscan_cluster', 'downstream_gene')

In [17]:
disease_enrichment_downstream_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
292,46,autoimmune disease,"[IL2RA, ATP8B2, RBPJ, ITGAM, CD109, TRAT1, ZFP...",0.255102,1.306959e-07,436,25,98
299,46,type 1 diabetes mellitus,"[SHC4, IL2RA, CTLA4, FASLG, MAPK1, CLINT1, PRK...",0.091837,3.031204e-06,68,9,98
359,2,autoimmune disease,"[PITPNM2, PTPN1, IL18RAP, TRIB2, KCTD12, TBC1D...",0.22449,7.475485e-06,436,22,98
405,26,autoimmune disease,"[TNFRSF14, CTSB, VEGFA, NFATC2, SOCS1, TGFBR3,...",0.22449,7.475485e-06,436,22,98
291,46,asthma,"[CYFIP2, IL2RA, POU2AF1, KLF2, AHI1, FASLG, NI...",0.122449,8.737764e-06,143,12,98
331,39,autoimmune disease,"[CDK6, AK4, IL7R, VEGFA, APEH, PDK1, DPP4, LEF...",0.222222,8.903875e-06,436,22,99
229,35,inflammatory bowel disease,"[HSPD1, PRKCQ, TNFSF8, ETS1, HNRNPAB, MBD2, RB...",0.142857,1.034793e-05,197,14,98
173,32,autoimmune disease,"[LSP1, CDCA7, PTEN, SELL, CD28, PLEC, HSPD1, A...",0.214286,2.570037e-05,436,21,98
507,85,autoimmune disease,"[IL2RA, IL7R, ALOX5AP, MCL1, HSPD1, KIF21B, LT...",0.214286,2.570037e-05,436,21,98
528,67,psoriasis,"[UBLCP1, ACTR2, IL4R, FOXO1, DDX39B, SP140L, C...",0.091837,2.60405e-05,88,9,98
