In [1]:
import warnings
import scanpy as sc
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import time
import sys
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from scipy.stats import pearsonr

import matplotlib.cm as cm
import matplotlib.colors as mcolors 
import time


# Make the sibling DE-analysis folder importable so its helper module can be loaded.
sys.path.append('../3_DE_analysis/')
# NOTE(review): the wildcard import hides which helpers this notebook relies on — consider importing the needed names explicitly.
from DE_analysis_utils import *

# Let pandas print up to 100 rows and give scanpy figures a wide default size.
pd.set_option('display.max_rows', 100)
sc.set_figure_params(figsize=(20, 4))

  from anndata import __version__ as anndata_version
  if Version(anndata.__version__) >= Version("0.11.0rc2"):
  if Version(anndata.__version__) >= Version("0.11.0rc2"):


In [2]:
# Clustered AnnData object, opened with backed='r'.
adata_final = sc.read_h5ad(
    '../../../../3_expts/processed_data/analysis_largefiles/nde30ntotal75_varfiltered_downstream_clustering_additional_filtering.h5ad',
    backed='r',
)

# Result tables written by the DE-analysis step; the first CSV column becomes the index.
de_by_guide = pd.read_csv(
    '../3_DE_analysis/results/DE_by_guide.correlation_results.csv',
    index_col=0,
)
donor_robustness_summary = pd.read_csv(
    '../3_DE_analysis/results/Stim8hr_debyguidegenes_donor_robustness_summary.csv',
    index_col=0,
)

In [3]:
# Summarise every HDBSCAN cluster: its label, the mean pairwise correlation between
# its members (rows of the 'zscore' layer), its size and its member gene names.
num_of_cluster = len(adata_final.obs['hdbscan'].unique())
cluster_name = []
corr = []
cluster_gene_size = []
cluster_member = []
for i, cl in enumerate(adata_final.obs['hdbscan'].unique()):
    # Slice the AnnData once per cluster instead of three times.
    cluster_adata = adata_final[adata_final.obs['hdbscan'] == cl]
    df = pd.DataFrame(data=cluster_adata.layers['zscore'],
                      index=cluster_adata.obs_names,
                      columns=adata_final.var_names)
    # Member-by-member correlation matrix.
    df_corr = df.T.corr()
    n_members = len(df)
    if n_members > 1:
        # Average only the off-diagonal entries. Zeroing the diagonal and averaging
        # all n*n entries would shrink the value by (n-1)/n, and np.mean() on a
        # DataFrame returns a Series or a scalar depending on the pandas version.
        off_diagonal = ~np.eye(n_members, dtype=bool)
        mean_corr = float(np.nanmean(df_corr.to_numpy()[off_diagonal]))
    else:
        # A single-member cluster has no pairwise correlation.
        mean_corr = np.nan
    cluster_name.append(int(cl))
    corr.append(mean_corr)
    cluster_gene_size.append(n_members)
    cluster_member.append(cluster_adata.obs.gene_name.tolist())

cluster_df = pd.DataFrame({'cluster': cluster_name,
                           'intracluster_corr': corr,
                           'cluster_gene_size': cluster_gene_size,
                           'cluster_member': cluster_member})

In [4]:
import gseapy
from gseapy import Msigdb
from gseapy import barplot, dotplot

# Fetch the KEGG (legacy) and Reactome canonical-pathway collections from the
# same MSigDB release, in that order.
msig = Msigdb()
kegg_gene_sets, reactome_gene_sets = (
    msig.get_gmt(category=category, dbver="2025.1.Hs")
    for category in ('c2.cp.kegg_legacy', 'c2.cp.reactome')
)

In [5]:
# CORUM complexes, indexed by complex id.
corum_df = pd.read_csv(
    '../../../../2_files/enrichment_analysis/CORUM/corum_humanComplexes.txt',
    sep='\t',
    index_col='complex_id',
)

# STRING v12: cluster membership, protein annotation and cluster annotation.
stringdb = pd.read_csv(
    '../../../../2_files/enrichment_analysis/STRINGDB/9606.clusters.proteins.v12.0.txt.gz',
    sep='\t',
    compression='gzip',
)
protein_info = pd.read_csv(
    '../../../../2_files/enrichment_analysis/STRINGDB/9606.protein.info.v12.0.txt.gz',
    sep='\t',
    compression='gzip',
)
cluster_info = pd.read_csv(
    '../../../../2_files/enrichment_analysis/STRINGDB/9606.clusters.info.v12.0.txt.gz',
    sep='\t',
    compression='gzip',
)

# Attach protein and cluster annotation to every membership row, then keep only
# clusters with fewer than 1000 members.
stringdb_df = (
    stringdb
    .merge(protein_info, left_on='protein_id', right_on='#string_protein_id')
    .merge(cluster_info, left_on='cluster_id', right_on='cluster_id')
    .loc[lambda frame: frame['cluster_size'] < 1000]
    .copy()
)

# Zhang et al. 2025 predictions; the first 28 lines of the file are skipped and the
# next line is used as the header.
zhang_df = pd.read_csv(
    '../../../../2_files/enrichment_analysis/Zhang_et_al_2025/final_predictions_80.tsv',
    sep='\t',
    skiprows=28,
    header=0,
)

# CORUM: complex name -> set of subunit gene symbols (';'-separated in the table).
# Rows without a subunit string are skipped instead of crashing on `.split`.
# NOTE(review): complexes that share a name overwrite each other (last one wins),
# exactly as in the row-by-row loop this replaces.
corum_complexes = {
    complex_name: set(subunits.split(';'))
    for complex_name, subunits in zip(corum_df['complex_name'], corum_df['subunits_gene_name'])
    if isinstance(subunits, str)
}

# STRING: cluster id -> set of preferred gene names. A single groupby pass replaces
# one full-frame boolean scan per cluster id; sort=False keeps first-appearance order.
stringdb_complexes = {
    cluster_id: set(members)
    for cluster_id, members in stringdb_df.groupby('cluster_id', sort=False)['preferred_name']
}

# Zhang et al.: gene -> that gene together with every partner it is paired with.
# Built in one pass over the pair table. Keys follow row order, so the dictionary
# order (and therefore tie-breaking between equally good hits downstream) no longer
# depends on string hashing. Missing names never become keys.
zhang_complexes = {}
for name1, name2 in zip(zhang_df['Name1'], zhang_df['Name2']):
    pair = {name1, name2}
    for gene in (name1, name2):
        if pd.notna(gene):
            zhang_complexes.setdefault(gene, set()).update(pair)

# Zhang et al.: one two-member set per predicted pair, named after the row label.
zhang_pairwise_complexes = {
    'complex ' + str(row_label): {name1, name2}
    for row_label, name1, name2 in zip(zhang_df.index, zhang_df['Name1'], zhang_df['Name2'])
}

In [8]:
def assess_complex_enrichment(df, complexes, cluster_label, gene_name_label, background=None):
    """
    Test every cluster of genes for over-representation of every gene set.

    For each (cluster, gene set) pair sharing at least two genes, a one-sided
    hypergeometric p-value ``P(X >= overlap)`` is computed, where

    * the population is the gene universe (see ``background``),
    * the number of successes is the number of gene-set members that lie
      inside that universe (members outside it can never be drawn, so counting
      them would make the test invalid or overly conservative),
    * the number of draws is the cluster size.

    The p-values are raw; no multiple-testing correction is applied here.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per gene, with a cluster column and a gene-name column.
    complexes : dict
        Maps a gene-set name to an iterable of gene names.
    cluster_label : str
        Name of the column in ``df`` holding the cluster assignment.
    gene_name_label : str
        Name of the column in ``df`` holding the gene names.
    background : iterable, optional
        Additional genes that make up the tested population. The universe is
        the union of ``background`` and all genes in ``df``. When omitted,
        the universe is the set of unique genes in ``df``.

    Returns
    -------
    (all_result_df, best_result_df) : tuple of pandas.DataFrame
        ``all_result_df`` has one row per tested (cluster, gene set) pair.
        ``best_result_df`` has at most one row per cluster: the pair with the
        smallest raw p-value (first one wins on ties), kept only if that
        p-value is below 1. Both frames always carry the columns ``cluster``,
        ``complex``, ``overlap_genes``, ``overlap_fraction``, ``raw_p_value``,
        ``complex_size`` (size of the gene set as supplied), ``overlap_size``
        and ``cluster_size``, even when they are empty.
    """
    result_columns = ['cluster', 'complex', 'overlap_genes', 'overlap_fraction',
                      'raw_p_value', 'complex_size', 'overlap_size', 'cluster_size']

    # cluster -> set of member genes, in order of first appearance; rows with a
    # missing cluster label or gene name are ignored.
    genes_per_cluster = {}
    for cluster, gene in zip(df[cluster_label], df[gene_name_label]):
        if pd.isna(cluster) or pd.isna(gene):
            continue
        genes_per_cluster.setdefault(cluster, set()).add(gene)

    # Population of the hypergeometric test: unique genes, not rows.
    universe = set().union(*genes_per_cluster.values())
    if background is not None:
        universe |= {gene for gene in background if not pd.isna(gene)}
    N = len(universe)

    # Restrict each gene set to the universe once, up front. Sets with fewer than
    # two members in the universe can never reach the minimum overlap of two.
    testable_sets = []
    for complex_name, subunits in complexes.items():
        members_in_universe = universe.intersection(subunits)
        if len(members_in_universe) > 1:
            testable_sets.append((complex_name, len(subunits), members_in_universe))

    best_result = []
    all_result = []
    for cluster, cluster_genes in genes_per_cluster.items():
        n = len(cluster_genes)
        best_result_for_cluster = None
        for complex_name, complex_size, members_in_universe in testable_sets:
            overlap = cluster_genes & members_in_universe
            k = len(overlap)
            if k < 2:
                # A single shared gene is treated as a trivial hit and not reported.
                continue

            K = len(members_in_universe)
            # Survival function at k - 1 gives P(X >= k).
            pval = float(hypergeom.sf(k - 1, N, K, n))

            current_result = {
                'cluster': cluster,
                'complex': complex_name,
                'overlap_genes': sorted(overlap, key=str),
                'overlap_fraction': k / n,
                'raw_p_value': pval,
                'complex_size': complex_size,
                'overlap_size': k,
                'cluster_size': n
            }
            all_result.append(current_result)

            if (best_result_for_cluster is None) or (pval < best_result_for_cluster['raw_p_value']):
                best_result_for_cluster = current_result

        if best_result_for_cluster is not None and best_result_for_cluster['raw_p_value'] < 1:
            best_result.append(best_result_for_cluster)

    # Explicit columns keep the schema stable when nothing was found, so callers
    # can still merge on 'cluster'.
    best_result_df = pd.DataFrame(best_result, columns=result_columns)
    all_result_df = pd.DataFrame(all_result, columns=result_columns)

    return all_result_df, best_result_df

In [11]:
def run_enrichment_analysis(df, cluster_label, gene_name_label):
    """
    Run the enrichment test against every reference collection and merge the results.

    The clusters in ``df`` are tested against STRING clusters, CORUM complexes,
    KEGG and Reactome pathways, the Zhang et al. 2025 neighbourhood sets and
    pair sets, and finally against each other. All collections, ``stringdb_df``
    and ``cluster_df`` are read from module-level variables.

    Returns a 7-tuple: the per-cluster summary table (best hit per collection,
    intra-cluster correlation and size from ``cluster_df``, cluster members and
    related clusters), followed by the full hit tables for CORUM, STRING, KEGG,
    Reactome, Zhang and Zhang pairwise, in that order.
    """
    def enrich(gene_sets):
        # Every collection is tested on the same frame with the same column labels.
        return assess_complex_enrichment(df, gene_sets, cluster_label, gene_name_label)

    def with_stringdb_description(hits):
        # Add the 'best_described_by' text of the matching STRING cluster.
        described = pd.merge(
            hits,
            stringdb_df[['cluster_id', 'best_described_by']].drop_duplicates(),
            left_on='complex',
            right_on='cluster_id',
        )
        return described.drop(columns=['cluster_id'])

    # One call per reference collection: (all hits, best hit per cluster).
    stringdb_enrichment_all, stringdb_enrichment_best = map(
        with_stringdb_description, enrich(stringdb_complexes)
    )
    corum_enrichment_all, corum_enrichment_best = enrich(corum_complexes)
    kegg_enrichment_all, kegg_enrichment_best = enrich(kegg_gene_sets)
    reactome_enrichment_all, reactome_enrichment_best = enrich(reactome_gene_sets)
    zhang_enrichment_all, zhang_enrichment_best = enrich(zhang_complexes)
    zhang_pairwise_enrichment_all, zhang_pairwise_enrichment_best = enrich(zhang_pairwise_complexes)

    # Cluster -> member genes, both as a dict (used as a gene-set collection below)
    # and as a one-column table for the summary.
    gene_grouping = {
        cluster: set(df.loc[df[cluster_label] == cluster, gene_name_label])
        for cluster in df[cluster_label].unique()
    }
    gene_grouping_df = (
        pd.DataFrame({cluster: [members] for cluster, members in gene_grouping.items()})
        .T
        .rename(columns={0: 'cluster_member'})
    )
    gene_grouping_df['cluster'] = gene_grouping_df.index

    # Cluster-vs-cluster overlap: drop self-comparisons, then keep pairs with
    # raw p < 0.01 that share at least 20% of the cluster's genes.
    intercluster_enrichment_all, _ = enrich(gene_grouping)
    is_related = (
        (intercluster_enrichment_all.cluster != intercluster_enrichment_all.complex)
        & (intercluster_enrichment_all.raw_p_value < 1e-2)
        & (intercluster_enrichment_all.overlap_fraction >= 0.2)
    )
    intercluster_enrichment_all = intercluster_enrichment_all[is_related].copy()
    intercluster_enrichment_summary = {
        cluster: set(
            intercluster_enrichment_all.loc[intercluster_enrichment_all.cluster == cluster, 'complex']
        )
        for cluster in intercluster_enrichment_all.cluster.unique()
    }
    intercluster_enrichment_summary_df = pd.DataFrame(
        intercluster_enrichment_summary.items(), columns=['cluster', 'related_cluster']
    )

    # Pair up the best-hit tables so overlapping column names get a per-collection suffix.
    enrichment_df1 = pd.merge(corum_enrichment_best, stringdb_enrichment_best,
                              on='cluster', how='outer', suffixes=('_corum', '_stringdb'))
    enrichment_df2 = pd.merge(kegg_enrichment_best, reactome_enrichment_best,
                              on='cluster', how='outer', suffixes=('_kegg', '_reactome'))
    enrichment_df3 = pd.merge(zhang_enrichment_best, zhang_pairwise_enrichment_best,
                              on='cluster', how='outer', suffixes=('_zhang', '_zhangpairwise'))

    # Outer-join everything on the cluster label.
    enrichment_df = enrichment_df1
    for extra_table in (
        enrichment_df2,
        enrichment_df3,
        cluster_df[['cluster', 'intracluster_corr', 'cluster_gene_size']],
        gene_grouping_df,
        intercluster_enrichment_summary_df,
    ):
        enrichment_df = pd.merge(enrichment_df, extra_table, on='cluster', how='outer')

    return (
        enrichment_df,
        corum_enrichment_all,
        stringdb_enrichment_all,
        kegg_enrichment_all,
        reactome_enrichment_all,
        zhang_enrichment_all,
        zhang_pairwise_enrichment_all,
    )

### Check downstream cluster enrichment

In [12]:
# Enrichment for the HDBSCAN clusters: one summary table plus the full hit table
# of each reference collection.
(
    enrichment_downstream_df,
    corum_enrichment_downstream_all,
    stringdb_enrichment_downstream_all,
    kegg_enrichment_downstream_all,
    reactome_enrichment_downstream_all,
    zhang_enrichment_downstream_all,
    zhang_pairwise_enrichment_downstream_all,
) = run_enrichment_analysis(adata_final.obs[['hdbscan', 'gene_name']], 'hdbscan', 'gene_name')

In [13]:
# Write every result table twice: as Parquet and as CSV, under the same file stem.
downstream_outputs = [
    ('results/clustering_nde30ntotal75_downstream_cluster', enrichment_downstream_df),
    ('results/corum_enrichment_nde30ntotal75_downstream_cluster', corum_enrichment_downstream_all),
    ('results/stringdb_enrichment_nde30ntotal75_downstream_cluster', stringdb_enrichment_downstream_all),
    ('results/kegg_enrichment_nde30ntotal75_downstream_cluster', kegg_enrichment_downstream_all),
    ('results/reactome_enrichment_nde30ntotal75_downstream_cluster', reactome_enrichment_downstream_all),
    ('results/zhang_enrichment_nde30ntotal75_downstream_cluster', zhang_enrichment_downstream_all),
    ('results/zhang_pairwise_enrichment_nde30ntotal75_downstream_cluster', zhang_pairwise_enrichment_downstream_all),
]
for output_stem, result_table in downstream_outputs:
    result_table.to_parquet(output_stem + '.parquet')
    result_table.to_csv(output_stem + '.csv')

In [20]:
# Hand-written list of 64 gene names, used below as the members of one ad-hoc cluster.
test = ['AASDH','ADAT3','ARL6IP1','ATRX','C1D','CALCOCO2','CAMSAP1','CD2','CD247','CD28',
        'COLGALT1','COPS9','CREBBP','CYB5R4','DNAJC8','DOLPP1','EIF1','EIF4G2','ELOB','EWSR1',
        'FOXM1','GNE','GPD1L','GPI','GSS','HELT','HTT','ITK','KAT2B','KIF14','KMT5B','KNOP1',
        'L2HGDH','LRP5','MALT1','MEN1','METTL23','MTX2','NDFIP2','NUDCD1','NUP188','NUP37','PCNX3',
        'PEX5','PHF23','PLCG1','PPHLN1','RNF8','SENP1','SHOC2','SIK3','SOCS4','STAT5A','SZRD1',
        'TEAD4','TMEM263','TMX1','TNRC6A','TXNDC9','UBIAD1','XRRA1','ZC3H10','ZNF384','ZNF837']

In [21]:
# Same genes as `test` with CD2, CD247, CD28, CREBBP, ITK, MALT1, PLCG1, SENP1, SOCS4 and STAT5A removed.
# NOTE(review): this list is not used by any cell shown here — confirm it is still needed.
test_filtered = ['AASDH','ADAT3','ARL6IP1','ATRX','C1D','CALCOCO2','CAMSAP1','COLGALT1','COPS9','CYB5R4','DNAJC8','DOLPP1','EIF1','EIF4G2','ELOB','EWSR1',
        'FOXM1','GNE','GPD1L','GPI','GSS','HELT','HTT','KAT2B','KIF14','KMT5B','KNOP1','L2HGDH','LRP5','MEN1','METTL23','MTX2','NDFIP2','NUDCD1','NUP188','NUP37','PCNX3',
        'PEX5','PHF23','PPHLN1','RNF8','SHOC2','SIK3','SZRD1','TEAD4','TMEM263','TMX1','TNRC6A','TXNDC9','UBIAD1','XRRA1','ZC3H10','ZNF384','ZNF837']

In [37]:
# Wrap the `test` genes in a two-column frame in which every gene carries the cluster label 109,
# i.e. the layout assess_complex_enrichment expects.
test_df = pd.DataFrame({'cluster':[109]*len(test), 'gene': test})

In [None]:
# Show the last 100 rows of the full hit table (first returned frame) for the test genes against Reactome.
assess_complex_enrichment(test_df, reactome_gene_sets, 'cluster', 'gene')[0].tail(100)

In [43]:
# Full hit table for the test genes against the STRING clusters; the best-hit frame is discarded.
test_stringdb_df, _ = assess_complex_enrichment(test_df, stringdb_complexes, 'cluster', 'gene')

In [46]:
# The 50 STRING hits with the smallest raw p-value.
# NOTE(review): test_df contains a single cluster, so the population size used by
# assess_complex_enrichment (N = len(df)) equals the cluster size and every raw
# p-value comes out as 1.0 — the ranking below is therefore not informative.
test_stringdb_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
26,109,CL:17472,"[SOCS4, ITK, PLCG1]",0.046875,1.0,56,3,64
27,109,CL:17470,"[SOCS4, ITK, PLCG1]",0.046875,1.0,62,3,64
29,109,CL:17473,"[SOCS4, PLCG1]",0.03125,1.0,33,2,64
67,109,CL:6606,"[FOXM1, KIF14]",0.03125,1.0,61,2,64
68,109,CL:6608,"[FOXM1, KIF14]",0.03125,1.0,51,2,64
99,109,CL:2582,"[NUP188, NUP37]",0.03125,1.0,61,2,64
100,109,CL:2583,"[NUP188, NUP37]",0.03125,1.0,42,2,64
101,109,CL:2587,"[NUP188, NUP37]",0.03125,1.0,27,2,64
102,109,CL:2584,"[NUP188, NUP37]",0.03125,1.0,34,2,64
103,109,CL:2589,"[NUP188, NUP37]",0.03125,1.0,18,2,64


In [52]:
# Full hit table for the test genes against the Zhang et al. neighbourhood sets.
# NOTE(review): the stored output names the sets 'complex N', while the visible definition of
# zhang_complexes keys them by gene name — the output looks stale; rerun to confirm.
assess_complex_enrichment(test_df, zhang_complexes, 'cluster', 'gene')[0]

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
0,109,complex 1000,"[CD2, SZRD1]",0.03125,1.0,9,2,64
1,109,complex 1200,"[DNAJC8, ATRX]",0.03125,1.0,17,2,64
2,109,complex 1271,"[CD247, CD2]",0.03125,1.0,14,2,64
3,109,complex 4819,"[CD28, CD2]",0.03125,1.0,7,2,64
4,109,complex 5031,"[SHOC2, ATRX]",0.03125,1.0,37,2,64
5,109,complex 5061,"[CD247, CD28]",0.03125,1.0,7,2,64
6,109,complex 7365,"[CD247, CD28]",0.03125,1.0,9,2,64
7,109,complex 11778,"[RNF8, ATRX]",0.03125,1.0,12,2,64
8,109,complex 12242,"[CD28, CD2]",0.03125,1.0,5,2,64
9,109,complex 14897,"[CD28, CD2]",0.03125,1.0,2,2,64


In [53]:
# Full hit table for the test genes against the two-member Zhang et al. pair sets.
assess_complex_enrichment(test_df, zhang_pairwise_complexes, 'cluster', 'gene')[0]

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
0,109,complex 11465,"[CD247, CD28]",0.03125,1.0,2,2,64
1,109,complex 20678,"[CD28, CD2]",0.03125,1.0,2,2,64


In [56]:
# zhang_complexes is keyed by gene name, so look the set up by its hub gene.
# The set shown in the stored output is CD2BP2 together with all of its predicted partners.
zhang_complexes['CD2BP2']

{'CD2', 'CD2BP2', 'EAPP', 'FEM1B', 'MAU2', 'NAA30', 'RHOT2', 'SF3B4', 'SZRD1'}

In [57]:
# Predicted pairs in which CD2BP2 appears in the Name1 column.
zhang_df[zhang_df.Name1=='CD2BP2']

Unnamed: 0,Protein1,Protein2,Name1,Name2,RFprob,AFprob,CFprob,AFMprob,Source,PDBtemp,...,Locality2,Disease1,Disease2,Process1,Process2,Function1,Function2,Exact_templates,Ortho_templates,Homo_templates
538,O95400,Q15427,CD2BP2,SF3B4,0.5083,0.007206,0.008575,0.6426,P,none,...,"Nucleus,Spliceosome",none,none,"mRNA processing,mRNA splicing","mRNA processing,mRNA splicing",CD2 antigen cytoplasmic tail-binding protein 2,Splicing factor 3B subunit 4,none,none,none
10981,O95400,Q56P03,CD2BP2,EAPP,0.1026,0.897,0.9585,0.4573,P,none,...,"Cytoplasm,Nucleus",none,none,"mRNA processing,mRNA splicing",none,CD2 antigen cytoplasmic tail-binding protein 2,E2F-associated phosphoprotein,none,none,none
14297,O95400,Q7Z422,CD2BP2,SZRD1,0.4846,0.8853,0.7905,0.2507,S,none,...,none,none,none,"mRNA processing,mRNA splicing",none,CD2 antigen cytoplasmic tail-binding protein 2,SUZ domain-containing protein 1,none,none,none
14337,O95400,Q9UK73,CD2BP2,FEM1B,0.3196,0.8057,0.9575,0.993,S,none,...,"Cytoplasm,Nucleus",none,none,"mRNA processing,mRNA splicing","Apoptosis,Ubl conjugation pathway",CD2 antigen cytoplasmic tail-binding protein 2,Protein fem-1 homolog B,none,none,none
19657,O95400,Q9Y6X3,CD2BP2,MAU2,0.8374,0.7173,0.8613,0.6807,S,none,...,"Chromosome,Nucleus",none,none,"mRNA processing,mRNA splicing","Cell cycle,Cell division,Chromosome partition,...",CD2 antigen cytoplasmic tail-binding protein 2,MAU2 chromatid cohesion factor homolog,none,none,none
28058,O95400,Q8IXI1,CD2BP2,RHOT2,0.3833,0.8477,0.9697,0.747,S,none,...,"Membrane,Mitochondrion,Mitochondrion outer mem...",none,none,"mRNA processing,mRNA splicing",none,CD2 antigen cytoplasmic tail-binding protein 2,Mitochondrial Rho GTPase 2,none,none,none
28545,O95400,P06729,CD2BP2,CD2,0.11505,0.565,0.744,0.6523,P,exact,...,"Cell membrane,Membrane",none,none,"mRNA processing,mRNA splicing",Cell adhesion,CD2 antigen cytoplasmic tail-binding protein 2,T-cell surface antigen CD2,1l2z,none,none
28832,O95400,Q147X3,CD2BP2,NAA30,0.5933,0.046,0.0909,0.9536,P,none,...,"Cytoplasm,Nucleus",none,none,"mRNA processing,mRNA splicing",none,CD2 antigen cytoplasmic tail-binding protein 2,N-alpha-acetyltransferase 30,none,none,none


In [55]:
# Predicted pairs in which SZRD1 appears in the Name2 column.
zhang_df[zhang_df.Name2=='SZRD1']

Unnamed: 0,Protein1,Protein2,Name1,Name2,RFprob,AFprob,CFprob,AFMprob,Source,PDBtemp,...,Locality2,Disease1,Disease2,Process1,Process2,Function1,Function2,Exact_templates,Ortho_templates,Homo_templates
4940,P06733,Q7Z422,ENO1,SZRD1,0.6187,0.06683,0.0551,0.6416,P,none,...,none,none,none,"Glycolysis,Plasminogen activation,Transcriptio...",none,Alpha-enolase,SUZ domain-containing protein 1,none,none,none
14297,O95400,Q7Z422,CD2BP2,SZRD1,0.4846,0.8853,0.7905,0.2507,S,none,...,none,none,none,"mRNA processing,mRNA splicing",none,CD2 antigen cytoplasmic tail-binding protein 2,SUZ domain-containing protein 1,none,none,none
