In [1]:
import warnings
import scanpy as sc
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import time
import sys
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from scipy.stats import pearsonr

import matplotlib.cm as cm
import matplotlib.colors as mcolors 
import time


sys.path.append('../3_DE_analysis/')
from DE_analysis_utils import *

pd.set_option('display.max_rows', 100)
sc.set_figure_params(figsize=(20, 4))

  from anndata import __version__ as anndata_version
  if Version(anndata.__version__) >= Version("0.11.0rc2"):
  if Version(anndata.__version__) >= Version("0.11.0rc2"):


In [2]:
def assess_complex_enrichment(df, complexes, cluster_label, gene_name_label):
    """
    Assess pathway/complex enrichment
    """
    de_genes_per_cluster = {}
    clusters = df[cluster_label].unique()
    for cluster in clusters:
        cluster_genes = df[df[cluster_label]==cluster][gene_name_label]
        de_genes_per_cluster[cluster] = set(cluster_genes)

    N = len(df[gene_name_label].unique())
    
    best_result = []
    all_result = []
    for cluster, de_genes in de_genes_per_cluster.items():
        n = len(de_genes)
        best_result_for_cluster = None # Initialize a variable to track the best result for this cluster
        for complex_name, subunits in complexes.items():
            K = len(subunits)
            # Find the overlap between DE genes and complex subunits
            overlap = de_genes.intersection(subunits)
            k = len(overlap)
            
            # Calculate the p-value
            if (n > 0 and K > 0) and (k > 1): # Avoid division by zero or trivial cases
                pval = hypergeom.sf(k - 1, N, K, n) # sf is survival function
                
                # Adjust for multiple testing (e.g., using Benjamini-Hochberg)
                # This is a crucial step to avoid false positives
                # Store raw p-values and correct later
                current_result = {
                    'cluster': cluster,
                    'complex': complex_name,
                    'overlap_genes': list(overlap),
                    'overlap_fraction': len(overlap)/n,
                    'raw_p_value': pval,
                    'complex_size': K,
                    'overlap_size': len(overlap),
                    'cluster_size': n
                }
                
                if (best_result_for_cluster is None) or (pval < best_result_for_cluster['raw_p_value']):
                    best_result_for_cluster = current_result
                if len(overlap) > 1:
                    all_result.append(current_result)
        
        if best_result_for_cluster is not None:
            if best_result_for_cluster['raw_p_value']<1:
                best_result.append(best_result_for_cluster)
    
    # Convert results to a DataFrame for easy viewing
    best_result_df = pd.DataFrame(best_result)
    #best_result_df = best_result_df.sort_values(by='cluster').reset_index(drop=True)
    all_result_df = pd.DataFrame(all_result)
    #all_result_df = all_result_df.sort_values(by='cluster').reset_index(drop=True)

    return all_result_df, best_result_df

### Check regulator enrichment

In [3]:
#adata_de = sc.read_h5ad('../../../../3_expts/processed_data/CD4i_final//DE_results_all_confounders/CD4i_final.merged_DE_results_corrected.h5ad', backed='r')
adata_final = sc.read_h5ad('../../../../3_expts/processed_data/analysis_largefiles/nde50ntotal50_varfiltered_simple_clustering.h5ad', backed='r')
de_by_guide = pd.read_csv('../3_DE_analysis/results/DE_by_guide.correlation_results.csv', index_col=0)
donor_robustness_summary = pd.read_csv('../3_DE_analysis/results/DE_donor_robustness_correlation_summary.csv', index_col=0)
de_summary_stats = pd.read_csv('../../../../3_expts/processed_data/CD4i_final/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv', index_col=0)

In [17]:
adata_final = sc.read_h5ad('../../../../3_expts/processed_data/analysis_largefiles/nde50ntotal100_varfiltered_simple_clustering.h5ad', backed='r')

In [6]:
disease_association_df = pd.read_csv('disease_gene_associations_detailed.csv', index_col=0)
disease_association_df = disease_association_df[disease_association_df.gene_symbol.isin(de_summary_stats[(de_summary_stats.n_total_de_genes>50)&(de_summary_stats.n_cells_target>50)].target_name_corrected)]

# Parse into dictionary
disease_genes = {}
for disease_name in disease_association_df.disease_name.unique():
    disease_genes[disease_name] = set(disease_association_df[disease_association_df.disease_name==disease_name].gene_symbol)

# Run enrichment
disease_enrichment_df, _ = assess_complex_enrichment(adata_final.obs[['hdbscan', 'target_contrast_gene_name_corrected']],
                                              disease_genes, 'hdbscan', 'target_contrast_gene_name_corrected')

In [16]:
disease_enrichment_df.loc[60, 'overlap_genes']

['IRF4',
 'CD247',
 'CD28',
 'PTPRC',
 'PDCD5',
 'HTT',
 'SIK3',
 'PRKCQ',
 'TBX21',
 'ITK',
 'TESPA1',
 'RAD51B',
 'FOXO1',
 'ICE2',
 'STAT5A',
 'COG6']

In [20]:
disease_enrichment_df.loc[56, 'overlap_genes']

['IRF4',
 'CD247',
 'CD28',
 'PTPRC',
 'STAT5B',
 'PDCD5',
 'STAT6',
 'IL4R',
 'HTT',
 'SIK3',
 'PRKCQ',
 'TBX21',
 'ITK',
 'RAD51B',
 'FOXO1',
 'ICE2',
 'STAT5A',
 'COG6']

In [9]:
disease_enrichment_df[disease_enrichment_df.cluster==99]

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
167,90,Crohn's disease,"[CD28, PTPRC, RAVER1, ZAP70, FOXO1, COG6]",0.0375,0.334476,49,6,160
168,90,Hashimoto's thyroiditis,"[IRF4, CD247, PTPRC, TBX21, PRKCQ, TBCA, CREBB...",0.0625,0.0007,33,10,160
169,90,ankylosing spondylitis,"[CD28, USP24, CD2, FOXO1, RNF8, MAF]",0.0375,0.10275,34,6,160
170,90,asthma,"[IRF4, CD247, CD28, PTPRC, PDCD5, HTT, SIK3, P...",0.08125,0.069623,87,13,160
171,90,autoimmune disease,"[IRF4, CD247, IKBKB, TPGS2, BCL10, RNF8, MEN1,...",0.19375,0.035379,235,31,160
172,90,celiac disease,"[IRF4, PRKCQ, CD28, RTRAF]",0.025,0.073794,17,4,160
173,90,inflammatory bowel disease,"[IRF4, CD28, PTPRC, MAX, TPGS2, RAVER1, PRKCQ,...",0.09375,0.069167,104,15,160
174,90,multiple sclerosis,"[CD28, PTPRC, BCL10, MALT1, MAF]",0.03125,0.30325,38,5,160
175,90,psoriasis,"[IRF4, FIGNL1, CD28, CD2, FOXO1]",0.03125,0.463119,46,5,160
176,90,rheumatoid arthritis,"[IRF4, CD28, PTPRC, USP24, CD2, RAVER1, PRKCQ,...",0.08125,0.010898,68,13,160


In [7]:
disease_enrichment_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
196,54,autoimmune disease,"[FLI1, CD2, HDAC7, IMMP1L, BRD1, IPMK, STAT5B,...",0.304878,6.1e-05,301,25,82
195,54,asthma,"[TBX21, FLI1, STAT3, PPA2, SCAF8, KLF2, BCL3, ...",0.146341,0.000331,102,12,82
20,55,rheumatoid arthritis,"[TM2D1, PLA2G4A, SMAD3, CACHD1, ICOS, IL12RB2,...",0.184211,0.000704,89,7,38
199,54,psoriasis,"[LRRC25, FLI1, STAT3, RNF145, IPMK, CD2, HDAC7...",0.097561,0.001241,58,8,82
197,54,inflammatory bowel disease,"[STAT3, FLI1, BRD1, ACTB, ELF4, IPMK, HDAC7, E...",0.158537,0.001511,137,13,82
277,23,asthma,"[GSTCD, INTS12]",1.0,0.002205,102,2,2
176,82,ulcerative colitis,"[STAT3, IPMK, WAPL, IRF1, CD2BP2]",0.142857,0.003633,66,5,35
132,43,rheumatoid arthritis,"[CD244, SMAD3, CACHD1, IL12RB2, RASGRP1]",0.178571,0.004901,89,5,28
285,80,rheumatoid arthritis,"[METRN, NPEPPS, MAU2, BRD1]",0.222222,0.005273,89,4,18
283,80,ankylosing spondylitis,"[METRN, NPEPPS, BRD1]",0.166667,0.005528,45,3,18


In [12]:
disease_enrichment_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
176,85,autoimmune disease,"[IRF4, FAM20B, BATF, STAT3, ARID1A, RASA2, ELF...",0.30137,0.00026,282,22,73
175,85,asthma,"[IRF4, CDK2, KLF2, IL21R, PPA2, FLI1, STAT3, T...",0.150685,0.000336,91,11,73
19,56,rheumatoid arthritis,"[IL12RB2, TM2D1, PLA2G4A, SMAD3, RASGRP1, ICOS...",0.175,0.001191,85,7,40
177,85,inflammatory bowel disease,"[IRF4, IL21R, BRD1, FLI1, ELF4, BATF, IPMK, ST...",0.164384,0.001981,129,12,73
267,50,asthma,"[GSTCD, INTS12]",1.0,0.002065,91,2,2
86,-1,asthma,"[CDK2, HDAC3, SIK3, ARHGAP27, PTEN, PTPRC, CYL...",0.073059,0.002226,91,32,438
54,114,Hashimoto's thyroiditis,"[IRF4, CD247, PTPRC, CEPT1, GPS2, TBX21, PRKCQ...",0.053398,0.003217,43,11,206
154,91,Crohn's disease,"[IPMK, IL21R, STAT3, IRF1]",0.190476,0.003225,61,4,21
91,-1,psoriasis,"[FIGNL1, PTPN2, AHR, PTEN, DAP, PRKRA, FOXO1, ...",0.050228,0.003787,58,22,438
56,114,asthma,"[IRF4, CD247, CD28, PTPRC, STAT5B, PDCD5, STAT...",0.087379,0.00424,91,18,206


In [7]:
disease_enrichment_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
243,79,inflammatory bowel disease,"[IPMK, STAT3, IRF4, BATF]",0.666667,0.000224,120,4,6
245,79,psoriasis,"[IPMK, STAT3, IRF4]",0.5,0.000412,53,3,6
240,79,Crohn's disease,"[IPMK, STAT3, BATF]",0.5,0.000485,56,3,6
19,35,rheumatoid arthritis,"[PLA2G4A, RASGRP1, CACHD1, IL12RB2, ICOS, SMAD...",0.189189,0.000648,78,7,37
58,99,Hashimoto's thyroiditis,"[GPS2, MAF, CD247, FOXO1, PRKCQ, CREBBP, IRF4,...",0.05641,0.0019,40,11,195
199,19,asthma,"[HDAC3, NCOR1, TBL1XR1]",0.5,0.002238,94,3,6
218,80,autoimmune disease,"[KLF2, BHLHE40, IMMP1L, RASA2, CNOT4, FAM20B, ...",0.317073,0.003462,268,13,41
242,79,autoimmune disease,"[IPMK, STAT3, IRF4, BATF]",0.666667,0.004984,268,4,6
48,-1,autoimmune disease,"[ZNF644, PPA2, MYB, HSPE1, FAM98B, REL, SYK, G...",0.185185,0.0057,268,75,405
22,35,ulcerative colitis,"[C8orf33, RASGRP1, IL12RB2, SMAD3, NDRG1]",0.135135,0.005817,60,5,37


In [5]:
disease_enrichment_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
203,92,inflammatory bowel disease,"[IRF4, IPMK, BATF, STAT3, RASA2, DR1]",0.4,0.000168,104,6,15
39,-1,Crohn's disease,"[PTPN2, BATF, STAT3, ACTB, ATP2A2, NAA25, DAP,...",0.061765,0.00027,49,21,340
45,-1,inflammatory bowel disease,"[KEAP1, PTPN2, BATF, STAT3, ACTB, NDRG1, ATP2A...",0.105882,0.000394,104,36,340
187,93,autoimmune disease,"[STAT5B, KLF2, BHLHE40, CNOT4, BRD1, FAM20B, F...",0.473684,0.00051,235,9,19
155,18,asthma,"[HDAC3, TBL1XR1, NCOR1]",0.75,0.000541,87,3,4
205,92,psoriasis,"[IRF4, IPMK, STAT3, LRRC25]",0.266667,0.000571,46,4,15
168,90,Hashimoto's thyroiditis,"[IRF4, CD247, PTPRC, TBX21, PRKCQ, TBCA, CREBB...",0.0625,0.0007,33,10,160
200,92,Crohn's disease,"[IPMK, BATF, RASA2, STAT3]",0.266667,0.000729,49,4,15
17,16,rheumatoid arthritis,"[IL12RB2, TM2D1, SMAD3, RASGRP1, ICOS, CACHD1]",0.2,0.001049,68,6,30
41,-1,ankylosing spondylitis,"[HDAC7, IMMP2L, MAPK14, UBE2L3, DAP, PTPN2, BR...",0.044118,0.001454,34,15,340


### Check downstream enrichment (didn't think of best way to get background set)

In [15]:
df_downstream = pd.read_csv('../../../../3_expts/processed_data/analysis_largefiles/nde75ntotal50_varfiltered_simple_clustering_downstream_genes.csv', index_col=0)
df_downstream_top = df_downstream[(df_downstream['zscore_rank_negative_regulation']<50)|(df_downstream['zscore_rank_positive_regulation']<50)].copy()

In [16]:
disease_association_df = pd.read_csv('disease_gene_associations_detailed.csv', index_col=0)
disease_association_df = disease_association_df[disease_association_df.gene_symbol.isin(df_downstream_top.downstream_gene.unique())]
#disease_association_df = disease_association_df[disease_association_df.gene_symbol.isin(adata_de.obs.target_contrast_gene_name_corrected.unique())]

disease_genes = {}
for disease_name in disease_association_df.disease_name.unique():
    disease_genes[disease_name] = set(disease_association_df[disease_association_df.disease_name==disease_name].gene_symbol)

disease_enrichment_downstream_df, _ = assess_complex_enrichment(df_downstream_top, disease_genes, 'hdbscan_cluster', 'downstream_gene')

In [17]:
disease_enrichment_downstream_df.sort_values(by='raw_p_value').head(50)

Unnamed: 0,cluster,complex,overlap_genes,overlap_fraction,raw_p_value,complex_size,overlap_size,cluster_size
845,79,rheumatoid arthritis,"[JADE2, GPR174, CD28, STAT4, IRF8, PDE4A, P2RY...",0.173469,2e-06,196,17,98
841,79,celiac disease,"[CD28, STAT4, CTLA4, UBE2E3, AEBP2, SH2B3, CIITA]",0.071429,3.3e-05,41,7,98
840,79,autoimmune disease,"[CD28, IKBKE, PDE4A, SESN1, SH2B3, CIITA, CAMK...",0.295918,7.4e-05,614,29,98
517,82,type 1 diabetes mellitus,"[CD44, IL2RA, SHC4, STAT4, FASLG, CD6, GLUL, G...",0.102041,7.6e-05,99,10,98
846,79,systemic lupus erythematosus,"[JADE2, RYBP, STAT4, IKBKE, IRF8, DRAP1, CTLA4...",0.122449,7.8e-05,141,12,98
674,55,autoimmune disease,"[TMEM116, PSMB9, RBM17, PHGDH, TNFSF8, DPP4, I...",0.28866,0.000158,614,28,97
313,38,ulcerative colitis,"[SEPTIN9, LSP1, CD28, FYB1, DOCK5, SELL, SATB1...",0.112245,0.000249,136,11,98
1114,94,asthma,"[CXCR4, RAD50, MAP4K4, IL2RA, ASS1, IL13, ICOS...",0.142857,0.000273,210,14,98
338,73,inflammatory bowel disease,"[GPR183, RYBP, RAP2C, MTRR, STK24, NT5E, ALOX5...",0.163265,0.000391,271,16,98
676,55,multiple sclerosis,"[IL7R, CARD11, TIMMDC1, RBM17, SATB1, CD69, TN...",0.092784,0.000396,100,9,97
