In [1]:
import warnings
import scanpy as sc
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import time
import sys
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from scipy.stats import pearsonr

import matplotlib.cm as cm
import matplotlib.colors as mcolors 
import time
from statsmodels.stats.multitest import multipletests


# Make the sibling DE-analysis folder importable (path is relative to the notebook's working directory).
sys.path.append('../3_DE_analysis/')
# NOTE(review): the star import hides which helper names this notebook relies on;
# consider importing the required names explicitly.
from DE_analysis_utils import *

# Show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
# Default figure size for scanpy plots.
sc.set_figure_params(figsize=(20, 4))

  from anndata import __version__ as anndata_version
  if Version(anndata.__version__) >= Version("0.11.0rc2"):
  if Version(anndata.__version__) >= Version("0.11.0rc2"):


In [2]:
# Load the clustered AnnData fully into memory (the backed='r' option is left commented out).
# Later cells read obs['hdbscan'], obs['target_contrast_gene_name_corrected'],
# obs['culture_condition'] and layers['zscore'] from this object.
# The commented-out reads below are inputs that are not used in the cells shown here.
#adata_de = sc.read_h5ad('../../../../3_expts/processed_data/CD4i_final//DE_results_all_confounders/CD4i_final.merged_DE_results_corrected.h5ad', backed='r')
adata_final = sc.read_h5ad('../../../../3_expts/processed_data/analysis_largefiles/nde75ntotal50_varfiltered_simple_clustering.h5ad')#, backed='r')
#de_by_guide = pd.read_csv('../3_DE_analysis/results/DE_by_guide.correlation_results.csv', index_col=0)
#donor_robustness_summary = pd.read_csv('../3_DE_analysis/results/DE_donor_robustness_correlation_summary.csv', index_col=0)
#de_summary_stats = pd.read_csv('../../../../3_expts/processed_data/CD4i_final/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv', index_col=0)

In [3]:
# Per-cluster summary table: mean pairwise correlation between member z-score profiles,
# number of observations, number of distinct target genes, and the list of those genes.
num_of_cluster = len(adata_final.obs['hdbscan'].unique())
cluster_name = []
corr = []
cluster_size = []
cluster_gene_size = []
cluster_member = []
for cl in adata_final.obs['hdbscan'].unique():
    # Slice the AnnData once per cluster instead of once per use.
    adata_cl = adata_final[adata_final.obs['hdbscan']==cl]
    df = pd.DataFrame(data=adata_cl.layers['zscore'],
                      index=adata_cl.obs_names,
                      columns=adata_final.var_names)
    # Correlation between observations (rows of df), hence the transpose.
    df_corr = df.T.corr()
    # Work on an explicit copy: writing into `.values` fails when pandas hands out a read-only view.
    corr_values = df_corr.to_numpy(copy=True)
    np.fill_diagonal(corr_values, 0)
    member_genes = adata_cl.obs.target_contrast_gene_name_corrected.unique().tolist()

    cluster_name.append(int(cl))
    # Always store one scalar per cluster. np.mean(DataFrame) yields a per-column Series on
    # older pandas and a scalar on newer pandas; nanmean over all entries matches the scalar form.
    # NOTE(review): the zeroed diagonal still counts in the denominator, so this is
    # sum(off-diagonal) / n**2 rather than the off-diagonal mean — kept as in the original.
    corr.append(float(np.nanmean(corr_values)))
    cluster_size.append(len(df))
    cluster_gene_size.append(len(member_genes))
    cluster_member.append(member_genes)

cluster_df = pd.DataFrame({'cluster': cluster_name,
                           'intracluster_corr': corr,
                           'cluster_size': cluster_size,
                           'cluster_gene_size': cluster_gene_size,
                           'cluster_member': cluster_member})

In [3]:
import gseapy
from gseapy import Msigdb
from gseapy import barplot, dotplot

# Fetch the KEGG (legacy) and Reactome canonical-pathway collections through gseapy's Msigdb helper,
# pinned to database version 2025.1.Hs.
# NOTE(review): presumably each call downloads a GMT and returns {gene_set_name: [gene symbols]};
# the result is passed as the `complexes` mapping to assess_complex_enrichment — confirm.
msig = Msigdb()
kegg_gene_sets = msig.get_gmt(category= 'c2.cp.kegg_legacy', dbver="2025.1.Hs")
reactome_gene_sets = msig.get_gmt(category= 'c2.cp.reactome', dbver="2025.1.Hs")

In [4]:
# Reference gene sets: CORUM complexes and STRING clusters.
corum_df = pd.read_csv('../../../../2_files/enrichment_analysis/CORUM/corum_humanComplexes.txt', delimiter='\t', index_col='complex_id')
stringdb = pd.read_csv('../../../../2_files/enrichment_analysis/STRINGDB/9606.clusters.proteins.v12.0.txt.gz', delimiter='\t', compression='gzip')
protein_info = pd.read_csv('../../../../2_files/enrichment_analysis/STRINGDB/9606.protein.info.v12.0.txt.gz', delimiter='\t', compression='gzip')
cluster_info = pd.read_csv('../../../../2_files/enrichment_analysis/STRINGDB/9606.clusters.info.v12.0.txt.gz', delimiter='\t', compression='gzip')
# Attach protein annotation, then cluster annotation, to every (cluster, protein) row.
stringdb_df = pd.merge(stringdb, protein_info, left_on='protein_id', right_on='#string_protein_id')
stringdb_df = pd.merge(stringdb_df, cluster_info, left_on='cluster_id', right_on='cluster_id')
# Keep only rows whose cluster_size is below 1000.
stringdb_df = stringdb_df[stringdb_df.cluster_size<1000].copy()

# complex name -> set of subunit gene names (the column is ';'-separated).
# If a complex_name occurs on several rows, the last row wins, as with the previous row loop.
corum_complexes = {
    complex_name: set(subunit_field.split(';'))
    for complex_name, subunit_field in zip(corum_df['complex_name'], corum_df['subunits_gene_name'])
}

# STRING cluster id -> set of preferred gene names.
# One groupby pass replaces re-filtering the whole table once per cluster id;
# sort=False keeps the ids in order of first appearance, like .unique() did.
stringdb_complexes = {
    cluster_id: set(gene_names)
    for cluster_id, gene_names in stringdb_df.groupby('cluster_id', sort=False)['preferred_name']
}

In [6]:
def assess_complex_enrichment(df, complexes, cluster_label, gene_name_label):
    """
    Hypergeometric over-representation test of gene sets against clusters.

    NOTE: a function with this same name is defined again further down in the
    notebook; once that cell has run, it replaces this version.

    Parameters
    ----------
    df : DataFrame with one row per (cluster, gene) assignment.
    complexes : mapping of gene-set name -> collection of gene names.
    cluster_label : column of `df` holding the cluster id.
    gene_name_label : column of `df` holding the gene name.

    Returns
    -------
    (all_result_df, best_result_df)
        all_result_df has one row per (cluster, gene set) pair sharing at least
        two genes; best_result_df has, per cluster, the pair with the smallest
        raw p-value (kept only if that p-value is below 1).
        P-values are raw: no multiple-testing correction is applied here.
    """
    # Gene membership of every cluster, as sets.
    genes_by_cluster = {
        label: set(df[df[cluster_label] == label][gene_name_label])
        for label in df[cluster_label].unique()
    }

    # Background size = number of distinct genes present in df.
    background_size = len(df[gene_name_label].unique())

    every_hit = []
    top_hits = []
    for label, member_genes in genes_by_cluster.items():
        n_members = len(member_genes)
        top_hit = None
        for set_name, set_genes in complexes.items():
            set_size = len(set_genes)
            shared = member_genes.intersection(set_genes)
            n_shared = len(shared)

            # Skip empty inputs and overlaps of fewer than two genes.
            if n_members <= 0 or set_size <= 0 or n_shared <= 1:
                continue

            # P(X >= n_shared) under the hypergeometric null.
            p_value = hypergeom.sf(n_shared - 1, background_size, set_size, n_members)
            record = {
                'cluster': label,
                'complex': set_name,
                'overlap_genes': list(shared),
                'overlap_fraction': n_shared / n_members,
                'raw_p_value': p_value,
                'complex_size': set_size,
                'overlap_size': n_shared,
                'cluster_size': n_members,
            }

            if top_hit is None or p_value < top_hit['raw_p_value']:
                top_hit = record
            every_hit.append(record)

        if top_hit is not None and top_hit['raw_p_value'] < 1:
            top_hits.append(top_hit)

    return pd.DataFrame(every_hit), pd.DataFrame(top_hits)

In [9]:
def assess_complex_enrichment(df, complexes, cluster_label, gene_name_label, restrict_to_background=False):
    """
    Assess pathway/complex enrichment with Benjamini-Hochberg FDR correction.

    For every cluster in `df` and every gene set in `complexes`, a one-sided
    hypergeometric test (P(X >= overlap)) is computed for pairs that share at
    least two genes. The background is the set of distinct genes in `df`.

    Parameters
    ----------
    df : DataFrame with one row per (cluster, gene) assignment.
    complexes : mapping of gene-set name -> collection of gene names.
    cluster_label : column of `df` holding the cluster id.
    gene_name_label : column of `df` holding the gene name.
    restrict_to_background : bool, default False
        If True, each gene set is intersected with the genes present in `df`
        before its size K is counted, so K can never exceed the background
        size N. The default keeps the previous behaviour (K = full set size).

    Returns
    -------
    (all_result_df, best_result_df)
        all_result_df: one row per tested pair with columns cluster, complex,
        overlap_genes, overlap_fraction, raw_p_value, complex_size,
        overlap_size, cluster_size, fdr; sorted by cluster, fdr, raw_p_value.
        best_result_df: first (lowest-FDR) row per cluster.
        When nothing qualifies, both frames are empty but carry these columns,
        so callers can merge or filter by column name without special-casing.

    Notes
    -----
    The FDR is computed over the tested pairs only (overlap >= 2), pooled across
    all clusters for this `complexes` mapping.
    NOTE(review): with restrict_to_background=False a gene set can be larger than
    the background; verify how scipy/statsmodels treat the resulting p-value.
    """
    result_columns = ['cluster', 'complex', 'overlap_genes', 'overlap_fraction',
                      'raw_p_value', 'complex_size', 'overlap_size', 'cluster_size']

    de_genes_per_cluster = {
        cluster: set(df[df[cluster_label] == cluster][gene_name_label])
        for cluster in df[cluster_label].unique()
    }

    N = len(df[gene_name_label].unique())

    if restrict_to_background:
        # Trim every gene set to the tested universe once, outside the cluster loop.
        background = set(df[gene_name_label])
        complexes = {name: background.intersection(subunits) for name, subunits in complexes.items()}

    all_result = []

    # 1. Collect all raw p-values first
    for cluster, de_genes in de_genes_per_cluster.items():
        n = len(de_genes)

        for complex_name, subunits in complexes.items():
            K = len(subunits)
            overlap = de_genes.intersection(subunits)
            k = len(overlap)

            # Filter for non-trivial overlaps (at least 2 genes)
            if (n > 0 and K > 0) and (k > 1):
                pval = hypergeom.sf(k - 1, N, K, n)
                all_result.append({
                    'cluster': cluster,
                    'complex': complex_name,
                    'overlap_genes': list(overlap),
                    'overlap_fraction': k / n,
                    'raw_p_value': pval,
                    'complex_size': K,
                    'overlap_size': k,
                    'cluster_size': n
                })

    # 2. Create DataFrame with a fixed schema, even when there are no rows
    all_result_df = pd.DataFrame(all_result, columns=result_columns)

    if all_result_df.empty:
        # Nothing to correct: return two empty frames sharing the full schema.
        all_result_df['fdr'] = pd.Series(dtype=float)
        return all_result_df, all_result_df.copy()

    # 3. Apply FDR Correction (Benjamini-Hochberg) across all tests for this mapping
    _, fdr_pvals, _, _ = multipletests(all_result_df['raw_p_value'], alpha=0.05, method='fdr_bh')
    all_result_df['fdr'] = fdr_pvals

    # Sort by cluster and significance
    all_result_df = all_result_df.sort_values(by=['cluster', 'fdr', 'raw_p_value'])

    # 4. Best result per cluster = first row after sorting (lowest FDR)
    best_result_df = all_result_df.drop_duplicates(subset=['cluster'], keep='first').copy()

    return all_result_df, best_result_df

In [47]:
def run_enrichment_analysis(df, cluster_label, gene_name_label,
                            intercluster_max_pval=1e-2, intercluster_min_overlap_fraction=0.2):
    """
    Run gene-set enrichment for every cluster against STRING, CORUM, KEGG and
    Reactome, plus a cluster-vs-cluster overlap check, and merge the per-cluster
    best hits into one table.

    Uses the notebook-level objects stringdb_complexes, stringdb_df,
    corum_complexes, kegg_gene_sets, reactome_gene_sets and
    assess_complex_enrichment.

    Parameters
    ----------
    df : DataFrame with one row per (cluster, gene) assignment.
    cluster_label : column of `df` holding the cluster id.
    gene_name_label : column of `df` holding the gene name.
    intercluster_max_pval : float, default 1e-2
        Another cluster counts as related only if the raw p-value is below this.
    intercluster_min_overlap_fraction : float, default 0.2
        ... and at least this fraction of the cluster's genes is shared.

    Returns
    -------
    (enrichment_df, corum_enrichment_all, stringdb_enrichment_all,
     kegg_enrichment_all, reactome_enrichment_all)
        enrichment_df has one row per cluster: best CORUM/STRING/KEGG/Reactome hit
        (columns suffixed _corum/_stringdb/_kegg/_reactome), the cluster's gene
        set ('cluster_member') and the set of related clusters ('related_cluster').
        The *_all frames are the full result tables from assess_complex_enrichment.
    """
    stringdb_annotation = stringdb_df[['cluster_id', 'best_described_by']].drop_duplicates()

    def _annotate_stringdb(result_df):
        # Attach STRING's 'best_described_by' text to each hit (inner join on the cluster id).
        if 'complex' not in result_df.columns:
            # Empty result without a schema: nothing to annotate.
            return result_df
        merged = pd.merge(result_df, stringdb_annotation, left_on='complex', right_on='cluster_id')
        return merged.drop(columns=['cluster_id'])

    # STRINGDB enrichment
    stringdb_enrichment_all, stringdb_enrichment_best = assess_complex_enrichment(df, stringdb_complexes, cluster_label, gene_name_label)
    stringdb_enrichment_all = _annotate_stringdb(stringdb_enrichment_all)
    stringdb_enrichment_best = _annotate_stringdb(stringdb_enrichment_best)
    # Corum enrichment
    corum_enrichment_all, corum_enrichment_best = assess_complex_enrichment(df, corum_complexes, cluster_label, gene_name_label)
    # KEGG enrichment
    kegg_enrichment_all, kegg_enrichment_best = assess_complex_enrichment(df, kegg_gene_sets, cluster_label, gene_name_label)
    # Reactome enrichment
    reactome_enrichment_all, reactome_enrichment_best = assess_complex_enrichment(df, reactome_gene_sets, cluster_label, gene_name_label)

    # Also checking inter-cluster overlap: each cluster's gene set is computed once
    # and reused both as a "complex" and as the 'cluster_member' column.
    gene_grouping = {
        cluster: set(df[df[cluster_label] == cluster][gene_name_label])
        for cluster in df[cluster_label].unique()
    }
    gene_grouping_df = pd.DataFrame({cluster: [genes] for cluster, genes in gene_grouping.items()}).T.rename(columns={0: 'cluster_member'})
    gene_grouping_df['cluster'] = gene_grouping_df.index

    intercluster_enrichment_all, _ = assess_complex_enrichment(df, gene_grouping, cluster_label, gene_name_label)
    intercluster_enrichment_summary = {}
    if not intercluster_enrichment_all.empty:
        # Drop self-comparisons, then keep significant pairs with a sizeable overlap.
        is_related = (
            (intercluster_enrichment_all['cluster'] != intercluster_enrichment_all['complex'])
            & (intercluster_enrichment_all['raw_p_value'] < intercluster_max_pval)
            & (intercluster_enrichment_all['overlap_fraction'] >= intercluster_min_overlap_fraction)
        )
        related_pairs = intercluster_enrichment_all[is_related]
        for cluster in related_pairs['cluster'].unique():
            intercluster_enrichment_summary[cluster] = set(related_pairs[related_pairs['cluster'] == cluster]['complex'])

    intercluster_enrichment_summary_df = pd.DataFrame(list(intercluster_enrichment_summary.items()), columns=['cluster', 'related_cluster'])

    # Summarize results: outer merges so a cluster with no hit in a database is still listed.
    enrichment_df1 = pd.merge(corum_enrichment_best, stringdb_enrichment_best, on='cluster', how='outer', suffixes=('_corum', '_stringdb'))
    enrichment_df2 = pd.merge(kegg_enrichment_best, reactome_enrichment_best, on='cluster', how='outer', suffixes=('_kegg', '_reactome'))
    enrichment_df = pd.merge(enrichment_df1, enrichment_df2, on='cluster', how='outer')
    enrichment_df = pd.merge(enrichment_df, gene_grouping_df, on='cluster', how='outer')
    enrichment_df = pd.merge(enrichment_df, intercluster_enrichment_summary_df, on='cluster', how='outer')

    return enrichment_df, corum_enrichment_all, stringdb_enrichment_all, kegg_enrichment_all, reactome_enrichment_all

### Check regulator enrichment

In [48]:
# Regulator-level enrichment: clusters are obs['hdbscan'] and genes are the perturbed targets
# (obs['target_contrast_gene_name_corrected']). Unpacks the merged per-cluster table plus the
# full CORUM / STRING / KEGG / Reactome result tables.
enrichment_reg_df,\
corum_enrichment_reg_all,\
stringdb_enrichment_reg_all,\
kegg_enrichment_reg_all,\
reactome_enrichment_reg_all = run_enrichment_analysis(adata_final.obs[['hdbscan', 'target_contrast_gene_name_corrected']], 'hdbscan', 'target_contrast_gene_name_corrected')

In [49]:
# Add cluster information
cluster_info_columns = ['intracluster_corr', 'cluster_size', 'cluster_gene_size']
enrichment_reg_df = pd.merge(
    cluster_df[['cluster'] + cluster_info_columns],
    # Drop columns left behind by an earlier run of this cell, so re-running it does not
    # produce duplicated _x/_y columns (no-op on the first run).
    enrichment_reg_df.drop(columns=cluster_info_columns, errors='ignore'),
    on='cluster', how='outer')

# Add timepoint information
# Only obs columns are needed, so filter adata_final.obs once per cluster instead of
# building several AnnData views per row.
obs_df = adata_final.obs
condition_columns = {'rest_count': 'Rest', 'stim8hr_count': 'Stim8hr', 'stim48hr_count': 'Stim48hr'}
cluster_member_with_condition = []
condition_counts = {column: [] for column in condition_columns}
for cluster in enrichment_reg_df['cluster']:
    members = obs_df[obs_df.hdbscan == cluster]
    # One "<gene>_<condition>" label per observation in the cluster.
    cluster_member_with_condition.append(
        [f"{item1}_{item2}" for item1, item2 in zip(members.target_contrast_gene_name_corrected.tolist(),
                                                    members.culture_condition.tolist())])
    for column, condition in condition_columns.items():
        condition_counts[column].append((members.culture_condition == condition).sum())

for column, counts in condition_counts.items():
    # float dtype matches what the previous row-by-row .loc assignment produced.
    enrichment_reg_df[column] = np.asarray(counts, dtype=float)
enrichment_reg_df['cluster_member_with_condition'] = cluster_member_with_condition

In [50]:
# Write every regulator-level table twice: parquet first, then CSV.
reg_outputs = [
    (enrichment_reg_df,
     'results/clustering_nde75ntotal50_reg.parquet',
     'results/clustering_nde75ntotal50_reg.csv'),
    (corum_enrichment_reg_all,
     'results/corum_enrichment_nde75ntotal50_reg.parquet',
     'results/corum_enrichment_nde75ntotal50_reg.csv'),
    (stringdb_enrichment_reg_all,
     'results/stringdb_enrichment_nde75ntotal50_reg.parquet',
     'results/stringdb_enrichment_nde75ntotal50_reg.csv'),
    (kegg_enrichment_reg_all,
     'results/kegg_enrichment_nde75ntotal50_reg.parquet',
     'results/kegg_enrichment_nde75ntotal50_reg.csv'),
    (reactome_enrichment_reg_all,
     'results/reactome_enrichment_nde75ntotal50_reg.parquet',
     'results/reactome_enrichment_nde75ntotal50_reg.csv'),
]
for result_table, parquet_path, csv_path in reg_outputs:
    result_table.to_parquet(parquet_path)
    result_table.to_csv(csv_path)

### Check downstream enrichment

In [51]:
# Per-cluster downstream-gene table; later cells use its 'hdbscan_cluster' and 'downstream_gene' columns.
df_downstream = pd.read_csv('../../../../3_expts/processed_data/analysis_largefiles/nde75ntotal50_varfiltered_simple_clustering_downstream_genes.csv', index_col=0)
# Keep genes whose rank is below 50 in either direction (negative or positive regulation).
# NOTE(review): whether the ranks start at 0 or 1 is not visible here, so this keeps
# either 50 or 49 genes per direction — confirm.
df_downstream_top = df_downstream[(df_downstream['zscore_rank_negative_regulation']<50)|(df_downstream['zscore_rank_positive_regulation']<50)].copy()

In [52]:
# Downstream-level enrichment: clusters are 'hdbscan_cluster' and genes are the top-ranked
# downstream genes selected in df_downstream_top. The background used by the test is the set
# of distinct genes in df_downstream_top (see assess_complex_enrichment).
enrichment_downstream_df,\
corum_enrichment_downstream_all,\
stringdb_enrichment_downstream_all,\
kegg_enrichment_downstream_all,\
reactome_enrichment_downstream_all = run_enrichment_analysis(df_downstream_top, 'hdbscan_cluster', 'downstream_gene')

In [53]:
# Add timepoint information
# Only obs columns are needed, so filter adata_final.obs once per cluster instead of
# building several AnnData views per row.
obs_df = adata_final.obs
condition_columns = {'rest_count': 'Rest', 'stim8hr_count': 'Stim8hr', 'stim48hr_count': 'Stim48hr'}
cluster_member_with_condition = []
condition_counts = {column: [] for column in condition_columns}
for cluster in enrichment_downstream_df['cluster']:
    members = obs_df[obs_df.hdbscan == cluster]
    # One "<gene>_<condition>" label per observation (perturbation) in the cluster.
    cluster_member_with_condition.append(
        [f"{item1}_{item2}" for item1, item2 in zip(members.target_contrast_gene_name_corrected.tolist(),
                                                    members.culture_condition.tolist())])
    for column, condition in condition_columns.items():
        condition_counts[column].append((members.culture_condition == condition).sum())

for column, counts in condition_counts.items():
    # float dtype matches what the previous row-by-row .loc assignment produced.
    enrichment_downstream_df[column] = np.asarray(counts, dtype=float)
enrichment_downstream_df['cluster_member_with_condition'] = cluster_member_with_condition

In [54]:
# Write every downstream-level table twice: parquet first, then CSV.
downstream_outputs = [
    (enrichment_downstream_df,
     'results/clustering_nde75ntotal50_downstream.parquet',
     'results/clustering_nde75ntotal50_downstream.csv'),
    (corum_enrichment_downstream_all,
     'results/corum_enrichment_nde75ntotal50_downstream.parquet',
     'results/corum_enrichment_nde75ntotal50_downstream.csv'),
    (stringdb_enrichment_downstream_all,
     'results/stringdb_enrichment_nde75ntotal50_downstream.parquet',
     'results/stringdb_enrichment_nde75ntotal50_downstream.csv'),
    (kegg_enrichment_downstream_all,
     'results/kegg_enrichment_nde75ntotal50_downstream.parquet',
     'results/kegg_enrichment_nde75ntotal50_downstream.csv'),
    (reactome_enrichment_downstream_all,
     'results/reactome_enrichment_nde75ntotal50_downstream.parquet',
     'results/reactome_enrichment_nde75ntotal50_downstream.csv'),
]
for result_table, parquet_path, csv_path in downstream_outputs:
    result_table.to_parquet(parquet_path)
    result_table.to_csv(csv_path)

In [25]:
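# NOTE: this whole cell is disabled and kept for reference only. It is stale: it unpacks seven
# return values (including zhang_* tables), whereas run_enrichment_analysis in this notebook
# returns five, so it cannot be re-enabled as written.
# top_perc = 0.025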

# df_downstream_neg300 = pd.DataFrame()
# for cl in adata_final.obs.hdbscan.unique():
#     rank_cutoff = len(df_downstream[df_downstream.hdbscan_cluster==cl])*top_perc
#     df_temp = df_downstream[(df_downstream.hdbscan_cluster==cl)&(df_downstream['zscore_rank_negative_regulation']<rank_cutoff)].copy()
#     df_downstream_neg300 = pd.concat([df_downstream_neg300, df_temp])

# enrichment_downstream_neg300_df,\
# corum_enrichment_downstream_neg300_all,\
# stringdb_enrichment_downstream_neg300_all,\
# kegg_enrichment_downstream_neg300_all,\
# reactome_enrichment_downstream_neg300_all,\
# zhang_enrichment_downstream_neg300_all,\
# zhang_pairwise_enrichment_downstream_neg300_all = run_enrichment_analysis(df_downstream_neg300, 'hdbscan_cluster', 'downstream_gene')
# enrichment_downstream_neg300_df.to_csv('results/clustering_nde75ntotal50_downstream_neg300.csv')
# corum_enrichment_downstream_neg300_all.to_csv('results/corum_enrichment_nde75ntotal50_downstream_neg300.csv')
# stringdb_enrichment_downstream_neg300_all.to_csv('results/stringdb_enrichment_nde75ntotal50_downstream_neg300.csv')
# kegg_enrichment_downstream_neg300_all.to_csv('results/kegg_enrichment_nde75ntotal50_downstream_neg300.csv')
# reactome_enrichment_downstream_neg300_all.to_csv('results/reactome_enrichment_nde75ntotal50_downstream_neg300.csv')
# zhang_enrichment_downstream_neg300_all.to_csv('results/zhang_enrichment_nde75ntotal50_downstream_neg300.csv')
# zhang_pairwise_enrichment_downstream_neg300_all.to_csv('results/zhang_pairwise_enrichment_nde75ntotal50_downstream_neg300.csv')

# df_downstream_pos300 = pd.DataFrame()
# for cl in adata_final.obs.hdbscan.unique():
#     rank_cutoff = len(df_downstream[df_downstream.hdbscan_cluster==cl])*top_perc
#     df_temp = df_downstream[(df_downstream.hdbscan_cluster==cl)&(df_downstream['zscore_rank_positive_regulation']<rank_cutoff)].copy()
#     df_downstream_pos300 = pd.concat([df_downstream_pos300, df_temp])
    
# enrichment_downstream_pos300_df,\
# corum_enrichment_downstream_pos300_all,\
# stringdb_enrichment_downstream_pos300_all,\
# kegg_enrichment_downstream_pos300_all,\
# reactome_enrichment_downstream_pos300_all,\
# zhang_enrichment_downstream_pos300_all,\
# zhang_pairwise_enrichment_downstream_pos300_all = run_enrichment_analysis(df_downstream_pos300, 'hdbscan_cluster', 'downstream_gene')
# enrichment_downstream_pos300_df.to_csv('results/clustering_nde75ntotal50_downstream_pos300.csv')
# corum_enrichment_downstream_pos300_all.to_csv('results/corum_enrichment_nde75ntotal50_downstream_pos300.csv')
# stringdb_enrichment_downstream_pos300_all.to_csv('results/stringdb_enrichment_nde75ntotal50_downstream_pos300.csv')
# kegg_enrichment_downstream_pos300_all.to_csv('results/kegg_enrichment_nde75ntotal50_downstream_pos300.csv')
# reactome_enrichment_downstream_pos300_all.to_csv('results/reactome_enrichment_nde75ntotal50_downstream_pos300.csv')
# zhang_enrichment_downstream_pos300_all.to_csv('results/zhang_enrichment_nde75ntotal50_downstream_pos300.csv')
# zhang_pairwise_enrichment_downstream_pos300_all.to_csv('results/zhang_pairwise_enrichment_nde75ntotal50_downstream_pos300.csv')

### Combined regulator and downstream enrichment

In [55]:
# Re-run the regulator-level enrichment and keep only the merged per-cluster table.
# This overwrites the enrichment_reg_df that the regulator section annotated and saved.
# NOTE(review): this repeats the full (slow) analysis from the regulator section, presumably to
# get a table without the cluster/condition columns added there — confirm, or keep a copy instead.
enrichment_reg_df, _, _, _, _ = run_enrichment_analysis(adata_final.obs[['hdbscan', 'target_contrast_gene_name_corrected']], 'hdbscan', 'target_contrast_gene_name_corrected')

In [56]:
# Re-run the downstream-level enrichment and keep only the merged per-cluster table.
# This overwrites the enrichment_downstream_df that the downstream section annotated and saved.
# NOTE(review): repeats the analysis from the downstream section — confirm the recomputation is intended.
enrichment_downstream_df, _, _, _, _ = run_enrichment_analysis(df_downstream_top, 'hdbscan_cluster', 'downstream_gene')

In [58]:
# Combine regulator- and downstream-level results, one row per cluster. The outer join keeps
# clusters present in only one table; columns shared by both get the _reg / _downstream suffix.
enrichment_df = pd.merge(enrichment_reg_df, enrichment_downstream_df, how='outer', on='cluster', suffixes=('_reg', '_downstream'))

In [59]:
# Add cluster information
cluster_info_columns = ['intracluster_corr', 'cluster_size', 'cluster_gene_size']
enrichment_df = pd.merge(
    cluster_df[['cluster'] + cluster_info_columns],
    # Drop columns left behind by an earlier run of this cell, so re-running it does not
    # produce duplicated _x/_y columns (no-op on the first run).
    enrichment_df.drop(columns=cluster_info_columns, errors='ignore'),
    on='cluster', how='outer')

# Add timepoint information
# Only obs columns are needed, so filter adata_final.obs once per cluster instead of
# building several AnnData views per row.
obs_df = adata_final.obs
condition_columns = {'rest_count': 'Rest', 'stim8hr_count': 'Stim8hr', 'stim48hr_count': 'Stim48hr'}
cluster_member_with_condition = []
condition_counts = {column: [] for column in condition_columns}
for cluster in enrichment_df['cluster']:
    members = obs_df[obs_df.hdbscan == cluster]
    # One "<gene>_<condition>" label per observation in the cluster.
    cluster_member_with_condition.append(
        [f"{item1}_{item2}" for item1, item2 in zip(members.target_contrast_gene_name_corrected.tolist(),
                                                    members.culture_condition.tolist())])
    for column, condition in condition_columns.items():
        condition_counts[column].append((members.culture_condition == condition).sum())

for column, counts in condition_counts.items():
    # float dtype matches what the previous row-by-row .loc assignment produced.
    enrichment_df[column] = np.asarray(counts, dtype=float)
enrichment_df['cluster_member_with_condition'] = cluster_member_with_condition

In [60]:
# Save the combined regulator + downstream table, as parquet and as CSV.
# Both writes target the relative 'results/' folder.
enrichment_df.to_parquet('results/clustering_nde75ntotal50_reg_and_downstream.parquet')
enrichment_df.to_csv('results/clustering_nde75ntotal50_reg_and_downstream.csv')