### Set parameters and names

In [1]:
import os
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

import gseapy as gp
from gseapy import barplot, dotplot


In [2]:
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rcParams['savefig.dpi'] = 250
plt.rcParams['figure.dpi'] = 100
plt.rcParams['axes.facecolor'] = 'white'
%matplotlib inline

In [3]:
# Components of the run identifier; in this cell they are only used to build ID.
INPUT_GENES    = 'ALL'
INPUT_FEATURES = 'X_FC'
INPUT_NORM     = 'z'
CODINGS_SIZE = 6

# Run identifier shared by the data and figure directory names.
ID     = f'{CODINGS_SIZE}D_{INPUT_GENES}_{INPUT_FEATURES}_{INPUT_NORM}'
METHOD = 'VAE'
# NOTE(review): presumably the number of clusters of the GMM labelling -- confirm.
k      = 80
LABELS_COL = f'GMM_{METHOD}_{k}'

# DIR_DATA ends with a slash (file names are appended to it directly);
# DIR_FIG does not (file names are joined to it with '/').
DIR_DATA= f'../data/{ID}_analysis/{LABELS_COL}/'
DIR_FIG = f'../figures/{ID}_analysis/{METHOD}/ORA_k{k}'

# Create the figure directory (and missing parents) from Python, so the cell
# does not depend on a POSIX shell being available; a no-op if it already exists.
os.makedirs(DIR_FIG, exist_ok=True)



In [4]:
# Read the pickled gene-cluster dictionary stored under DIR_DATA.
# NOTE: pickle.load can execute arbitrary code while deserialising; only use it
# on files produced by this project's own pipeline.
clusters_pickle = f'{DIR_DATA}gene_clusters_dict.pkl'
with open(clusters_pickle, mode='rb') as handle:
    GENE_CLUSTERS = pickle.load(handle)


In [5]:

# Query the names of the gene-set libraries that gseapy lists for organism='Mouse'.
MOUSE_DB = gp.get_library_name(organism='Mouse')
# Bare trailing expression: shown as the cell output.
MOUSE_DB


['ARCHS4_Cell-lines',
 'ARCHS4_IDG_Coexp',
 'ARCHS4_Kinases_Coexp',
 'ARCHS4_TFs_Coexp',
 'ARCHS4_Tissues',
 'Achilles_fitness_decrease',
 'Achilles_fitness_increase',
 'Aging_Perturbations_from_GEO_down',
 'Aging_Perturbations_from_GEO_up',
 'Allen_Brain_Atlas_10x_scRNA_2021',
 'Allen_Brain_Atlas_down',
 'Allen_Brain_Atlas_up',
 'Azimuth_2023',
 'Azimuth_Cell_Types_2021',
 'BioCarta_2013',
 'BioCarta_2015',
 'BioCarta_2016',
 'BioPlanet_2019',
 'BioPlex_2017',
 'CCLE_Proteomics_2020',
 'CORUM',
 'COVID-19_Related_Gene_Sets',
 'COVID-19_Related_Gene_Sets_2021',
 'Cancer_Cell_Line_Encyclopedia',
 'CellMarker_2024',
 'CellMarker_Augmented_2021',
 'ChEA_2013',
 'ChEA_2015',
 'ChEA_2016',
 'ChEA_2022',
 'Chromosome_Location',
 'Chromosome_Location_hg19',
 'ClinVar_2019',
 'DSigDB',
 'Data_Acquisition_Method_Most_Popular_Genes',
 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
 'Descartes_Cell_Types_and_Tissue_2021',
 'Diabetes_Perturbatio

In [None]:
# Run an Enrichr enrichment (gp.enrichr) for every gene cluster and collect the
# result table together with the cluster size.
#
# The loop variables must not be called `k`: that name holds the constant set in
# the parameters cell, and reusing it here would silently overwrite it.

# Loop-invariant inputs, built once.
gene_sets = ['KEGG_2019_Mouse',
             'WikiPathways_2019_Mouse',
             'GO_Biological_Process_2023',
             'GO_Molecular_Function_2023']
background_path = f'../data/{ID}_analysis/background.list'

# RESULTS maps 'Cluster_<id>' -> {'result': enrichment table, 'n_genes': cluster size}.
RESULTS = {}
for cluster_id, cluster_info in GENE_CLUSTERS.items():
    enr = gp.enrichr(gene_list=cluster_info['gene_list'],
                     background=background_path,
                     gene_sets=gene_sets,
                     organism='Mouse',
                     outdir=None,   # do not write result files to disk
                     verbose=True)

    RESULTS[f'Cluster_{cluster_id}'] = {'result': enr.results,
                                        'n_genes': cluster_info['len']}

### Remove the parenthesised suffix (e.g. the GO ID) from each Term

In [7]:
# Build RESULTS_CLEANED with the parenthesised suffix removed from every 'Term'.
#
# Each result table is copied before it is modified, so the tables held in
# RESULTS keep their original 'Term' values and this cell can be re-run safely.
RESULTS_CLEANED = {}
for key, value in RESULTS.items():
    cleaned = value['result'].copy()
    # Cut at the LAST '(' only: names that contain parentheses themselves
    # (e.g. "... Poly(A) Tail Shortening (GO:...)") must not be truncated at
    # their first '('. Terms without any '(' are left unchanged.
    cleaned['Term'] = cleaned['Term'].str.rsplit('(', n=1).str[0]
    # Keep every other entry of the cluster record (e.g. 'n_genes') as is.
    RESULTS_CLEANED[key] = {**value, 'result': cleaned}

In [8]:
# Colour (hex) assigned to each gene-set library name.
COL_DICT = dict([
    ('KEGG_2019_Mouse',            '#66C2A5'),
    ('WikiPathways_2019_Mouse',    '#FC8D62'),
    ('GO_Biological_Process_2023', '#8DA0CB'),
    ('GO_Molecular_Function_2023', '#E78AC3'),
])


In [None]:
# Draw one bar plot per cluster (top 5 terms per gene-set library, coloured by
# library) and save it as '<DIR_FIG>/<cluster>.pdf'. For clusters where barplot
# raises ValueError, a '<cluster>_no_sign_terms.error' marker file is written
# instead.
for cluster_k, value in RESULTS_CLEANED.items():
    n_genes = value['n_genes']
    df = value['result']

    # Only the plotting call is guarded, so that an error while saving is not
    # misreported as "No significant terms".
    try:
        ax = barplot(df,
                     column="Adjusted P-value",
                     group='Gene_set',
                     color=COL_DICT,
                     top_term=5,
                     title=f'{cluster_k}, n={n_genes}')
    except ValueError:
        # NOTE(review): barplot is expected to raise ValueError when no term
        # passes its significance cutoff -- confirm against the gseapy version in use.
        with open(f'{DIR_FIG}/{cluster_k}_no_sign_terms.error', "w") as file:
            file.write("No significant terms")
        continue

    # Save through the figure that owns the returned axes rather than through
    # pyplot's "current figure".
    fig = ax.figure
    fig.savefig(f'{DIR_FIG}/{cluster_k}.pdf', format="pdf", bbox_inches="tight")

    # Show and release the figure right away; otherwise one figure per cluster
    # stays open until the whole cell has finished.
    plt.show()
    plt.close(fig)
