In [None]:
# Report the Python version of the running kernel.
from platform import python_version
print(python_version())

### Calculating all possible enrichment analyses
  - For each LFC/FDR cutoff pair, an enrichment analysis is calculated
  - We use the Enrichr Python API with the following gene-set libraries:
     - Reactome (2022)
     - Bioplanet (2019)
     - WikiPathways (2021 Human)
     - KEGG (2021 Human)
     - GO Biological Process (2021)
     - MSigDB Hallmark (2020)
   
### For each enriched pathway, the following are calculated:
  - DEGs in the pathway
  - DEGs not in the pathway
  - TOI1, 2, 3, 4

In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

sys.path.insert(1, '../src/')

from Basic import *
from enricher_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Load the notebook parameters from params.yml into `dic_yml`.
# Requires PyYAML (pip3 install pyyaml).
# The encoding is passed explicitly so that parsing does not depend on the
# locale-specific default encoding of the machine running the notebook.
with open('params.yml', 'r', encoding='utf-8') as file:
    dic_yml=yaml.safe_load(file)


In [None]:
# --- Unpack the YAML parameters into notebook-level names ---

# Root directory and contact e-mail
root0, email = dic_yml['root0'], dic_yml['email']

# Project identifiers
project, s_project = dic_yml['project'], dic_yml['s_project']

# Omics labels
gene_protein, s_omics = dic_yml['gene_protein'], dic_yml['s_omics']

# Age / gender flags
has_age, has_gender = dic_yml['has_age'], dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

# Enrichment settings
abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

# Pathway-index and sampling settings
tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

case_list = dic_yml['case_list']

# Pathway cutoffs
pval_pathway_cutoff, fdr_pathway_cutoff, num_of_genes_cutoff = (
    dic_yml['pval_pathway_cutoff'],
    dic_yml['fdr_pathway_cutoff'],
    dic_yml['num_of_genes_cutoff'],
)

run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# `exp_normalization` is 'quantile_norm' when normalization is requested, else None;
# `normalization` is the matching label, with None spelled out as 'not_normalized'.
exp_normalization = None
if want_normalized:
    exp_normalization = 'quantile_norm'
normalization = exp_normalization if exp_normalization is not None else 'not_normalized'

cfg = Config(project, s_project, case_list, root0)

# Start with the first case of the list.
case = case_list[0]

# Placeholder values (-1) for the pathway-level counters.
# `n_degs` is not pre-set here because the lookup below assigns it.
n_genes_annot_ptw, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio=-1,-1,-1,-1

# Fetch the best LFC/FDR cutoffs for `case`, using the normalization label derived from
# `want_normalized` instead of a hard-coded 'not_normalized', so the lookup follows the
# same normalization as the rest of the notebook.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw=cfg.get_best_lfc_cutoff(case, normalization)


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the enrichment helper from the parameters loaded above,
# starting with gene-set library index 0.
enr = enricheR(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

case = case_list[0]

# Register the default cutoffs (|LFC| = 1, FDR = 0.05) before opening the case.
enr.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)

print("\nEcho Parameters:")
enr.echo_parameters()

geneset_num = enr.geneset_num

In [None]:
# Compare the value stored on the enricher with the one read from params.yml.
enr.abs_lfc_cutoff_inf, abs_lfc_cutoff_inf

In [None]:
# Row count and first two rows of the gene table held by the enricher.
print(len(enr.gene.df_my_gene))
enr.gene.df_my_gene.head(2)

In [None]:
# Sorted names in the result directory that contain 'medulloblastoma_DEG_',
# skipping '~lock' files.
lista = sorted(
    fname
    for fname in os.listdir(enr.root_result)
    if 'medulloblastoma_DEG_' in fname and '~lock' not in fname
)
print(len(lista))
lista[:3]

In [None]:
# Names in the enrichment directory that contain both 'Reactome_' and '_WNT_',
# skipping '~lock' files.
files = [
    fname
    for fname in os.listdir(enr.root_enrichment)
    if 'Reactome_' in fname and '_WNT_' in fname and '~lock' not in fname
]
print(len(files))
files[:2]

### Summary of cases - below one can see the enrichment tables for the different databases

In [None]:
# Open every case in turn and echo the enricher's parameters for it,
# with a dashed separator between cases.
print("")

for case in case_list:
    enr.open_case(case, verbose=False)
    enr.echo_parameters()
    print("\n------------------\n\n")

### Cutoffs and Results

In [None]:
# Per-case summary: the LFC/FDR cutoffs in effect with the number of entries in `degs`,
# followed by the pathway cutoffs and the pathway counts stored on the enricher.
# The FDR cutoff uses the same 3-decimal format as the other cutoffs, and the
# "pathwyas" typo in the printed label is corrected.
for case in case_list:
    ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)

    print(f"For {case}")
    print(f"\tLFC cutoffs: lfc={enr.abs_lfc_cutoff:.3f}; fdr={enr.fdr_lfc_cutoff:.3f}; #{enr.s_deg_dap}s={len(degs)}")
    print(f"\tPathway cutoffs: fdr={enr.pathway_fdr_cutoff:.3f}; num of genes={enr.num_of_genes_cutoff}, #Pathways={enr.n_pathways}, #{enr.s_deg_dap}s in pathways={enr.n_degs_in_pathways}\n")


In [None]:
# df2=enr.dflfc_ori[ (enr.dflfc_ori.symbol == 'IGHA2') | (enr.dflfc_ori.symbol == 'A2M')]
# df2

In [None]:
# Ask the enricher for its LFC file names and title, and display two of them.
fname_final_ori, fname_ori, title=enr.set_lfc_names()
fname_final_ori, title

In [None]:
# Display what set_enrichment_name() returns for the current state of the enricher.
enr.set_enrichment_name()

In [None]:
# Retrieve the best pathway cutoffs with verbose output.
enr.get_best_ptw_cutoff_biopax(verbose=True)
# NOTE(review): presumably related to enr.pathway_pval_cutoff, enr.pathway_fdr_cutoff and
# enr.num_of_genes_cutoff (named in the original note below) - confirm in enricher_lib.
# self.pathway_pval_cutoff, self.pathway_fdr_cutoff, self.num_of_genes_cutoff,

### Testing EnrichR API 

In [None]:
# Re-open the first case and display the return flag together with the DEG counts.
case=case_list[0]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
ret, len(degs), enr.n_degs, enr.n_degs_ensembl

In [None]:
# dfdegs.columns

In [None]:
# Size and first rows of the DEG table returned by open_case().
print(len(dfdegs))
dfdegs.head(3)

In [None]:
# Keep only rows that have an Ensembl id and whose biotype is not 'TEC'.
has_ensembl_id = dfdegs.ensembl_id.notna()
not_tec = dfdegs.biotype.ne('TEC')
dfdegs_ensembl = dfdegs.loc[has_ensembl_id & not_tec].copy()

# Columns shown in the preview below.
cols = [
    'probe', 'symbol', 'symbol_prev', 'symb_or_syn', 'biotype', '_type',
    'lfc', 'abs_lfc', 'fdr', 'description',
    'desc_gff', 'description_prev', 'accession', 'ensembl_id',
    'ensembl_transc_id', 'geneid', 'cytoband', 'symbol_pipe',
]

# Compare the table sizes computed here with those stored on the enricher.
print(len(dfdegs), len(enr.dflfc), len(dfdegs_ensembl), len(enr.dfdegs_ensembl))
dfdegs_ensembl[cols].head()

In [None]:
# Distinct biotype values remaining after the filter.
np.unique(dfdegs_ensembl.biotype)

In [None]:
# Lengths of the two DEG lists stored on the enricher.
len(enr.degs), len(enr.degs_ensembl)

In [None]:
# The matching counters stored on the enricher, for comparison with the lengths above.
enr.n_degs, enr.n_degs_ensembl

In [None]:
# Select gene-set library index 0.
enr.set_db(geneset_num=0)

In [None]:
# Open a session with the Ensembl-annotated DEG list and display the returned identifiers.
# NOTE(review): presumably performs a network request to the Enrichr service - confirm.
shortId, userListId=enr.open_session_upload_symbols(enr.degs_ensembl)
shortId, userListId

### All enriched cases for many databases

In [None]:
# Pathway FDR cutoff list stored on the enricher.
fdr_ptw_cutoff_list=enr.fdr_ptw_cutoff_list
fdr_ptw_cutoff_list

In [None]:
# Load the simulation table through the enricher
# (alternative: pdreadcsv(enr.cfg.fname_lfc_cutoff, enr.cfg.root_config)).
dfsim = enr.open_simulation_table()
# Fall back to an empty frame when no table is available, so the preview below still works.
dfsim = pd.DataFrame() if dfsim is None else dfsim

print(len(dfsim))
dfsim.tail(3)

In [None]:
# LFC list stored on the enricher.
enr.lfc_list

In [None]:
# FDR list stored on the enricher.
enr.fdr_list

### How many samples per case?

In [None]:
# Count, for each case, the simulation rows that match the enricher's normalization
# and have at least `enr.num_min_degs_for_ptw_enr` DEGs.
for case in case_list:
    dfsim2 = dfsim.loc[
        (dfsim.case == case)
        & (dfsim.normalization == enr.normalization)
        & (dfsim.n_degs >= enr.num_min_degs_for_ptw_enr)
    ]
    print(f"case {case} #simulations {len(dfsim2)}")

In [None]:
# All cases together: rows matching the enricher's normalization with more than 2 DEGs.
# NOTE(review): the threshold is the literal 2 here, while the per-case count uses
# enr.num_min_degs_for_ptw_enr - confirm that both are meant to agree.
keep_row = (dfsim.normalization == enr.normalization) & (dfsim.n_degs > 2)
dfsim2 = dfsim.loc[keep_row].copy()
# Renumber the rows 0..n-1.
dfsim2.index = np.arange(len(dfsim2))
print(len(dfsim2))

In [None]:
# Preview at most the first five simulation rows: case, cutoffs, number of DEGs and
# the first nine DEG entries.
# NOTE(review): eval() executes whatever text is stored in the `degs` column. If that
# column only holds list literals, ast.literal_eval would be the safe replacement.
# NOTE(review): `case`, `abs_lfc_cutoff` and `fdr_lfc_cutoff` are notebook-level names and
# keep the values of the last previewed row after this cell has run.
for i in range(min(5, len(dfsim2))):
    row = dfsim2.iloc[i]
    degs = eval(row.degs)

    case, abs_lfc_cutoff, fdr_lfc_cutoff = row.case, row.abs_lfc_cutoff, row.fdr_lfc_cutoff

    print(i, case, abs_lfc_cutoff, fdr_lfc_cutoff, len(degs), degs[:9], '...')

In [None]:
# Show the enricher's abs_lfc_cutoff_inf setting.
enr.abs_lfc_cutoff_inf

In [None]:
# Simulation rows for case 'WNT' at |LFC| cutoff = abs_lfc_cutoff_inf and FDR cutoff = 0.15.
# NOTE(review): case name and FDR value are hard-coded; exact float equality is used.
dfsim[ (dfsim.case == 'WNT') & (dfsim.abs_lfc_cutoff == abs_lfc_cutoff_inf) & (dfsim.fdr_lfc_cutoff == 0.15)]

In [None]:
# Simulation rows for case 'G4' at |LFC| cutoff = abs_lfc_cutoff_inf and FDR cutoff = 0.15.
# NOTE(review): case name and FDR value are hard-coded; exact float equality is used.
dfsim[ (dfsim.case == 'G4') & (dfsim.abs_lfc_cutoff == abs_lfc_cutoff_inf) & (dfsim.fdr_lfc_cutoff == 0.15)]

### Calc all enrichment analyses

In [None]:
# Gene-set library indices to process. Only library 0 is selected here.
# Alternatives used elsewhere in this notebook: [1, 2, 4, 5, 7] and [0, 1, 2, 4, 5, 7].
# The two earlier assignments were dead stores overwritten immediately, so they are
# kept only as this comment.
geneset_num_list=[0]

In [None]:
# Select gene-set library index 0 with verbose output.
enr.set_db(0, verbose=True)

In [None]:
# Display what set_enrichment_name() returns after the library selection.
enr.set_enrichment_name()

In [None]:
# Show the enricher's abs_lfc_cutoff_inf setting.
enr.abs_lfc_cutoff_inf

In [None]:
print(enr.abs_lfc_cutoff_inf)
# Pin the case explicitly. Without this, `case` holds whatever value an earlier loop
# cell left behind, so the result would depend on cell execution order.
case=case_list[0]
# Open the FDR/LFC correlation table for that case at the abs_lfc_cutoff_inf setting.
df_fdr=enr.open_fdr_lfc_correlation(case, enr.abs_lfc_cutoff_inf)

In [None]:
# Reload the simulation table. open_simulation_table() may return None (an earlier cell
# of this notebook guards against it), so fall back to an empty frame before previewing.
dfsim=enr.open_simulation_table()
if dfsim is None:
    dfsim=pd.DataFrame()
dfsim.head(3)

### Calc DEFAULT parameters Enrichment Analysis

In [None]:
# Run the enrichment analysis with the default parameters for six gene-set libraries.
# `force` and `verbose` stay defined at notebook level and are reused by later cells.
force, verbose = False, False
enr.calc_default_enrichment_analysis(
    geneset_num_list=[0, 1, 2, 4, 5, 7],
    force=force,
    verbose=verbose,
)

### Reactome in Enrichr

In [None]:
# FDR/LFC correlation table for the first case, restricted to rows with a non-null 'corr'.
case = case_list[0]
df_fdr = enr.open_fdr_lfc_correlation(case, enr.abs_lfc_cutoff_inf)

df2 = df_fdr.loc[df_fdr['corr'].notna()]
print(len(df2))
df2.head(3)

In [None]:
# Run every enrichment analysis for gene-set library 0.
# `force` comes from the default-parameters cell earlier in this notebook.
verbose=False
geneset_num_list=[0]
# NOTE: this call can take several minutes.
enr.calc_all_enrichment_analysis(geneset_num_list, force=force, verbose=verbose)

In [None]:
# Run every enrichment analysis for the remaining gene-set libraries (1, 2, 4, 5, 7).
# `force` comes from the default-parameters cell earlier in this notebook.
verbose=False
geneset_num_list=[1, 2, 4, 5, 7]
# NOTE: this call can take several minutes.
enr.calc_all_enrichment_analysis(geneset_num_list, force=force, verbose=verbose)

### Sampling Pathways 

In [None]:
# Count the samplings for gene-set library 0 and show the number of rows returned.
dfa=enr.count_sampling(geneset_num_list=[0], prompt_verbose=True)
len(dfa)

In [None]:
# Bar plot of the sampling cutoffs; `dfa` is replaced by the table returned with the figure.
# NOTE(review): fig.show() suits figure objects that render themselves; with a matplotlib
# figure under the inline backend it would not display anything - confirm the type of `fig`.
fig, dfa=enr.barplot_sampling_cutoffs(prompt_verbose=False, verbose=False)
fig.show()

### Other tests

In [None]:
# Optional consistency test, disabled by default through `want_test`.
# For every case and every stored simulation row it re-derives the DEG list for the row's
# cutoffs, compares its length with the stored list, and runs the enrichment on the stored list.
force=False; verbose=False
# NOTE(review): this overrides the value loaded from params.yml for the rest of the session.
num_min_degs_for_ptw_enr=3

# Only the last assignment takes effect; `geneset_num_list` is not used below in this cell.
geneset_num_list=[1, 2, 4, 5, 7]
geneset_num_list=[0, 1, 2, 4, 5, 7]
geneset_num_list=[0]

# Set to True to execute the loop below.
want_test=False

if want_test:
    # Running counter over all processed rows, across cases; starts at -1 so the first row is 0.
    icount=-1
    for case in case_list:
        if not enr.open_case_simple(case):
            print(f"Problems for {case} !!!!")
            continue
        
        # Simulation rows of this case with enough DEGs (overwrites the notebook-level dfsim2).
        dfsim2=dfsim[ (dfsim.normalization == enr.normalization) & (dfsim.case == case) &
                        (dfsim.n_degs >= num_min_degs_for_ptw_enr)]
        
        for i in range(len(dfsim2)):
            icount += 1
            
            row=dfsim2.iloc[i]
    
            # NOTE(review): eval() executes the text stored in the `degs` column; if it only
            # holds list literals, ast.literal_eval would be the safe replacement.
            degs=eval(row.degs)
            # Same value as the outer loop variable, since dfsim2 was filtered on `case`.
            case=row.case
            
            abs_lfc_cutoff=row.abs_lfc_cutoff
            fdr_lfc_cutoff=row.fdr_lfc_cutoff
    
            # Re-derive the DEG list for these cutoffs from the currently opened case.
            degs2, _=enr.list_of_degs_params(abs_lfc_cutoff, fdr_lfc_cutoff, verbose=False)
    
            # Only the lengths are compared, not the contents; mismatching rows are skipped.
            if len(degs) != len(degs2):
                print("Error:", case, abs_lfc_cutoff, fdr_lfc_cutoff, len(degs), len(degs2))
                continue
    
            # if i > 10:break
            enr.calc_EA_dataset_symbol(degs, return_value=True, force=force, verbose=verbose)
            # Progress report whenever the running counter is a multiple of 100.
            if icount%100==0:
                print(case, len(degs), abs_lfc_cutoff, fdr_lfc_cutoff)
                enr.echo_degs()
                print("")
                enr.echo_enriched_pathways()
                print("\n")


### Differences between databases
#### Run only if you have defined the best config: new05 algorithm

In [None]:
# Retrieve the best pathway cutoffs (non-verbose) and display the returned value.
enr.get_best_ptw_cutoff_biopax()

In [None]:
# Work on the first case of the list.
case=case_list[0]

In [None]:
# Cutoffs currently stored on the enricher: LFC, LFC-FDR, pathway p-value, pathway FDR, gene count.
enr.abs_lfc_cutoff , enr.fdr_lfc_cutoff, enr.pathway_pval_cutoff, enr.pathway_fdr_cutoff, enr.num_of_genes_cutoff

In [None]:
# Re-open the first case and show how many DEGs it has.
case=case_list[0]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
len(degs)

In [None]:
# The two names returned by set_enrichment_name() for the opened case.
fname, fname_cutoff=enr.set_enrichment_name()
fname, fname_cutoff 

In [None]:
# Select each gene-set library in turn with verbose output.
# After the loop, `geneset_num` and the enricher are left on the last index (7).
geneset_num_list=[0, 1, 2, 4, 5, 7]
verbose=True

for geneset_num in geneset_num_list:
    enr.set_db(geneset_num, verbose=verbose)

### Reactome, Bioplanet, KEGG

In [None]:
# Full list of gene-set libraries known to the enricher.
enr.dbs_list

In [None]:
# The libraries at the indices used in this notebook (0, 1, 2, 4, 5, 7).
[enr.dbs_list[i] for i in [0, 1, 2, 4, 5, 7]]

In [None]:
# Call set_which_db with a placeholder name ('xxx').
# NOTE(review): presumably shows how an unmatched name is handled - confirm.
enr.set_which_db('xxx')

In [None]:
# Select the library by its full name 'Reactome_2022'.
enr.set_which_db('Reactome_2022')

In [None]:
# Same call with the shorter name 'Reactome' (no year suffix).
enr.set_which_db('Reactome')

In [None]:
# Same call with the lower-case name 'reactome'.
enr.set_which_db('reactome')

In [None]:
# Same call with the name 'KEGG_2021'.
enr.set_which_db('KEGG_2021')

In [None]:
# Same call with the shorter name 'KEGG'.
enr.set_which_db('KEGG')

In [None]:
# Go back to gene-set library index 0 after the name-lookup calls above.
enr.set_db(geneset_num=0)

### Reactome_2022

In [None]:
# Library index 0 (this section's header names it Reactome_2022), first case:
# open the case and echo the parameters in effect.
enr.set_db(0, verbose=True)
case=case_list[0]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
# print("\nEcho Parameters:")
enr.echo_parameters()

In [None]:
# Show the enrichment table; when none is loaded, store an empty frame on the enricher
# so that len() and the display below still work.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()
print(len(enr.df_enr))
enr.df_enr

### Reactome_2022 case G4

In [None]:
# Library index 0 again, now for the second case of case_list.
enr.set_db(0, verbose=True)
case=case_list[1]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
# print("\nEcho Parameters:")
enr.echo_parameters()

In [None]:
# Show the enrichment table for the second case; an empty frame replaces a missing table.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()
print(len(enr.df_enr))
enr.df_enr

### WikiPathway_2021_Human

In [None]:
# Library index 1 (this section's header names it WikiPathway_2021_Human), first case.
enr.set_db(1, verbose=True)
case=case_list[0]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)

In [None]:
# Echo the enriched pathways for the case just opened.
enr.echo_enriched_pathways()

In [None]:
# Size and first 42 rows of the enrichment table; an empty frame replaces a missing table.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()

print(len(enr.df_enr))
enr.df_enr.head(42)

In [None]:
# Last 40 rows of the same table (relies on the None guard in the previous cell).
enr.df_enr.tail(40)

### KEGG

In [None]:
# Library index 2 (this section's header names it KEGG), first case:
# open the case and echo its enriched pathways.
enr.set_db(2, verbose=True)
case=case_list[0]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
enr.echo_enriched_pathways()

In [None]:
# Size and first 45 rows of the enrichment table; an empty frame replaces a missing table.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()

print(len(enr.df_enr))
enr.df_enr.head(45)

In [None]:
# Last 40 rows of the same table (relies on the None guard in the previous cell).
enr.df_enr.tail(40)

In [None]:
# Library index 2 again, now for the second case of case_list.
enr.set_db(2, verbose=True)
case=case_list[1]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
enr.echo_enriched_pathways()

In [None]:
# First 30 rows of the enrichment table; an empty frame replaces a missing table.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()

enr.df_enr.head(30)

In [None]:
# Last 30 rows of the same table (relies on the None guard in the previous cell).
enr.df_enr.tail(30)

### BioPlanet_2019 (geneset_num=4)

In [None]:
# Library index 4 (this section's header names it BioPlanet_2019), first case:
# open the case and echo its enriched pathways.
enr.set_db(4, verbose=True)
case=case_list[0]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
enr.echo_enriched_pathways()

In [None]:
# Size and first 57 rows of the enrichment table; an empty frame replaces a missing table.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()

print(len(enr.df_enr))
enr.df_enr.head(57)

In [None]:
# Last 50 rows of the same table (relies on the None guard in the previous cell).
enr.df_enr.tail(50)

In [None]:
# Library index 4 again, now for the second case of case_list.
enr.set_db(4, verbose=True)
case=case_list[1]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
enr.echo_enriched_pathways()

In [None]:
# Size and first 50 rows of the enrichment table; an empty frame replaces a missing table.
if enr.df_enr is None:
    enr.df_enr=pd.DataFrame()

print(len(enr.df_enr))
enr.df_enr.head(50)

In [None]:
# Last 43 rows of the same table (relies on the None guard in the previous cell).
enr.df_enr.tail(43)