In [None]:
# Print the Python interpreter version so the runtime used for this analysis is recorded.
from platform import python_version
print(python_version())

## Final Summary Results

### Definitions:
  - LFC table has:
    - abs_LFC (the absolute LFC cutoff value)
    - FDR_LFC (the cutoff on the LFC's FDR, i.e. its adjusted p-value)
  - The enriched pathway table has:
    - FDR_pathway cutoff value
   
### Default values for LFC table:
  - abs_LFC = 1
  - FDR_LFC = 0.05
  - therefore, a DEG/DEP has abs(LFC) >= 1 and FDR < 0.05

### Default values for Enriched Pathways:
  - FDR_pathway = 0.05
  - therefore, an enriched pathway has FDR < 0.05 and at least 3 DEGs/DEPs

### Calculating the best cutoffs:
  - We proposed and calculated several indexes to define a new statistic that relaxes (makes more flexible) the LFC and Enriched Pathway cutoffs.
    - Indexes are calculated for each case, each cutoff, and each resulting enriched pathway.
  - To find the possible best LFC/FDR expression and FDR pathway cutoffs:
     - We look for high n_pathways and n_DEGs_in_pathways values, together with a low FDR_LFC and a high abs_LFC.
       - The default FDR_LFC (0.05):
          - It may have fewer DEGs, resulting in fewer enriched pathways.
          - It may have fewer enriched pathways, even having many DEGs/DEPs.
       - Therefore, a trade-off exists between optimizing (abs_LFC and FDR_LFC cutoffs) and (FDR_pathway cutoffs, n_pathways, and n_DEGs_in_pathways.)

### An index measures the trade-off between "LFC" and "Enriched Pathways" cutoffs -> LFC - Enriched Pathway Trade-Off Statistics (LEATOS)

  - We proposed and calculated the following possible indexes:

<p style="font-size: 20px; color: navy;">
$toi1 = \sqrt{(-\log_{10} FDR_{pathway}) \times \frac{n}{N}}$ </p>

<p style="font-size: 20px; color: navy;">
$toi2 = \sqrt{(-\log_{10} FDR_{LFC}) \times (-\log_{10} FDR_{pathway})}$ </p>

<p style="font-size: 20px; color: navy;">
$toi3 = \left((-\log_{10} FDR_{LFC}) \times (-\log_{10} FDR_{pathway}) \times \frac{n}{N}\right)^{1/3}$ </p>

<p style="font-size: 20px; color: red;">
$toi4 = \left(abs\_LFC \times (-\log_{10} FDR_{LFC}) \times (-\log_{10} FDR_{pathway}) \times \frac{n}{N}\right)^{1/4}$ </p>

where,
  - n is the number of DEGs/DEPs found in the pathway
  - N is the total number of annotated DEGs/DEPs in the pathway (this depends on the database; our default database is Reactome 2022)

### Then we searched for the best cutoffs
  - In each 5-percentile bin of the index histogram, we look for the best abs_LFC, FDR_LFC, and FDR_pathway:
  -  We expected that the best cutoff should be in the right tail of the histogram (high index value.)
  -  High index values must have a high number of n_pathways and n DEGs in pathways.

### Testing the best cutoffs (for each case)

  - Is the new set of cutoffs correct, or at least good enough?
  - How to establish that the calculated cutoff is correct?
  - To answer these questions we calculated the chi-square test between the "best cutoff" and the "default"
    - Best cutoff has:
      - n DEGs/DEPs in pathways
      - n DEGs/DEPs not in pathways
    - The Default cutoff may have:
      - n DEGs/DEPs in pathways
      - n DEGs/DEPs not in pathways
      - The DEGs/DEPs can be:
        - greater than or equal to the number of best-cutoff DEGs/DEPs
        - fewer than the number of best-cutoff DEGs/DEPs:
           - in this case, the default DEG/DEP set is complemented with random genes/proteins found in the experiment that are not DEGs/DEPs

#### Chi-square test:

DEGs/DEPs | # in pathway | # not in pathway
--- | --- | --- 
 Best cutoff |     A      |   B  
 Default cutoff |   C | D 

Chi-square p-value:
  - p-value < 0.05 indicates that the two distributions differ; random genes could not reproduce the best-cutoff DEGs/DEPs, so the best cutoff was not found by chance.
  - p-value \>= 0.05 indicates that the two distributions are similar, so the best cutoff could have been reached by chance.

### Chi-square results:
  - Chi-square tests showed that the present results cannot be found randomly (data not presented here)
  - The team is performing the final curation of pathways and DEGs in pathways


In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Make the project-local modules under ../src importable from this notebook.
sys.path.insert(1, '../src/')

# NOTE(review): star-imports hide where names come from. `Config`, `enricheR` and
# `plotly_colors_proteins`, used in later cells, presumably come from these modules — confirm.
from Basic import *
from enricher_lib import *
from config_lib import *

import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
# Inject CSS that sets the notebook's max-width variable to 100%.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Requires PyYAML; if it is missing, install it once with: pip3 install pyyaml
# Read the run parameters from params.yml into `dic_yml`.
with open('params.yml', 'r') as file:
    dic_yml = yaml.safe_load(file)

In [None]:
# Unpack the run parameters loaded from params.yml into notebook-level variables,
# build the project Config and print the LFC and pathway cutoffs for the first case.

# Every key this cell reads. Validating them up front reports ALL missing keys at once
# instead of stopping at the first failed lookup.
_required_keys = [
    'root_chibe', 'root_colab', 'root0', 'project', 's_project', 'email',
    'gene_protein', 's_omics', 'has_age', 'has_gender', 'want_normalized',
    'abs_lfc_cutoff_inf', 's_pathw_enrichm_method', 'num_min_degs_for_ptw_enr',
    'tolerance_pathway_index', 'type_sat_ptw_index', 'saturation_lfc_index',
    'chosen_model_sampling', 'case_list', 'pval_pathway_cutoff',
    'fdr_pathway_cutoff', 'num_of_genes_cutoff', 'run_list',
    'chosen_model_list', 'i_dfp_list',
]
if not isinstance(dic_yml, dict):
    # An empty or malformed params.yml does not yield a mapping.
    raise TypeError(f"params.yml must contain a mapping of parameters, got {type(dic_yml).__name__}")

_missing_keys = [key for key in _required_keys if key not in dic_yml]
if _missing_keys:
    raise KeyError(f"params.yml is missing required keys: {_missing_keys}")

root_chibe = dic_yml['root_chibe']
root_colab = dic_yml['root_colab']
root0 = dic_yml['root0']

project = dic_yml['project']
s_project = dic_yml['s_project']

email = dic_yml['email']

gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

case_list = dic_yml['case_list']

pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# The first case is used below, so fail with an explicit message when there is none.
if not case_list:
    raise IndexError("params.yml: 'case_list' must contain at least one case")

# `exp_normalization` is what gets passed on as the normalization choice (None = none);
# `normalization` is the matching label.
exp_normalization = 'quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders (-1 = not computed yet). `n_degs` is not listed because it is assigned
# by the call right below.
n_genes_annot_ptw, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1, -1, -1, -1

# NOTE(review): the label is hard-coded to 'not_normalized' even when `normalization`
# resolves to 'quantile_norm' — confirm this is intended before changing it.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, 'not_normalized')


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the enrichment object from the parameters unpacked from params.yml.
enr = enricheR(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

case = case_list[0]

# Register the default cutoffs (|LFC| = 1, FDR = 0.05), then open the first case.
enr.cfg.set_default_best_lfc_cutoff(
    normalization,
    abs_lfc_cutoff=1,
    fdr_lfc_cutoff=0.05,
)
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
print("\nEcho Parameters:")
enr.echo_parameters()

geneset_num = enr.geneset_num

### Group 4

In [None]:
# Open the second entry of case_list and print its parameters.
# NOTE(review): assumes case_list has at least two entries — confirm against params.yml.
case = case_list[1]
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
enr.echo_parameters()

In [None]:
# Display the enricher's current case, group, gender and age attributes.
enr.case, enr.group, enr.gender, enr.age

### Reference database

In [None]:
# Display the gene-set index, gene-set library and database list held by the enricher.
enr.geneset_num, enr.geneset_lib, enr.dbs_list

In [None]:
# Select database index 0 with verbose output
# (presumably the first entry of enr.dbs_list — confirm).
enr.set_db(0, verbose=True)

In [None]:
# Columns to plot, one cutoff-simulation histogram figure per column.
cols = [
    'toi1_median', 'toi2_median', 'toi3_median', 'toi4_median',
    'n_pathways', 'n_degs_in_pathways',
    'n_degs_in_pathways_mean', 'n_degs_in_pathways_median',
]

for col in cols:
    fig = enr.plot_cutoff_simulation_histograms(
        col,
        width=1100,
        height=270,
    )
    # The column name is printed above its figure as a caption.
    print(col)
    fig.show()

In [None]:
# Fetch the simulation table and order it by case (ascending), then by
# FDR cutoff and |LFC| cutoff (both descending).
dfsim = enr.open_simulation_table().sort_values(
    by=['case', 'fdr_lfc_cutoff', 'abs_lfc_cutoff'],
    ascending=[True, False, False],
)

dfsim.head(2)

In [None]:
# Display the FDR and LFC value lists held by the enricher
# (presumably the cutoff grids used in the simulation — confirm).
enr.fdr_list, enr.lfc_list

In [None]:
# Colour palette: named colours followed by `plotly_colors_proteins` (presumably provided by
# one of the star-imported project modules — confirm).
# NOTE(review): 'navy', 'orange', 'magenta' and 'darkcyan' each appear twice, and
# 'darkgray'/'darkgrey' are two spellings of one colour, so series may share a colour.
# NOTE(review): no visible cell reads `colors` (the plot calls pass sel_colors=None).
colors=['navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
        'magenta', 'darkturquoise', 'orange', 'darkred', 'indigo', 'magenta', 'maroon', 'black',
        'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'olivedrab', 'navy'] + plotly_colors_proteins


In [None]:
# Display the enricher's current abs_lfc_cutoff_inf setting.
enr.abs_lfc_cutoff_inf

In [None]:
# NOTE(review): this replaces the abs_lfc_cutoff_inf value taken from params.yml with a
# hard-coded 0.4, and the change stays on `enr` for every later cell — confirm it is intended.
enr.abs_lfc_cutoff_inf = 0.4
# Plot built from the toi4_median index; sel_colors=None leaves the colour choice to the method.
fig = enr.plot_degs_vs_lfc_per_fdr_per_case(selected_toi_col='toi4_median', title=None,
                                 width=1100, height=600, plot_all_dfi=False, sel_colors=None,
                                 plot_bgcolor='lightgray', verbose=False)

fig.show()

In [None]:
# Columns shown by the next cell when it displays the best-cutoff table.
# NOTE: `cols` is reassigned by several later cells; the value in effect depends on run order.
cols = ['case', 'med_max_ptw', 'quantile', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff',
        'pathway_pval_cutoff', 'pathway_fdr_cutoff',
        'n_pathways', 'n_degs_in_pathways',
        'toi1_median', 'toi2_median', 'toi3_median']

In [None]:
# Open the best-cutoff table and keep only the rows whose med_max_ptw is 'median'.
dfbest = enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest = dfbest.loc[dfbest['med_max_ptw'] == 'median']
print(len(dfbest))
dfbest[cols]

### Plot abs_LFC x num of DEP/DEGs

In [None]:
verbose=False

# NOTE(review): corr_cutoff=-.75 is an unexplained constant; consider moving it to params.yml.
# force=False presumably reuses a previously stored result — confirm.
df_all_fdr = enr.calc_all_LFC_FDR_cutoffs(corr_cutoff=-.75, force=False, verbose=verbose)
print(len(df_all_fdr))
# The whole table is displayed here (no .head()).
df_all_fdr

### DEGs in pathways x toi4 median

In [None]:
# NOTE(review): this palette is an exact copy of the `colors` list defined in an earlier cell,
# and no visible cell reads it (the plot calls pass sel_colors=None).
colors=['navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
        'magenta', 'darkturquoise', 'orange', 'darkred', 'indigo', 'magenta', 'maroon', 'black',
        'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'olivedrab', 'navy'] + plotly_colors_proteins

# Column subset for this section; toi1/toi2/toi3 medians are deliberately left out (see the
# trailing comment). `cols` is reassigned again before any visible cell reads this value.
cols = ['case', 'toi4_median', 'med_max_ptw',  'quantile', 'quantile_val_inf', 'quantile_val_sup',
        'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff',  'n_pathways',
        'n_degs_in_pathways']  #  'toi1_median', 'toi2_median',  'toi3_median',

In [None]:
# Index column used by the plots below.
selected_toi_col = 'toi4_median'

In [None]:
# Plot for the selected index column; the figure is shown only when the call returns
# something truthy.
fig = enr.plot_degs_in_pathways_vs_toi_per_case(
    selected_toi_col=selected_toi_col,
    title=None,
    plot_all_dfi=False,
    width=1100,
    height=600,
    sel_colors=None,
    plot_bgcolor='lightgray',
    verbose=False,
)

if fig:
    fig.show()

### Comparing: toi4 (or 1,2,3), n_pathways, n_degs_in_pathways

In [None]:
# Columns displayed by the comparison tables in the following cells.
# NOTE: later cells that use `cols` without redefining it also pick up this list.
cols = ['case', 'parameter', 'quantile', 'med_max_ptw', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff',
        'pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways', 
        'toi1_median',  'toi2_median', 'toi3_median']

In [None]:
# Build the all-cutoffs table for 'n_pathways' and show three 'median' rows of the first case.
col = 'n_pathways'

dfcut = enr.build_all_cutoffs_table(col, force=False, verbose=False)
print(len(dfcut))

case = case_list[0]
is_selected = (dfcut['case'] == case) & (dfcut['med_max_ptw'] == 'median')

# NOTE(review): the first three matching rows are taken BEFORE sorting, so these are the
# top three only if build_all_cutoffs_table already returns rows ordered by `col` — confirm.
dfa = dfcut.loc[is_selected, cols].head(3)
dfa = dfa.sort_values(by=['n_pathways', 'n_degs_in_pathways'], ascending=[False, False])
dfa

In [None]:
# Build the all-cutoffs table for 'toi4_median' and show three 'median' rows of the first case.
col = 'toi4_median'

dfcut = enr.build_all_cutoffs_table(col, force=False, verbose=False)
print(len(dfcut))

case = case_list[0]
is_selected = (dfcut['case'] == case) & (dfcut['med_max_ptw'] == 'median')

# NOTE(review): the first three matching rows are taken BEFORE sorting, so these are the
# top three only if build_all_cutoffs_table already returns rows ordered by `col` — confirm.
dfa = dfcut.loc[is_selected, cols].head(3)
dfa = dfa.sort_values(by=col, ascending=False)
dfa

### toi4

In [None]:
# Options for selected_toi_col: 'toi1_median', 'toi2_median', 'toi3_median', 'toi4_median'
selected_toi_col = 'toi4_median'

fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    selected_toi_col,
    width=1100,
    height=700,
)
# The first two returned figures are shown, each under a printed caption.
fig0, fig1 = fig_list[0], fig_list[1]

print(">>>", selected_toi_col)
print(f"# {enr.s_deg_dap}s")
fig0.show()
print("# n pathways")
fig1.show()

In [None]:
# dfbest.columns

### Selected best cutoffs per case

In [None]:
# Re-open the best-cutoff table and keep only the rows whose med_max_ptw is 'median'.
dfbest = enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest = dfbest[dfbest.med_max_ptw == 'median']
print(len(dfbest))

# NOTE(review): `cols` is whatever an earlier cell last assigned. On a top-to-bottom run that
# is the comparison list, which includes 'parameter' — confirm dfbest has that column.
dfbest[cols]

### n_degs_in_pathways_median

In [None]:
# Same pair of frequency figures, this time driven by 'n_degs_in_pathways_median'.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    'n_degs_in_pathways_median',
    width=1100,
    height=700,
)
fig0, fig1 = fig_list[0], fig_list[1]

fig0.show()
print("")
fig1.show()

### n_pathways

In [None]:
# Same pair of frequency figures, this time driven by 'n_pathways'.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    'n_pathways',
    width=1100,
    height=700,
)
fig0, fig1 = fig_list[0], fig_list[1]

fig0.show()
print("")
fig1.show()

## Why is toi4_median the best approach?

#### balance between best LFC cutoffs and Pathway cutoffs

In [None]:
# Columns shown for the best-cutoff parameter tables in this cell and the next one.
cols2 = ['case', 'med_max_ptw', 'quantile', 'toi4_median', 'toi4_mean', 'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff', 
        'n_pathways', 'n_degs_in_pathways', 'n_degs_in_pathways_mean',
        'n_degs_in_pathways_median', 'n_degs_in_pathways_std',
        'toi1_median', 'toi2_median', 'toi3_median']

# Best-cutoff parameters by toi4_median with n_best_sample=1. save_config=False presumably
# keeps the stored configuration untouched — confirm. Only the 'median' rows are kept.
dfconfig = enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=1, save_config=False, verbose=True)
dfconfig = dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
# Same call as the previous cell but with n_best_sample=2.
# Relies on `cols2` defined in the previous cell.
dfconfig = enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=2, save_config=False, verbose=True)
dfconfig = dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

## It minimizes fdr and maximizes abs_lfc!!!

### Summary DEPs + Up and Down

In [None]:
# Up/down summary table with per_biotype off and ensembl off.
per_biotype = False
ensembl = False

dfa = enr.summary_degs_up_down(
    per_biotype=per_biotype,
    ensembl=ensembl,
    verbose=False,
)
print(len(dfa))
dfa

In [None]:
# Up/down summary table with per_biotype on and ensembl off.
per_biotype = True
ensembl = False

dfa = enr.summary_degs_up_down(
    per_biotype=per_biotype,
    ensembl=ensembl,
    verbose=False,
)
print(len(dfa))
dfa

In [None]:
# Up/down summary table with per_biotype on and ensembl on.
per_biotype = True
ensembl = True

dfa = enr.summary_degs_up_down(
    per_biotype=per_biotype,
    ensembl=ensembl,
    verbose=False,
)
print(len(dfa))
dfa

In [None]:
# Bar plot from barplot_up_down_genes_per_case with per_biotype off and ensembl off.
# before_best_cutoff=False presumably means the best cutoffs are applied — confirm.
before_best_cutoff = False

per_biotype=False
ensembl=False

# The call returns the figure and a table; only the figure is shown here.
fig, dfa = enr.barplot_up_down_genes_per_case(per_biotype=per_biotype, ensembl=ensembl, before_best_cutoff=before_best_cutoff,
                                              width=1100, height=700, verbose=False)
fig.show()

In [None]:
# Bar plot from barplot_up_down_genes_per_case with per_biotype on and ensembl off.
# before_best_cutoff=False presumably means the best cutoffs are applied — confirm.
before_best_cutoff = False

per_biotype=True
ensembl=False

# The call returns the figure and a table; only the figure is shown here.
fig, dfa = enr.barplot_up_down_genes_per_case(per_biotype=per_biotype, ensembl=ensembl, before_best_cutoff=before_best_cutoff,
                                              width=1100, height=700, verbose=False)
fig.show()

In [None]:
# Bar plot from barplot_up_down_genes_per_case with per_biotype on and ensembl on.
# before_best_cutoff=False presumably means the best cutoffs are applied — confirm.
before_best_cutoff = False

per_biotype=True
ensembl=True

# The call returns the figure and a table; only the figure is shown here.
fig, dfa = enr.barplot_up_down_genes_per_case(per_biotype=per_biotype, ensembl=ensembl, before_best_cutoff=before_best_cutoff,
                                              width=1100, height=700, verbose=False)
fig.show()

In [None]:
verbose=False

# NOTE(review): `case` is not set in this cell; it keeps whatever an earlier cell left
# (case_list[0] on a top-to-bottom run) — consider assigning it here explicitly.
# force=False presumably reuses a previously stored result — confirm.
dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=verbose)
print(len(dfi))
dfi.head(3)

### Running all cases

In [None]:
# Open every case in turn, print its parameters and collect the DEGs/DEPs that are
# in pathways and not in pathways into one de-duplicated list.
enr.cfg.open_best_ptw_cutoff()

enr.echo_default()
print("")

all_degs = []
for case in case_list:
    print(">>>", case)
    ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)

    if ret:
        enr.echo_parameters(want_echo_default=False, jump_line=True)
        print("")
        all_degs += enr.degs_in_pathways + enr.degs_not_in_pathways
        print("")
    else:
        # The case could not be opened: report it and move on to the next one.
        print(f"\nError?? case {case}")
        enr.echo_degs()
        print("")

# np.unique returns the sorted unique symbols.
all_degs = np.unique(all_degs)
print(f"There are {len(all_degs)} {enr.s_gene_protein}s in all cases ")
print("\nall degs:", "; ".join(all_degs))
print("\n\n")