In [None]:
# Record which Python interpreter executed this notebook.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

## Super-summary: chosen cutoffs

### Definitions:
  - LFC table has:
    - abs_LFC (the absolute LFC cutoff value)
    - FDR_LFC, its FDR or p-value adjusted
  - The enriched pathway table has:
    - FDR_pathway cutoff value
   
### Default values for LFC table:
  - abs_LFC = 1
  - FDR_LFC = 0.05
  - therefore, a DEG/DEP is abs(LFC) >= 1 and FDR < 0.05

### Default values for Enriched Pathways:
  - FDR_pathway = 0.05
  - therefore, an enriched pathway has FDR < 0.05 and at least 3 DEGs/DEPs

### Calculating the best cutoffs:
  - We proposed and calculated several indexes to define a new statistic that relaxes (makes more flexible) the LFC and Enriched Pathway cutoffs.
    - Indexes are calculated for each case, each cutoff, and each resulting enriched pathway.
  - To find the possible best LFC/FDR expression and FDR pathway cutoffs:
     - We look for a high number in n_pathway and n_DEGs_in_pathway, having a low FDR_LFC and a high absLFC.
       - The default FDR_LFC (0.05):
          - It may have fewer DEGs, resulting in fewer enriched pathways.
          - It may have fewer enriched pathways, even having many DEGs/DEPs.
       - Therefore, a trade-off exists between optimizing (abs_LFC and FDR_LFC cutoffs) and (FDR_pathway cutoffs, n_pathways, and n_DEGs_in_pathways.)

### An index measures the trade-off between "LFC" and "Enriched Pathways" cutoffs -> LFC - Enriched Pathway Trade-Off Statistics (LEATOS)

  - We proposed and calculated the following possible indexes:

<p style="font-size: 20px; color: lightgreen;">
$index1 = \sqrt{(-\log_{10} FDR_{pathway}) \cdot \frac{n}{N}}$ </p>

<p style="font-size: 20px; color: cyan;">
$index2 = \sqrt{(-\log_{10} FDR_{LFC}) \cdot (-\log_{10} FDR_{pathway})}$ </p>

<p style="font-size: 20px; color: orange;">
$index3 = \left((-\log_{10} FDR_{LFC}) \cdot (-\log_{10} FDR_{pathway}) \cdot \frac{n}{N}\right)^{1/3}$ </p>

<p style="font-size: 20px; color: pink;">
$index4 = \left(abs\_LFC \cdot (-\log_{10} FDR_{LFC}) \cdot (-\log_{10} FDR_{pathway}) \cdot \frac{n}{N}\right)^{1/4}$ </p>

where,
  - n is the number of DEGs/DEPs found in the pathway
  - N is the total number of annotated DEGs/DEPs in the pathway (depends on the database; our default database is Reactome 2022)

### Then we searched for the best cutoffs
  - In each 5 percentile of the index histogram, we look for the best abs_LFC, FDR_LFC, FDR_pathway:
  -  We expected the best cutoff to lie in the right tail of the histogram (high index values).
  -  High index values must have a high number of n_pathways and n DEGs in pathways.

### Testing the best cutoffs (for each case)

  - Is the new set of cutoffs correct? good enough?
  - How to establish that the calculated cutoff is correct?
  - To answer these questions we calculated the chi-square test between the "best cutoff" and the "default"
    - Best cutoff has:
      - n DEGs/DEPs in pathways
      - n DEGs/DEPs not in pathways
    - The Default cutoff may have:
      - n DEGs/DEPs in pathways
      - n DEGs/DEPs not in pathways
      - The DEGs/DEPs can be:
        - greater than or equal to the number of best-cutoff DEGs/DEPs
        - fewer than the number of best-cutoff DEGs/DEPs:
           - in this case, the number of DEGs/DEPs is complemented with random genes/proteins that are not DEGs/DEPs (found in the experiment)

#### Chi-square test:

DEGs/DEPs | # in pathway | # not in pathway
--- | --- | --- 
 Best cutoff |     A      |   B  
 Default cutoff |   C | D 

Chi-square p-value:
  - p-value < 0.05 denotes that both distributions are not similar; therefore, random genes could not reach the best cutoff DEGs/DEPs; in conclusion, the best cutoff was not found randomly.
  - p-value \>= 0.05 denotes that both distributions are similar, and the best cutoff can be achieved randomly.

In [None]:
import json, requests
import os, sys
import pandas as pd

sys.path.insert(1, '../src/')

from Basic import *
from enricher_lib import *
from biopax_lib import *
from config_lib import *
from stat_lib import *

# Notebook display settings: float precision for DataFrames and a
# full-width notebook container (injected as a CSS rule).
from IPython.display import display, HTML

pd.options.display.precision = 3

full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: paths, project identifiers, cutoffs and the
# project Config object. Everything a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# Data locations, relative to the notebook directory.
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0       = '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/'

project = 'Taubate COVID-19'
s_project = 'taubate_covid19'

gene_protein = 'protein'
s_omics = 'proteomics'

has_age = True
has_gender = True

# exp_normalization is None when the data are used without normalization;
# `normalization` is the matching string label passed to the Config helpers.
want_normalized = False
exp_normalization = 'quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr = 3
tolerance_gene_reg_index = 0.15

# Thresholds for the "modulated" set (names as used by the project libraries).
abs_lfc_modulated = 0.15
fdr_lfc_modulated = .4

# NOTE(review): this cell previously also assigned
#   abs_lfc_cutoff=0.20; fdr_lfc_cutoff=.3
#   pval_pathway_cutoff=0.05; fdr_pathway_cutoff=0.1; num_of_genes_cutoff=2
# (and, in an older commented-out revision, log2(1.5)/.58 and log2(1.4)/.7).
# Every one of those names was overwritten further down before being read,
# so the dead assignments were removed; the effective values are set below.

case_list = ['g2a_male', 'g2a_female',
             'g2b_male', 'g2b_female',
             'g3_male_adult',   'g3_male_elder',
             'g3_female_adult', 'g3_female_elder']

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders initialised to -1 (presumably "not computed yet" - confirm
# against the cells that fill them in).
n_genes_annot_ptw, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1, -1, -1, -1

# LFC/FDR cutoffs and DEG/DEP counts stored for this case. Uses the
# `normalization` label so the lookup follows `want_normalized` instead of a
# hard-coded 'not_normalized' string (identical value with the current flags).
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# Effective pathway cutoffs used by the rest of the notebook.
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Candidate pathways, written as "<name> - <Reactome id>".
# The cell used to reassign `pathway_name_id` eight times in a row, which made
# only the last value effective; the candidates are now listed once and the
# active one is selected explicitly (still the last entry).
pathway_name_id_options = [
    'Hemostasis - R-HSA-109582',
    'Regulation Of IGF Transport And Uptake By IGFBPs - R-HSA-381426',
    'Platelet degranulate - R-HSA-114608',
    'Platelet Activation, Signaling And Aggregation - R-HSA-76002',
    'Integrin Cell Surface Interactions - R-HSA-216083',
    'Neutrophil Degranulation - R-HSA-6798695',
    'Regulation of Complement cascade - R-HSA-977606',
    'Response To Elevated Platelet Cytosolic Ca2+ - R-HSA-76005',
]
pathway_name_id = pathway_name_id_options[-1]

# Main analysis object, built from the values defined in the configuration cell.
enr = enricheR(gene_protein, s_omics, project, s_project, root0,
               case_list, has_age, has_gender, clone_objects=False,
               exp_normalization=exp_normalization, geneset_num=0,
               num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
               tolerance_gene_reg_index=tolerance_gene_reg_index,
               s_pathw_enrichm_method=s_pathw_enrichm_method)

case = case_list[0]

# Register the default cutoffs (abs_LFC = 1, FDR = 0.05) before opening the case.
enr.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
# NOTE(review): `ret` is not checked here; presumably a success flag - confirm.
ret, degs, dfdegs = enr.open_case(case, verbose=False)

# print("\nEcho Parameters:")
# enr.echo_parameters()

geneset_num = enr.geneset_num

### Distribution related to possible indexes

In [None]:
# Simulation columns to inspect: the four index medians followed by the
# pathway / DEG-in-pathway counts. One histogram figure is shown per column.
index_cols = [f'index{i}_median' for i in range(1, 5)]
index_cols += ['n_pathways', 'n_degs_in_pathways',
               'n_degs_in_pathways_mean', 'n_degs_in_pathways_median']

histogram_size = dict(width=1100, height=270)

for col in index_cols:
    fig = enr.plot_cutoff_simulation_histograms(col, **histogram_size)
    print(col)
    fig.show()

In [None]:
# Load the cutoff simulation table and order it by case (ascending), then by
# FDR and abs_LFC cutoffs (both descending).
dfsim = (
    enr.open_simulation_table()
       .sort_values(by=['case', 'fdr_lfc_cutoff', 'abs_lfc_cutoff'],
                    ascending=[True, False, False])
)

dfsim.head(2)

In [None]:
# Colour palette for plots: a hand-picked list of CSS colour names followed by
# the project palette `plotly_colors_proteins`.
# The hand-picked part used to repeat 'orange', 'magenta', 'darkcyan' and
# 'navy', and listed both 'darkgray' and 'darkgrey' (the same CSS colour), so
# two different series could be drawn in an identical colour. Each colour now
# appears once, in its original first-occurrence order.
colors = [
    'navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
    'magenta', 'darkturquoise', 'darkred', 'indigo', 'maroon', 'black',
    'darkblue', 'darkgoldenrod', 'darkgray', 'olivedrab',
] + plotly_colors_proteins


### First, per case plot
#### DEPs 'in pathway' x index 4

In [None]:
# Per-case figure built by enr.plot_degs_in_pathways_vs_index_per_case,
# using index4_median as the selected index.
fig = enr.plot_degs_in_pathways_vs_index_per_case(
    selected_index_col='index4_median',
    title=None,
    width=1100,
    height=600,
    plot_all_dfi=False,
    sel_colors=None,
    plot_bgcolor='lightgray',
    verbose=False,
)

fig.show()

### Next, per case plot
#### ALL DEPs x abs_LFC
#### Comments:

  - each FDR selects a subset of DEG/DEPs
    - abs_LFC may or may not filter them
    - we define that only 'curves' that change DEG/DEPs varying the abs_LFC are valid
    - we observed that most of the curves saturate at 0.4 abs_LFC
  - we defined the range of FDR from 0.05 to 0.75 (here 0.55)
    - if we had defined:
      - min abs_LFC = 0 and max_FDR = 1, the upper-left corner would show the total number of proteins (or genes)

In [None]:
# Per-case figure built by enr.plot_degs_vs_lfc_per_fdr_per_case,
# using index4_median as the selected index.
fig = enr.plot_degs_vs_lfc_per_fdr_per_case(
    selected_index_col='index4_median',
    title=None,
    width=1100,
    height=600,
    plot_all_dfi=False,
    sel_colors=None,
    plot_bgcolor='lightgray',
    verbose=False,
)

fig.show()

In [None]:
# Columns shown when displaying the best-cutoff tables further down:
# identification, index4, the three cutoffs, the counts, then indexes 1-3.
cols = [
    'case', 'parameter', 'quantile', 'med_max_ptw',
    'index4_median',
    'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff',
    'n_pathways', 'n_degs_in_pathways',
] + [f'index{i}_median' for i in (1, 2, 3)]

### index4

In [None]:
# Index column driving the plots below; the available choices are
# 'index1_median', 'index2_median', 'index3_median' and 'index4_median'.
selected_index_col = 'index4_median'

fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    selected_index_col,
    width=1100,
    height=700,
)

# The first figure is shown under the DEG/DEP heading, the second under
# the "n pathways" heading.
fig0, fig1 = fig_list[0], fig_list[1]

print(">>>", selected_index_col)
print(f"# {enr.s_deg_dep}s")
fig0.show()
print("# n pathways")
fig1.show()

In [None]:
# dfbest.columns

### Selected best cutoffs per case

In [None]:
# Open the stored best-cutoff table and keep only the rows whose
# `med_max_ptw` value is 'median'.
dfbest_all = enr.cfg.open_best_ptw_cutoff(verbose=False)
is_median_row = dfbest_all['med_max_ptw'] == 'median'
dfbest = dfbest_all[is_median_row]
print(len(dfbest))

dfbest[cols]

## Why is index4_median the best approach?

#### balance between best LFC cutoffs and Pathway cutoffs

### First using 'n_pathways' as index
### Next using 'index4_median' as index
#### Any index can be used to compare to index4_median

In [None]:
# Number of best samples requested from calc_best_cutoffs_params.
n_best_samples_chosen = 4

# Best-cutoff parameters using 'n_pathways' as the selection index;
# only the 'median' rows are displayed.
dfconfig = enr.calc_best_cutoffs_params(
    selected_index_col='n_pathways',
    n_best_samples=n_best_samples_chosen,
    force=False,
    verbose=False,
)
dfconfig = dfconfig.loc[dfconfig['med_max_ptw'] == 'median']
dfconfig[cols]

In [None]:
# Same call as for 'n_pathways', now using 'index4_median' as the selection
# index; only the 'median' rows are displayed.
dfconfig = enr.calc_best_cutoffs_params(
    selected_index_col='index4_median',
    n_best_samples=n_best_samples_chosen,
    force=False,
    verbose=False,
)
dfconfig = dfconfig.loc[dfconfig['med_max_ptw'] == 'median']
dfconfig[cols]

## It minimizes fdr and maximizes abs_lfc!!!

### Summary DEPs + Up and Down

In [None]:
# Flags forwarded to enr.summary_degs_up_down.
force = False
save_file = False
prompt_verbose = False

# Summary table (one row count printed, then the table displayed).
dfa = enr.summary_degs_up_down(
    geneset_num=enr.geneset_num,
    force=force,
    save_file=save_file,
    prompt_verbose=prompt_verbose,
    verbose=False,
)
print(len(dfa))
dfa

In [None]:
# Bar plot produced by enr.barplot_up_down_genes_per_case; the call also
# returns a table, which is stored in `dfa`.
title = f'Up and Down {enr.s_deg_dep}s with the best cutoff'
fig, dfa = enr.barplot_up_down_genes_per_case(
    title=title,
    width=1100,
    height=700,
    verbose=False,
)
fig.show()