In [None]:
from platform import python_version

# Record the interpreter version used to produce the results in this notebook.
print(f"{python_version()}")

## Final Summary Results

### Definitions:
  - LFC table has:
    - abs_LFC (the absolute LFC cutoff value)
    - FDR_LFC (the FDR, i.e., adjusted p-value, cutoff for the LFC)
  - The enriched pathway table has:
    - FDR_pathway cutoff value
   
### Default values for LFC table:
  - abs_LFC = 1
  - FDR_LFC = 0.05
  - therefore, a DEG/DEP has abs(LFC) >= 1 and FDR < 0.05

### Default values for Enriched Pathways:
  - FDR_pathway = 0.05
  - therefore, an enriched pathway has FDR < 0.05 and at least 3 DEGs/DEPs

### Calculating the best cutoffs:
  - We proposed and calculated several toies to define a new statistic that makes the LFC and Enriched Pathway cutoffs more flexible.
    - toies are calculated for each case, each cutoff, and each resulting enriched pathway.
  - To find the possible best LFC/FDR expression and FDR pathway cutoffs:
     - We look for high values of n_pathways and n_DEGs_in_pathways, combined with a low FDR_LFC and a high abs_LFC.
       - The default FDR_LFC (0.05):
          - It may have fewer DEGs, resulting in fewer enriched pathways.
          - It may have fewer enriched pathways, even having many DEGs/DEPs.
       - Therefore, a trade-off exists between optimizing (abs_LFC and FDR_LFC cutoffs) and (FDR_pathway cutoffs, n_pathways, and n_DEGs_in_pathways.)

### A toi measures the trade-off between the "LFC" and "Enriched Pathways" cutoffs -> LFC - Enriched Pathway Trade-Off Statistics (LEATOS)

  - We proposed and calculated the following possible toies:

<p style="font-size: 20px; color: yellow;">
$toi1 = \sqrt{-\log_{10}(FDR_{pathway}) \cdot \frac{n}{N}}$ </p>

<p style="font-size: 20px; color: cyan;">
$toi2 = \sqrt{-\log_{10}(FDR_{LFC}) \cdot -\log_{10}(FDR_{pathway})}$ </p>

<p style="font-size: 20px; color: orange;">
$toi3 = \left(-\log_{10}(FDR_{LFC}) \cdot -\log_{10}(FDR_{pathway}) \cdot \frac{n}{N}\right)^{1/3}$ </p>

<p style="font-size: 20px; color: pink;">
$toi4 = \left(abs\_LFC \cdot -\log_{10}(FDR_{LFC}) \cdot -\log_{10}(FDR_{pathway}) \cdot \frac{n}{N}\right)^{1/4}$ </p>

where,
  - n is the number of DEGs/DEPs found in the pathway
  - N is the total number of annotated DEGs/DEPs in the pathway (it depends on the database; our default database is Reactome 2022)

### Then we searched for the best cutoffs
  - In each 5-percentile interval of the toi histogram, we look for the best abs_LFC, FDR_LFC, and FDR_pathway.
  -  We expected that the best cutoff should be in the right tail of the histogram (high toi value.)
  -  High toi values must have a high number of n_pathways and n DEGs in pathways.

### Testing the best cutoffs (for each case)

  - Is the new set of cutoffs correct? Is it good enough?
  - How to establish that the calculated cutoff is correct?
  - To answer these questions we calculated the chi-square test between the "best cutoff" and the "default"
    - Best cutoff has:
      - n DEGs/DEPs in pathways
      - n DEGs/DEPs not in pathways
    - The Default cutoff may have:
      - n DEGs/DEPs in pathways
      - n DEGs/DEPs not in pathways
      - The DEGs/DEPs can be:
        - greater than or equal to the number of best-cutoff DEGs/DEPs
        - fewer than the number of best-cutoff DEGs/DEPs:
           - in this case, the default set is complemented with random genes found in the experiment that are not DEGs/DEPs, until it matches the number of best-cutoff DEGs/DEPs

#### Chi-square test:

DEGs/DEPs | # in pathway | # not in pathway
--- | --- | --- 
 Best cutoff |     A      |   B  
 Default cutoff |   C | D 

Chi-square p-value:
  - p-value < 0.05 denotes that both distributions are not similar; therefore, random genes could not reach the best cutoff DEGs/DEPs; in conclusion, the best cutoff was not found randomly.
  - p-value \>= 0.05 denotes that both distributions are similar, and the best cutoff can be achieved randomly.

### Chi-square results:
  - Chi-square tests showed that the present results cannot be found randomly (data not presented here)
  - The team is performing the final curation of pathways and DEGs in pathways


In [None]:
import json, requests
import os, sys
import pandas as pd

sys.path.insert(1, '../src/')

from Basic import *
from enricher_lib import *
from biopax_lib import *
from config_lib import *
from stat_lib import *

from IPython.display import display, HTML

# Show floating-point values in DataFrames with three decimals.
pd.set_option("display.precision", 3)

# Inject a CSS rule so the notebook container uses the full page width.
display(
    HTML("<style>.container { width:100% !important; }</style>")
)

In [None]:
# ---- Project locations (relative paths) ----
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0       = '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/'

# ---- Project identification ----
project = 'Taubate COVID-19'
s_project = 'taubate_covid19'

gene_protein = 'protein'
s_omics = 'proteomics'

has_age = True
has_gender = True

# ---- Normalization: `normalization` is the label derived from `want_normalized` ----
want_normalized = False
exp_normalization='quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

# ---- Enrichment settings passed to the enricher ----
abs_lfc_cutoff_inf = 0.40
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr=3

#------------ pathway pseudo-modulation toi ------------
tolerance_pathway_toi = 0.15
type_sat_ptw_toi = 'linear_sat'
saturation_lfc_toi = 5

case_list = ['g2a_male', 'g2a_female', 
             'g2b_male', 'g2b_female', 
             'g3_male_adult',   'g3_male_elder',
             'g3_female_adult', 'g3_female_elder']

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders (-1) for counters; n_degs is overwritten by the call just below.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1
# Use the derived `normalization` label rather than a hard-coded 'not_normalized',
# so this lookup follows `want_normalized` (identical while want_normalized is False).
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# ---- Pathway-level cutoffs (only echoed in this cell) ----
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Candidate pathways of interest. Only the last entry is kept in
# `pathway_name_id`, which matches reassigning the variable line after line.
pathway_name_id = [
    'Hemostasis - R-HSA-109582',
    'Regulation Of IGF Transport And Uptake By IGFBPs - R-HSA-381426',
    'Platelet degranulate - R-HSA-114608',
    'Platelet Activation, Signaling And Aggregation - R-HSA-76002',
    'Integrin Cell Surface Interactions - R-HSA-216083',
    'Neutrophil Degranulation - R-HSA-6798695',
    'Regulation of Complement cascade - R-HSA-977606',
    'Response To Elevated Platelet Cytosolic Ca2+ - R-HSA-76005',
][-1]

# Build the enricher from the configuration values defined in the previous cell.
enr = enricheR(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_toi=tolerance_pathway_toi,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_toi=type_sat_ptw_toi,
    saturation_lfc_toi=saturation_lfc_toi,
)

case = case_list[0]

# Register the default cutoffs (abs_LFC = 1, FDR_LFC = 0.05), then open the first case.
enr.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)

print("\nEcho Parameters:")
enr.echo_parameters()

geneset_num = enr.geneset_num

### Open another case: g2a_female

In [None]:
# Switch to the second entry of case_list and echo the parameters in effect for it.
case = case_list[1]
(ret, degs, degs_ensembl, dfdegs) = enr.open_case(
    case,
    verbose=False,
)
enr.echo_parameters()

In [None]:
# Show the case, group, gender and age attributes currently held by `enr`.
enr.case, enr.group, enr.gender, enr.age

### Reference database

In [None]:
# Show enr's geneset_num, geneset_lib and dbs_list attributes
# (presumably the reference gene-set database settings — TODO confirm).
enr.geneset_num, enr.geneset_lib, enr.dbs_list

In [None]:
# Select database 0 with verbose output.
# NOTE(review): the meaning of the index is defined in enricher_lib — verify it matches geneset_num=0.
enr.set_db(0, verbose=True)

In [None]:
# Simulation-table columns to inspect: the four toi medians followed by the
# pathway and DEG counts. One histogram figure is shown per column.
cols = (
    [f'toi{k}_median' for k in (1, 2, 3, 4)]
    + ['n_pathways', 'n_degs_in_pathways']
    + ['n_degs_in_pathways_mean', 'n_degs_in_pathways_median']
)

for col in cols:
    fig = enr.plot_cutoff_simulation_histograms(
        col,
        width=1100,
        height=270,
    )
    print(col)
    fig.show()

In [None]:
# Load the simulation table and order it by case (ascending), then by
# FDR_LFC cutoff and abs_LFC cutoff (both descending).
dfsim = enr.open_simulation_table().sort_values(
    by=['case', 'fdr_lfc_cutoff', 'abs_lfc_cutoff'],
    ascending=[True, False, False],
)

dfsim.head(2)

In [None]:
# Show enr.fdr_list and enr.lfc_list
# (presumably the FDR and LFC cutoff values scanned by the simulation — TODO confirm).
enr.fdr_list, enr.lfc_list

In [None]:
# Color palette: a fixed list of named colors followed by `plotly_colors_proteins`
# (defined outside this notebook cell).
# NOTE(review): 'navy', 'darkcyan', 'orange' and 'magenta' appear twice, and
# 'darkgray'/'darkgrey' are spellings of the same color — traces using them will
# be indistinguishable; confirm whether this is intended.
colors=['navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
        'magenta', 'darkturquoise', 'orange', 'darkred', 'indigo', 'magenta', 'maroon', 'black',
        'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'olivedrab', 'navy'] + plotly_colors_proteins


In [None]:
# Show the current value of enr.abs_lfc_cutoff_inf before it is temporarily changed.
enr.abs_lfc_cutoff_inf

### All DEPs x LFC

In [None]:
# Temporarily set the lower abs_LFC bound to 0 for this plot, then restore the
# value that was in effect before (instead of a hard-coded 0.4). The try/finally
# guarantees the restore even when plotting raises, so later cells never run
# with the overridden bound.
abs_lfc_cutoff_inf_saved = enr.abs_lfc_cutoff_inf
enr.abs_lfc_cutoff_inf = 0.0

try:
    fig = enr.plot_degs_vs_lfc_per_fdr_per_case(selected_toi_col='toi4_median', title=None,
                                     width=1100, height=600, plot_all_dfi=False, sel_colors=None,
                                     plot_bgcolor='lightgray', verbose=False)

    fig.show()
finally:
    enr.abs_lfc_cutoff_inf = abs_lfc_cutoff_inf_saved

### Plot abs_LFC x num of DEP/DEGs

In [None]:
# `verbose` is also read by the next plotting cell.
verbose = False

# Compute (or load, since force=False) the table of all LFC/FDR cutoffs and preview it.
df_all_fdr = enr.calc_all_LFC_FDR_cutoffs(
    corr_cutoff=-0.75,
    force=False,
    verbose=verbose,
)
print(len(df_all_fdr))
df_all_fdr.head(3)

In [None]:
# The plotting call returns a two-level dictionary of figures; show every
# figure, preceded by its pair of keys.
dic_fig = enr.plot_all_LFC_FDR_cutoffs(
    width=1100,
    height=450,
    title=None,
    verbose=verbose,
)

for outer_key in dic_fig:
    for inner_key, fig in dic_fig[outer_key].items():
        print(outer_key, inner_key)
        fig.show()

### DEGs in pathways x toi4 median

In [None]:
# DEGs in pathways against the toi4 median, per case.
fig = enr.plot_degs_in_pathways_vs_toi_per_case(
    selected_toi_col='toi4_median',
    title=None,
    width=1100,
    height=600,
    plot_all_dfi=False,
    sel_colors=None,
    plot_bgcolor='lightgray',
    verbose=False,
)

fig.show()

### Comparing: toi4 (or 1,2,3), n_pathways, n_degs_in_pathways

In [None]:
# Columns displayed in the cutoff tables of the following cells: row
# identification, toi4 with the selected cutoffs, the pathway/DEG counts,
# and finally the other three toi medians.
cols = (
    ['case', 'parameter', 'quantile', 'med_max_ptw']
    + ['toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff']
    + ['n_pathways', 'n_degs_in_pathways']
    + [f'toi{k}_median' for k in (1, 2, 3)]
)

In [None]:
# Cutoff table built for 'n_pathways'; preview the 'median' rows of the first case.
col = 'n_pathways'

dfcut = enr.build_all_cutoffs_table(col, force=False, verbose=False)
print(len(dfcut))

case = case_list[0]
is_case_median = (dfcut.case == case) & (dfcut.med_max_ptw == 'median')

# NOTE(review): head(3) is applied BEFORE sorting, so only the first three rows
# in table order are ranked — confirm this is intended rather than a "top 3".
dfa = (
    dfcut[is_case_median][cols]
    .head(3)
    .sort_values(['n_pathways', 'n_degs_in_pathways'], ascending=[False, False])
)
dfa

In [None]:
# Cutoff table built for 'toi4_median'; preview the 'median' rows of the first case.
col = 'toi4_median'

dfcut = enr.build_all_cutoffs_table(col, force=False, verbose=False)
print(len(dfcut))

case = case_list[0]
is_case_median = (dfcut.case == case) & (dfcut.med_max_ptw == 'median')

# NOTE(review): head(3) is applied BEFORE sorting, so only the first three rows
# in table order are ranked — confirm this is intended rather than a "top 3".
dfa = (
    dfcut[is_case_median][cols]
    .head(3)
    .sort_values(col, ascending=False)
)
dfa

### toi4

In [None]:
# selected_toi_col  ['toi1_median', 'toi2_median', 'toi3_median', 'toi4_median' ]

selected_toi_col = 'toi4_median'

# The call returns a list of figures; the first two are shown below.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(selected_toi_col,  width=1100, height=700)

fig0 = fig_list[0]
fig1 = fig_list[1]
print(">>>", selected_toi_col)
# Use `s_deg_dep`, the attribute every other cell of this notebook reads;
# the previous `s_deg_dap` spelling appears nowhere else.
print(f"# {enr.s_deg_dep}s")
fig0.show()
print("# n pathways")
fig1.show()

### Selected best cutoffs per case

In [None]:
# Load the selected best cutoffs and keep only the rows flagged 'median'.
dfbest = enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest = dfbest.loc[dfbest['med_max_ptw'] == 'median']
print(len(dfbest))
dfbest[cols]

### n_degs_in_pathways_median

In [None]:
# Same pair of frequency figures, this time selecting by 'n_degs_in_pathways_median'.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    'n_degs_in_pathways_median',
    width=1100,
    height=700,
)

fig0, fig1 = fig_list[0], fig_list[1]
fig0.show()
print("")
fig1.show()

### n_pathways

In [None]:
# Same pair of frequency figures, this time selecting by 'n_pathways'.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    'n_pathways',
    width=1100,
    height=700,
)

fig0, fig1 = fig_list[0], fig_list[1]
fig0.show()
print("")
fig1.show()

## Why is toi4_median the best approach?

#### balance between best LFC cutoffs and Pathway cutoffs

In [None]:
# Number of best samples requested; the next cell reuses this value.
n_best_samples_chosen = 4

# Best-cutoff parameters when selecting by 'n_pathways', restricted to 'median' rows.
dfconfig = enr.calc_best_cutoffs_params(
    selected_toi_col='n_pathways',
    n_best_samples=n_best_samples_chosen,
    force=False,
    verbose=False,
)
dfconfig = dfconfig.loc[dfconfig['med_max_ptw'] == 'median']
dfconfig[cols]

In [None]:
# Best-cutoff parameters when selecting by 'toi4_median', restricted to 'median' rows.
dfconfig = enr.calc_best_cutoffs_params(
    selected_toi_col='toi4_median',
    n_best_samples=n_best_samples_chosen,
    force=False,
    verbose=False,
)
dfconfig = dfconfig.loc[dfconfig['med_max_ptw'] == 'median']
dfconfig[cols]

## It minimizes fdr and maximizes abs_lfc!!!

### Summary DEPs + Up and Down

In [None]:
# Flags forwarded to the summary call.
force = False
save_file = False
prompt_verbose = False

# Summary table of DEGs/DEPs, with up/down information, for the current gene-set library.
dfa = enr.summary_degs_up_down(
    geneset_num=enr.geneset_num,
    force=force,
    save_file=save_file,
    prompt_verbose=prompt_verbose,
    verbose=False,
)
print(len(dfa))
dfa

In [None]:
# Bar plot of up/down counts per case; the call returns the figure and a table.
title = f'Up and Down {enr.s_deg_dep}s with the best cutoff'
fig, dfa = enr.barplot_up_down_genes_per_case(
    title=title,
    width=1100,
    height=700,
    verbose=False,
)
fig.show()

In [None]:
force = False
verbose = False

# `case` is whatever an earlier cell last assigned.
dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(
    case,
    force=force,
    verbose=verbose,
)
print(len(dfi))
dfi.head(3)

### Running all cases

In [None]:
# Open every case in turn, echo its parameters, and collect the union of the
# DEGs/DEPs found inside and outside pathways.
enr.cfg.open_best_ptw_cutoff()

enr.echo_default()
print("")

all_degs = []
for case in case_list:
    print(">>>", case)
    # open_case is unpacked into four values everywhere else in this notebook
    # (ret, degs, degs_ensembl, dfdegs); unpacking only three would raise
    # "ValueError: too many values to unpack".
    ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
    
    if not ret:
        # Report the failing case and carry on with the remaining ones.
        print(f"\nError?? case {case}")
        enr.echo_degs()
        print("")
        continue

    enr.echo_parameters(want_echo_default=False, jump_line=True)
    print("")
    all_degs += enr.degs_in_pathways + enr.degs_not_in_pathways
    enr.echo_parameters()
    print("")
    
# Deduplicate across cases; `np` is expected to come from the star imports at the top.
all_degs = np.unique(all_degs)
print(f"There are {len(all_degs)} {enr.s_gene_protein}s in all cases ")
print("\nall degs:", "; ".join(all_degs))
print("\n\n")

### Development & tests

In [None]:
# Development cell: scatter plot of the number of DEGs/DEPs versus the abs_LFC
# cutoff, with one trace per FDR_LFC cutoff and a dropdown to switch case.
# Relies on `enr`, `dfsim`, `go`, `title_replace` and `os` from earlier cells.
width = 1000
height = 700
verbose = False
plot_bgcolor = 'lightgray'

xaxis_title = "abs LFC"
yaxis_title = f"# {enr.s_deg_dep}s"

fig = go.Figure()

colors =['navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
        'magenta', 'darkturquoise', 'orange',  'indigo', 'magenta', 'maroon', 'black',
        'darkred', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'olivedrab', 'navy']

title = f'scatter plot - {enr.s_deg_dep}s versus abs_LFC per FDR'

# Number of traces actually added per case; FDR cutoffs without data add no
# trace, so the dropdown visibility masks must be built from these counts.
dic_visible = {}
for icase, case in enumerate(enr.case_list):
    dic_visible[case] = 0
    # Only the traces of the first case are visible initially.
    is_visible = icase == 0

    for i, fdr_lfc_cutoff in enumerate(enr.fdr_list):
        # Wrap around the palette so a long fdr_list cannot raise IndexError.
        color = colors[i % len(colors)]
        name = f"{fdr_lfc_cutoff:.3f}"

        dfsim2 = dfsim[ (dfsim.case == case) & (dfsim.fdr_lfc_cutoff == fdr_lfc_cutoff)]
        if dfsim2.empty:
            continue

        dic_visible[case] += 1

        text_ini = f'case {case}<br>FDR_LFC cutoff={fdr_lfc_cutoff:.3f}'

        # One hover label per point. A '<br>' now separates the LFC cutoff from
        # the DEG/DEP count; the two were previously glued together.
        hovertext_list = [
            f'{text_ini}<br>LFC_cutoff={lfc:.3f}<br># {enr.s_deg_dep}s {n_all}<br># Up={n_up} Down={n_dw}'
            for lfc, n_all, n_up, n_dw in zip(dfsim2.abs_lfc_cutoff, dfsim2.n_degs,
                                              dfsim2.n_degs_up, dfsim2.n_degs_dw)
        ]

        fig.add_trace(go.Scatter(x=dfsim2.abs_lfc_cutoff, y=dfsim2.n_degs, hovertext=hovertext_list, hoverinfo="text",
                                 mode='markers', marker={'color':color}, visible=is_visible, name=name))

# Dropdown buttons: each one shows the traces of its own case and hides the rest.
# Traces were added case by case, so each mask is the per-case flag repeated
# once per trace of that case.
buttons = []
for case in enr.case_list:
    visibility = [case2 == case for case2 in enr.case_list for _ in range(dic_visible[case2])]
    buttons.append(dict(method='update',
                        label=case,
                        visible=True,
                        args=[{'visible': visibility}]))

updatemenu = [dict(buttons=buttons, direction='down', showactive=True, x=1, y=1.2)]

# The layout is applied once, after all traces exist (it used to be re-applied
# on every iteration of the case loop with the same values).
fig.update_layout(
    autosize=True,
    title=title,
    width=width,
    height=height,
    xaxis_title=xaxis_title,
    yaxis_title=yaxis_title,
    showlegend=True,
    legend_title='FDR_LFC cutoff',
    font=dict(
        family="Arial",
        size=14,
        color="Black"
    ),
    plot_bgcolor=plot_bgcolor,
    updatemenus=updatemenu
)

figname = title_replace(title)
figname = os.path.join(enr.root_figure, figname+'.html')

fig.write_html(figname)
# Swap only the file extension, so a '.html' elsewhere in the path is untouched.
fig.write_image(os.path.splitext(figname)[0] + '.png')
# Report only after both files have been written.
if verbose: print(">>> HTML and png saved:", figname)

fig.show()