In [None]:
from platform import python_version
print(python_version())

### Definitions:
  - LFC table has the following cutoffs:
    - abs_LFC the absolute LFC cutoff value
    - FDR_LFC, the FDR or p-value adjusted cutoff
  - The enriched pathway table has:
    - FDR_pathway cutoff value
    - pval_cutoff - p-value cutoff, necessary when relaxing the FDR_pathway cutoff
    - num_of_genes_cutoff - the minimum number of genes necessary to pick-up an enriched pathway
   
### Default cutoffs for LFC table:
  - abs_LFC=1
  - FDR_LFC=0.05
  - therefore, the default DEG/DAP is defined as having the abs(LFC) >= 1 and FDR < 0.05

### Default cutoffs for Enriched Pathways:
  - fdr_pathway_cutoff=0.05
  - pval_pathway_cutoff=0.05
  - num_of_genes_cutoff=3
  - therefore, an enriched pathway has FDR < 0.05 and at least 3 DEGs/DAPs

### Calculating the best cutoffs:
  - We proposed and calculated several indexes to define a new statistic that relaxes the LFC and Enriched Pathway cutoffs.
    - Indexes are calculated for each case, each cutoff, and each resulting enriched pathway.
  - To find the possible best LFC/FDR expression and FDR pathway cutoffs:
     - We look for a high number in n_pathway and n_DEGs_in_pathway, having a low FDR_LFC and a high absLFC.
       - The default FDR_LFC (0.05):
          - It may have fewer DEGs, resulting in fewer enriched pathways.
          - It may have fewer enriched pathways, even having many DEGs/DAPs.
       - Therefore, a trade-off exists between optimizing (abs_LFC and FDR_LFC cutoffs) and (FDR_pathway cutoffs, n_pathways, and n_DEGs_in_pathways.)

### An index measures the trade-off between "LFC" and "Enriched Pathways" cutoffs -> LFC - Enriched Pathway Trade-Off Statistics (LEATOS)

  - We proposed and calculated the following possible indexes:

<p style="font-size: 20px; color: darkcyan;">
$index1=\sqrt{-\log_{10}(FDR_{pathway}) \cdot \frac{n}{N}}$ </p>

<p style="font-size: 20px; color: darkcyan;">
$index2=\sqrt{(-\log_{10} FDR_{LFC}) \cdot (-\log_{10} FDR_{pathway})}$ </p>

<p style="font-size: 20px; color: darkcyan;">
$index3=\left((-\log_{10} FDR_{LFC}) \cdot (-\log_{10} FDR_{pathway}) \cdot \frac{n}{N}\right)^{1/3}$ </p>

<p style="font-size: 20px; color: darkred;">
$toi4=\left(abs\_LFC \cdot (-\log_{10} FDR_{LFC}) \cdot (-\log_{10} FDR_{pathway}) \cdot \frac{n}{N}\right)^{1/4}$ </p>

where,
  - n is the number of DEGs/DAPs found in the pathway
  - N is the total number of annotated DEGs/DAPs in the pathway (depends on the database; our default database is Reactome 2022)

### Then we searched for the best cutoffs
  - For each 5-percentile bin of the index histogram, we look for the best abs_LFC, FDR_LFC, and FDR_pathway cutoffs:
  -  We expected that the best cutoff should be in the right tail of the histogram (high index value.)
  -  High index values must have a high number of n_pathways and n DEGs in pathways.

### Testing the best cutoffs (for each case)

  - Is the new set of cutoffs correct? good enough?
  - How to establish that the calculated cutoff is correct?
  - To answer these questions we calculated the chi-square test between the "best cutoff" and the "default"
    - Best cutoff has:
      - n DEGs/DAPs in pathways
      - n DEGs/DAPs not in pathways
    - The Default cutoff may have:
      - n DEGs/DAPs in pathways
      - n DEGs/DAPs not in pathways
      - The DEGs/DAPs can be:
        - greater or equal number of the best cutoff DEGs/DAPs
        - fewer number of the best cutoff DEGs/DAPs:
           - in this case, one complements the number of DEGs/DAPs with random genes not DEGs/DAPs (found in the experiment)

#### Chi-square test:

DEGs/DAPs | # in pathway | # not in pathway
--- | --- | --- 
 Best cutoff |     A      |   B  
 Default cutoff |   C | D 

Chi-square p-value:
  - p-value < 0.05 denotes that both distributions are not similar; therefore, random genes could not reach the best cutoff DEGs/DAPs; in conclusion, the best cutoff was not found randomly.
  - p-value \>= 0.05 denotes that both distributions are similar, and the best cutoff can be achieved randomly.

### For excel

In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

sys.path.insert(1, '../src/')

from Basic import *
from enricher_lib import *
from config_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# !pip3 install pyyaml
# Read the run parameters from params.yml (path is relative to the current
# working directory) into dic_yml; the next cell unpacks its keys.
with open('params.yml', 'r') as file:
    dic_yml=yaml.safe_load(file)

In [None]:
# Unpack every parameter from dic_yml (loaded from params.yml) into a
# module-level name. A missing key raises KeyError here, before any analysis runs.

# root folder and contact e-mail
root0=dic_yml['root0']
email=dic_yml['email']

# project name and its short label
project=dic_yml['project']
s_project=dic_yml['s_project']

# kind of molecule / omics being analysed
gene_protein=dic_yml['gene_protein']
s_omics=dic_yml['s_omics']

# whether cases are stratified by age / gender
has_age=dic_yml['has_age']
has_gender=dic_yml['has_gender']

# boolean switch that selects the normalization label further down in this cell
want_normalized=dic_yml['want_normalized']

# settings forwarded to enricheR(...) in the following cell
abs_lfc_cutoff_inf=dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method=dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr=dic_yml['num_min_degs_for_ptw_enr']

tolerance_pathway_index=dic_yml['tolerance_pathway_index']
type_sat_ptw_index=dic_yml['type_sat_ptw_index']
saturation_lfc_index=dic_yml['saturation_lfc_index']
chosen_model_sampling=dic_yml['chosen_model_sampling']

# ordered list of case names; later cells index into it (case_list[0], case_list[3], ...)
case_list=dic_yml['case_list']

# enriched-pathway cutoffs (see "Default cutoffs for Enriched Pathways" above)
pval_pathway_cutoff=dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff=dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff=dic_yml['num_of_genes_cutoff']

# NOTE(review): not referenced again in the visible part of this notebook
run_list=dic_yml['run_list']
chosen_model_list=dic_yml['chosen_model_list']
i_dfp_list=dic_yml['i_dfp_list']

# exp_normalization is the value handed to enricheR (None means "no normalization");
# normalization is the matching text label used by the config helpers.
exp_normalization='quantile_norm' if want_normalized else None
normalization='not_normalized' if exp_normalization is None else exp_normalization

cfg=Config(project, s_project, case_list, root0)

case=case_list[0]

# placeholders (-1 = "not computed yet"); n_degs is overwritten on the next line
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio=-1,-1,-1,-1,-1
# Use the normalization label derived above rather than a hard-coded
# 'not_normalized', so the lookup follows the want_normalized switch
# (identical behaviour when want_normalized is False).
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw=cfg.get_best_lfc_cutoff(case, normalization)


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the enrichment driver from the parameters unpacked in the previous cell.
# geneset_num=0 selects the first reference database
# (presumably Reactome 2022, the default named in the introduction - TODO confirm).
enr=enricheR(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, clone_objects=False,
             exp_normalization=exp_normalization, geneset_num=0, 
             num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr, 
             tolerance_pathway_index=tolerance_pathway_index, 
             s_pathw_enrichm_method=s_pathw_enrichm_method,
             abs_lfc_cutoff_inf=abs_lfc_cutoff_inf, 
             type_sat_ptw_index=type_sat_ptw_index, saturation_lfc_index=saturation_lfc_index)

case=case_list[0]

# Register the default cutoffs (abs LFC = 1, FDR = 0.05) for this normalization label,
# then open the first case and echo the parameters in effect.
enr.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
print("\nEcho Parameters:")
enr.echo_parameters()
# keep a module-level copy of the database index currently selected in enr
geneset_num=enr.geneset_num

In [None]:
enr.case, enr.group, enr.gender, enr.age, enr.geneset_num, enr.abs_lfc_cutoff_inf

In [None]:
enr.abs_lfc_cutoff_inf, abs_lfc_cutoff_inf

In [None]:
for case in case_list:
    ret, degs, degs_ensembl, dfdegs=enr.open_case(case, save_file=True, verbose=False)
    enr.echo_parameters()
    print("\n\n\n")

### Find another case=g2a_female

In [None]:
case=case_list[1]
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
enr.echo_parameters()

### Reference database

In [None]:
enr.geneset_num, enr.geneset_lib, enr.dbs_list

In [None]:
enr.set_db(0, verbose=True)

In [None]:
# dfsum=enr.summary_degs_and_pathways(force=False, verbose=False)
# dfsum

In [None]:
fname_final_ori, fname_ori, title=enr.set_lfc_names()
fname_final_ori, title

### Sampling cutoffs

In [None]:
geneset_num_list=[0, 1, 2, 4, 5, 7]
want=False

# Optional inventory (off by default): for every database in geneset_num_list,
# count the files in enr.root_enrichment whose name starts with
# "enricher_<geneset_lib>" and contains the case name.
if want:
    for geneset_num in geneset_num_list:
        enr.set_db(geneset_num, verbose=True)

        s_start=f"enricher_{enr.geneset_lib}"

        for case in case_list:
            files=[]
            for entry in os.listdir(enr.root_enrichment):
                if entry.startswith(s_start) and case in entry:
                    files.append(entry)
            print("\tcase", case, len(files))

        print('')

### Define case

In [None]:
geneset_num=0
enr.set_db(geneset_num, verbose=True)
enr.geneset_num, enr.geneset_lib

In [None]:
i=3
case=case_list[i]

ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
print(f"G/P cutoff: lfc={enr.abs_lfc_cutoff:.3f}; lfc_fdr={enr.fdr_lfc_cutoff:.3f}")
len(degs)

### best_cutoff_quantiles() - for Reactome - return dfcut

In [None]:
fname=enr.fname_enr_gene_stat%(enr.case, enr.geneset_lib, enr.normalization)
filefull=os.path.join(enr.root_ressum, fname)

os.path.exists(filefull), fname

In [None]:
enr.set_enrichment_name()

In [None]:
force=False; verbose=False
print(">>>", case)
dfi=enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=force, verbose=verbose)
print(len(dfi))
dfi.head(3)

## build_all_cutoffs_table(col)
  - loop case_list
    - best_cutoff_quantiles()
      - calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib

## Next, calc_best_cutoffs_params()

In [None]:
enr.open_enriched_pathways_summary()

In [None]:
enr.abs_lfc_cutoff_inf

#### build_all_cutoffs_table

  - for cols=['toi1_median', 'toi2_median', 'toi3_median', 'toi4_median']
    - build_all_cutoffs_table()
      - for each case
        - for  med_max_ptw in ['median', 'maximum', 'pathway']:
          - best_cutoff_quantiles
            - dfi=self.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=force, verbose=verbose)


In [None]:
cols=['toi1_median', 'toi2_median', 'toi3_median', 'toi4_median']
geneset_num=0
enr.set_db(geneset_num)
print(enr.geneset_num, enr.geneset_lib, '\n')

for col in cols:
    print(">>>", col)
    dfcut=enr.build_all_cutoffs_table(selected_toi_col=col, force=force, verbose=False)

In [None]:
cols=['toi1_median', 'toi2_median', 'toi3_median', 'toi4_median']
force=False

geneset_num=0
enr.set_db(geneset_num)
print(enr.geneset_num, enr.geneset_lib, '\n')

for col in cols:
    print(">>>", col)
    dfcut=enr.build_all_cutoffs_table(selected_toi_col=col, force=force, verbose=False)


In [None]:
dfcut.columns

In [None]:
enr.case_list

In [None]:
case=enr.case_list[3]
print(">>", case)
dfi=enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
df2=dfi[ (dfi.case == case)]
maxi=df2.toi4_median.max()
median=df2.toi4_median.median()
maxi, median

In [None]:
case=enr.case_list[2]
print(">>", case)
dfi=enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
df2=dfi[ (dfi.case == case)]
maxi=df2.toi4_median.max()
median=df2.toi4_median.median()
maxi, median

In [None]:
df3=df2[ (df2.toi4_median == maxi)]
df3

In [None]:
# Full set of columns of the cutoff table, kept under its own name for reference.
# (It was previously assigned to `cols` and overwritten immediately below - a dead store.)
cols_full=['case', 'geneset_num', 'normalization', 'med_max_ptw', 'parameter', 'quantile',
           'quantile_val', 'quantile_val_inf', 'quantile_val_sup',
           'abs_lfc_cutoff', 'fdr_lfc_cutoff',
           'pathway_pval_cutoff', 'pathway_fdr_cutoff', 'num_of_genes_cutoff',
           'n_pathways', 'n_degs_in_pathways',
           'n_degs_in_pathways_mean', 'n_degs_in_pathways_median', 'n_degs_in_pathways_std',
           'toi1_mean', 'toi1_median', 'toi1_std',
           'toi2_mean', 'toi2_median', 'toi2_std',
           'toi3_mean', 'toi3_median', 'toi3_std',
           'toi4_mean', 'toi4_median', 'toi4_std',]

# Compact subset used by the display cells that follow (df2[cols], dfa[cols]).
cols=['case', 'med_max_ptw', 'quantile', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff',
        'pathway_pval_cutoff', 'pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways',
        'toi1_median', 'toi2_median', 'toi3_median']

### Look for different approaches (sorting)

In [None]:
dfcut.case.unique()

In [None]:
i=3
case=enr.case_list[i]
print(">>", case)

In [None]:
col='toi1_median'
dfcut=enr.build_all_cutoffs_table(selected_toi_col=col, force=False, verbose=False)
df2=dfcut[(dfcut.case == case) & (dfcut.n_degs_in_pathways > 3) & (dfcut.med_max_ptw == 'median')]
df2=df2.sort_values(col, ascending=False)
df2[cols].head(6)

In [None]:
col='toi2_median'
dfcut=enr.build_all_cutoffs_table(selected_toi_col=col, force=False, verbose=False)
df2=dfcut[(dfcut.case == case) & (dfcut.n_degs_in_pathways > 3) & (dfcut.med_max_ptw == 'median')]
df2=df2.sort_values(col, ascending=False)
df2[cols].head(6)

In [None]:
col='toi3_median'
dfcut=enr.build_all_cutoffs_table(selected_toi_col=col, force=False, verbose=False)
df2=dfcut[(dfcut.case == case) & (dfcut.n_degs_in_pathways > 3) & (dfcut.med_max_ptw == 'median')]
df2=df2.sort_values(col, ascending=False)
df2[cols].head(6)

In [None]:
col='toi4_median'
dfcut=enr.build_all_cutoffs_table(selected_toi_col=col, force=False, verbose=False)
df2=dfcut[(dfcut.case == case) & (dfcut.n_degs_in_pathways > 3) & (dfcut.med_max_ptw == 'median')]
df2=df2.sort_values(col, ascending=False)
df2[cols].head(6)

### Bad way to cut - by n_pathways

In [None]:
dfa=dfcut[dfcut.case == case].sort_values(['n_pathways', 'n_degs_in_pathways', 'pathway_fdr_cutoff'], ascending=[False, False, True])
dfa[cols].head(9)

In [None]:
dfa=dfcut[dfcut.case == case].sort_values(['n_degs_in_pathways', 'n_pathways', 'pathway_fdr_cutoff'], ascending=[False, False, True])
dfa[cols].head(9)

In [None]:
dfa=dfcut[dfcut.case == case].sort_values(['pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways'], ascending=[True, False, False])
dfa[cols].head(9)

In [None]:
dfa=dfcut[dfcut.case == case].sort_values(['pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways'], ascending=[True, False, False])
dfa[cols].head(9)

### Revisiting DEGs and n_Pathways x abs_LFC cutoff correlation

#### return dic_fig
  - which in ['deg', 'up', 'down']

In [None]:
plot_up_down=False

corr_cutoff=-0.90
nregs_fdr=5

# 'deg' is always shown; 'up' and 'down' only when plot_up_down is switched on.
which_list=['deg'] + (['up', 'down'] if plot_up_down else [])

for case in case_list:
    print(">>", case)
    ret, dic_fig, df_fdr=enr.plot_nDEG_curve_per_LFC_FDR(case, width=1100, height=700, title=None,
                                                           corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, verbose=verbose)

    # dic_fig is keyed by 'deg', 'up' and 'down'; print the key, then show its figure
    for which in which_list:
        print(which)
        dic_fig[which].show()

    print()

### Revisiting DEGs and n_Pathways x toi4 median

In [None]:
# Colour palette: a hand-picked list of CSS colour names followed by plotly_colors_proteins
# (not defined in this notebook; presumably provided by one of the star imports - TODO confirm).
# NOTE(review): 'navy', 'orange', 'magenta' and 'darkcyan' each appear twice, and
# 'darkgray' / 'darkgrey' are two spellings of the same CSS colour, so series that
# take colours by position can end up sharing a colour.
colors=['navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
        'magenta', 'darkturquoise', 'orange', 'darkred', 'indigo', 'magenta', 'maroon', 'black',
        'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'olivedrab', 'navy'] + plotly_colors_proteins

### New plot - dash combo

In [None]:
selected_toi_col='toi4_median'
force=False

# `verbose` is the module-level flag set in an earlier cell.
dfcut=enr.build_all_cutoffs_table(selected_toi_col=selected_toi_col, force=force, verbose=verbose)
# (removed a bare `dfcut.columns` here: an expression in the middle of a cell
#  is neither displayed nor stored, so it had no effect)

# Largest value of the selected index, rounded to 3 decimals and padded by 0.001.
maxi_x=np.round(dfcut[selected_toi_col].max(), 3) + 0.001
maxi_x

In [None]:
case=enr.case

df_fdr=enr.open_fdr_lfc_correlation(case=case)
print(len(df_fdr))
df_fdr.head(3)

In [None]:
fdr=0.2

dfcut=enr.build_all_cutoffs_table(selected_toi_col, force=False, verbose=False)
df2=dfcut[ (dfcut.case == case) & (dfcut.fdr_lfc_cutoff == fdr)  & (dfcut.med_max_ptw == 'median')]
df2

In [None]:
enr.abs_lfc_cutoff_inf, case

In [None]:
# Reload the cutoff table for the selected index and show the 'median' rows of one
# case, restricted to the normalization and database currently selected in enr.
dfcut=enr.build_all_cutoffs_table(selected_toi_col, force=False, verbose=False)
# print(dfcut.columns)
# NOTE: this rebinds the module-level `cols` used by later display cells.
cols=['case', 'geneset_num', 'normalization', 'med_max_ptw', 
        'quantile', 'quantile_val_inf', 'quantile_val_sup',
        'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff',  'n_pathways',
        'n_degs_in_pathways', 'toi4_median']  #  'toi1_median', 'toi2_median',  'toi3_median',
# NOTE: this also rebinds the module-level `case`.
case=case_list[4]

# `fdr` is only needed by the filter term that is commented out at the end of the mask below.
fdr=0.40
df2=dfcut[ (dfcut.case == case) & (dfcut.normalization == enr.normalization) & (dfcut.geneset_num == enr.geneset_num) &
             (dfcut.med_max_ptw == 'median')]  # (dfcut.fdr_lfc_cutoff == fdr)
df2[cols].head(3)

### The TOI landscape

In [None]:
height=1200

fig=enr.plot_degs_in_pathways_vs_toi_per_case(selected_toi_col=selected_toi_col, title=None, plot_all_dfi=True,
                                                width=1100, height=height, sel_colors=None, plot_bgcolor='lightgray', verbose=False)

if fig: fig.show()

In [None]:
fig=enr.plot_degs_in_pathways_vs_toi_per_case(selected_toi_col=selected_toi_col, title=None, plot_all_dfi=False,
                                              width=1100, height=1000, sel_colors=None, plot_bgcolor='lightgray', verbose=False)

if fig: fig.show()

### best_cutoff_quantiles() - for Reactome - return dfcut

In [None]:
fname, fname_cut=enr.set_enrichment_name()

# name='enricher_Reactome_2022_medulloblastoma_microarray_for_WNT_x_ctrl_not_normalized_cutoff_lfc_0.950_fdr_0.200_pathway_pval_0.050_fdr_0.450_num_genes_3.tsv'
"; ".join(fname_cut.split('_'))

In [None]:
cols=['case', 'med_max_ptw', 'quantile', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff',
      'pathway_pval_cutoff', 'pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways',
      'toi1_median', 'toi2_median', 'toi3_median']

In [None]:
case=case_list[0]
selected_toi_col='toi4_median'

df2=dfcut[(dfcut.case == case) & (dfcut.med_max_ptw == 'median')]
df2=df2.sort_values(selected_toi_col, ascending=False)
df2[cols].head(6)

### Comparing: toi4 (or 1,2,3), n_pathways, n_degs_in_pathways
### which is the best?
  - n_best_samples=4 ... the lowest FDRs and highest abs_LFC

In [None]:
enr.geneset_num, enr.geneset_lib, enr.normalization

## Calc best cutoffs and save in config

### Comparing: toi4 (or 1,2,3), n_pathways, n_degs_in_pathways
### which is the best?
  - n_best_samples=4 ... the lowest FDRs and highest abs_LFC

In [None]:
enr.geneset_num, enr.geneset_lib, enr.normalization

In [None]:
cols2=['case', 'med_max_ptw', 'quantile', 'toi4_median', 'toi4_mean', 'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff', 
       'n_pathways', 'n_degs_in_pathways', 'n_degs_in_pathways_mean',
       'n_degs_in_pathways_median', 'n_degs_in_pathways_std',
       'toi1_median', 'toi2_median', 'toi3_median']

save_config=False
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=1, save_config=False, verbose=True)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=2, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=3, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=4, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=5, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=6, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=7, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=8, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

### Multiple best points

In [None]:
save_config=False

# when True, also record the three cutoffs chosen for each n_best_sample
with_params=False

# dic maps the row position j (one per 'median' row of dfconfig) to a record
# holding the case name plus n_pathways / n_degs_in_pathways for each
# n_best_sample value from 1 to 8.
dic={}
for ipoints in range(1,9):
    dfconfig=enr.calc_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample=ipoints, save_config=save_config, verbose=False)
    dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']

    for j in range(len(dfconfig)):
        row=dfconfig.iloc[j]

        # Create the record the first time position j is seen, whichever ipoints that is.
        # Tying creation to ipoints == 1 raised KeyError as soon as a later run
        # returned more rows than the first one.
        dic2=dic.setdefault(j, {'case': row.case})

        dic2[f'nptw_{ipoints}']=row.n_pathways
        dic2[f'ndegs_{ipoints}']=row.n_degs_in_pathways

        if with_params:
            dic2[f'fdrc_{ipoints}']=row.fdr_lfc_cutoff
            dic2[f'lfcc_{ipoints}']=row.abs_lfc_cutoff
            dic2[f'fdrp_{ipoints}']=row.pathway_fdr_cutoff

# one row per position j, one column per recorded key
dfpoints=pd.DataFrame(dic).T
dfpoints

In [None]:
# Positional selection of the first 16 columns (positions 0..15) of dfpoints.
# NOTE(review): with with_params=False dfpoints has 17 columns, so the last one is
# left out - confirm this is intended.
ncols=list(range(16))
df2=dfpoints.iloc[:, ncols]

df2

In [None]:
len(dfpoints.columns)

### Multiple best points

In [None]:
dfpoints=enr.display_best_cutoff_params(npoints=10, selected_toi_col='toi4_median', med_max_ptw='median')
dfpoints

### Turn to True to save config

In [None]:
save_config=False

n_best_sample_list=[1, 8, 2, 2, 4, 2, 2, 3]
# n_pathways: 45, 29, 42 44, 24, 60, 39, 39 for COVID-19 Taubate

dfconfig=enr.calc_multiple_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample_list=n_best_sample_list, save_config=save_config, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfbest=enr.cfg.dfbest_cutoffs[cols2]
dfbest[dfbest.med_max_ptw == 'median']

### Chosen columns

In [None]:
dfcut.columns

In [None]:
cols1=['case', 'med_max_ptw', 'quantile', 'toi4_median']
cols2=['abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff',
        'n_pathways', 'n_degs_in_pathways']

### Comparing: toi4 (or 1,2,3), n_pathways, n_degs_in_pathways

In [None]:
case='g2b_female'

In [None]:
col='n_pathways'
nrows=6
dfcut=enr.build_all_cutoffs_table(col, force=False, verbose=False)
case=case_list[0]
dfa=dfcut[ (dfcut.case == case) & (dfcut.med_max_ptw == 'median') ][cols]
dfa=dfa.sort_values(['n_pathways', 'n_degs_in_pathways'], ascending=[False, False])
dfa[cols1+cols2].head(nrows)

In [None]:
col='toi1_median'
# The display line below reads `nrows`; this cell used to assign `n_rows`, so it
# only worked because an earlier cell had already defined `nrows`.
nrows=6
dfcut=enr.build_all_cutoffs_table(col, force=False, verbose=False)
# 'median' rows of the current case, ranked by the selected index (highest first)
dfa=dfcut[ (dfcut.case == case) & (dfcut.med_max_ptw == 'median') ][cols]
dfa=dfa.sort_values(col, ascending=False)
dfa[cols1+[col]+cols2].head(nrows)

In [None]:
col='toi2_median'
dfcut=enr.build_all_cutoffs_table(col, force=False, verbose=False)
dfa=dfcut[ (dfcut.case == case) & (dfcut.med_max_ptw == 'median') ][cols]
dfa=dfa.sort_values(col, ascending=False)
dfa[cols1+[col]+cols2].head(nrows)

In [None]:
col='toi3_median'
dfcut=enr.build_all_cutoffs_table(col, force=False, verbose=False)
dfa=dfcut[ (dfcut.case == case) & (dfcut.med_max_ptw == 'median') ][cols]
dfa=dfa.sort_values(col, ascending=False)
dfa[cols1+[col]+cols2].head(nrows)

In [None]:
col='toi4_median'
dfcut=enr.build_all_cutoffs_table(col, force=False, verbose=False)
dfa=dfcut[ (dfcut.case == case) & (dfcut.med_max_ptw == 'median') ][cols]
dfa=dfa.sort_values(col, ascending=False)
dfa[cols1+cols2].head(nrows)

### TOI4

In [None]:
selected_toi_col='toi4_median'

fig_list=enr.plot_genes_and_pathways_frequecies_per_cases(selected_toi_col,  width=1100, height=700)

print(">>>", selected_toi_col)
print(f"# {enr.s_deg_dap}s")
fig_list[0].show()
print("# n pathways")
fig_list[1].show()

In [None]:
cols

### Selected best cutoffs per case

In [None]:
dfconfig=enr.calc_multiple_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample_list=n_best_sample_list, save_config=False, verbose=False)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfbest=enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest=dfbest[dfbest.med_max_ptw == 'median']
print(len(dfbest))
dfbest[cols]

### n_pathways: to confirm, is it working?

In [None]:
fig_list=enr.plot_genes_and_pathways_frequecies_per_cases('n_pathways',  width=1100, height=700)

fig_list[0].show()
print("")
fig_list[1].show()

## Why toi4_median is the best approach?

#### balance between best LFC cutoffs and Pathway cutoffs

In [None]:
n_best_sample_list

In [None]:
dfconfig=enr.calc_multiple_best_cutoffs_params(selected_toi_col='n_pathways', n_best_sample_list=n_best_sample_list, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

In [None]:
dfconfig=enr.calc_multiple_best_cutoffs_params(selected_toi_col='toi4_median', n_best_sample_list=n_best_sample_list, save_config=False, verbose=save_config)
dfconfig=dfconfig[dfconfig.med_max_ptw == 'median']
dfconfig[cols2]

## It tries to minimize fdr and maximize abs_lfc cutoffs!!!
### look for toi4_median in both tables

### Summary DAPs + Up and Down

In [None]:
verbose=False
per_biotype= False
ensembl=False

dfa=enr.summary_degs_up_down(per_biotype=per_biotype, ensembl=ensembl, verbose=verbose)
print(len(dfa))
dfa

### At least 10 DAPs for each case

In [None]:
verbose=False
per_biotype= True
ensembl=False

dfa=enr.summary_degs_up_down(per_biotype=per_biotype, ensembl=ensembl, verbose=verbose)
print(len(dfa))
cols=list(dfa.columns)[1:]
dfa[cols]=dfa[cols].astype(int)
dfa

In [None]:
title=f'Up and Down {enr.s_deg_dap}s with the best cutoff'
fig, dfa=enr.barplot_up_down_genes_per_case(title=title, width=1100, height=700, verbose=False)
fig.show()

### Summary DEG-Pathway table

In [None]:
dfsum=enr.summary_degs_and_pathways(force=False, verbose=False)
cols5=list(dfsum.columns)
np.array(cols5)

In [None]:
lista_ndx=list(dfsum.index)
lista_ndx=[x for x in lista_ndx if '_ensembl' not in x]
lista_ndx

In [None]:
dfsum.loc[lista_ndx, cols5[:8]]

In [None]:
dfsum.loc[lista_ndx, cols5[8:]]

### Testing data & saving df_enr excel table in results

In [None]:
# !pip3 install openpyxl

In [None]:
from openpyxl import Workbook

In [None]:
# Re-open every case, optionally exporting its enriched-pathway table, and keep
# each case's enr.df_enr in `dic` keyed by case name.
dic={}

# set to True if you want to save the excel files
save_enriched_ptws_excel_odt=True

for case in case_list:
    print(">>>", case)
    ret, degs, degs_ensembl, dfdegs=enr.open_case(case, save_enriched_ptws_excel_odt=save_enriched_ptws_excel_odt, verbose=False)
    enr.echo_parameters()
    # enr.df_enr is read after open_case, so it belongs to the case just opened
    dic[case]=enr.df_enr
    print("\n")

### Open best cutoffs (config)

In [None]:
cols=['case', 'med_max_ptw', 'quantile', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff',
        'pathway_pval_cutoff', 'pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways',
        'toi1_median', 'toi2_median', 'toi3_median']

dfbest=enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest=dfbest[dfbest.med_max_ptw == 'median']
dfbest[cols]

In [None]:
dfbest2=enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest2=dfbest2[dfbest2.case == 'g2a_female']
dfbest2[cols]

In [None]:
i=1
case=case_list[i]
print(">>>", case)
ret, degs, degs_ensembl, dfdegs=enr.open_case(case, save_enriched_ptws_excel_odt=False, verbose=False)
enr.echo_parameters()
print(len(enr.df_enr))
enr.df_enr

### Reviewing quantile cutoffs

In [None]:
col='toi4_median'
force=False
verbose=False

print(">>>", case)

dfi=enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)

# best_cutoff_quantiles returns a dict keyed by quantile; transpose it so that
# each quantile becomes one row.
dic_quant=enr.best_cutoff_quantiles(case, col, med_max_ptw='median', force=force, verbose=False)
df=pd.DataFrame(dic_quant).T
# Labels for the 24 fields of each quantile record, in order.
# Fixed: 'n_degs_in_pathways_median' used to carry a trailing space, which made the
# column unreachable by its expected name.
# NOTE(review): the first three indexes are labelled index1..3 here, while the other
# tables call them toi1..3 - kept as is.
cols=['lim_inf', 'lim_sup', 'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_pval_cutoff',
        'pathway_fdr_cutoff', 'num_of_genes_cutoff', 'n_pathways', 'n_degs_in_pathways',
        'n_degs_in_pathways_mean', 'n_degs_in_pathways_median', 'n_degs_in_pathways_std',
        'index1_mean', 'index1_median', 'index1_std',
        'index2_mean', 'index2_median', 'index2_std',
        'index3_mean', 'index3_median', 'index3_std',
        'toi4_mean', 'toi4_median', 'toi4_std'  ]

df.columns=cols
# move the quantile keys from the index into a first column named 'quantile'
df=df.reset_index()
cols=list(df.columns)
cols[0]='quantile'
df.columns=cols

print(len(df))
df.head(3)

In [None]:
dfi=enr.dfi
print(len(dfi))
dfi.head(3)

### Is this cut statistically correct?
  - Chi-square test - confusion matrix:
    - the best defined cutoff
    - against the default cutoff (LFC=1, FDR=0.05) + random Genes (all other genes from the microarray experiment)
   
### Test retrieve best param

In [None]:
case=case_list[0]
enr.open_case(case, verbose=True)
enr.case

In [None]:
enr.get_best_ptw_cutoff_biopax()
enr.case, enr.quantile, enr.abs_lfc_cutoff, enr.fdr_lfc_cutoff, enr.pathway_pval_cutoff, enr.pathway_fdr_cutoff

### Prepare for chi-square test
  - best cutoff
  - default cutoffs (fdr=0.05,  abs_lfc=1)
  - case[0]=g2a_male, as example

### Default cutoff values

In [None]:
print("Defaults\n")
ret, degs_default, degs_ensembl_default, dflfc_default=enr.open_case_params(case, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
# enr.echo_parameters(want_echo_default=False, jump_line=True)
degs_default_in_pathways=enr.degs_in_pathways

len(degs_default_in_pathways), ','.join(degs_default_in_pathways)

In [None]:
ret, degs_best, degs_ensembl_best, dflfc_best=enr.open_case(case)
print("Best params\n")
# enr.echo_parameters(want_echo_default=False, jump_line=True)

degs_in_pathways_best=enr.degs_in_pathways
degs_not_in_pathways_best=enr.degs_not_in_pathways

len(degs_in_pathways_best), len(degs_not_in_pathways_best)

### Complement the DEFAULT-cutoff set with other PROTEINS, selected randomly

In [None]:
len(degs_in_pathways_best)

### Getting random proteins=#(FOUND_BEST DAPs - FOUND_IN_DEFAULT DAPs)

In [None]:
# Rows of the original LFC table whose symbol is NOT among the default-cutoff
# DEGs/DAPs found in pathways, renumbered 0..n-1.
not_in_default=~enr.dflfc_ori.symbol.isin(degs_default_in_pathways)
dfa=enr.dflfc_ori[not_in_default].copy()
dfa.index=np.arange(len(dfa))
len(dfa)

In [None]:
enr.abs_lfc_cutoff, enr.fdr_lfc_cutoff, enr.pathway_fdr_cutoff

In [None]:
len(degs_in_pathways_best), len(degs_not_in_pathways_best), len(degs_default)

In [None]:
enr.get_best_ptw_cutoff_biopax()
enr.case, enr.quantile, enr.abs_lfc_cutoff, enr.fdr_lfc_cutoff, enr.pathway_pval_cutoff, enr.pathway_fdr_cutoff

In [None]:
abs_lfc_cutoff_default=1
fdr_lfc_cutoff_default=0.05
pathway_fdr_cutoff_default=0.05

i=0
df_enr=enr.calc_enriched_pathways_random_genes(i, case,abs_lfc_cutoff_default,
                                                 fdr_lfc_cutoff_default, pathway_fdr_cutoff_default)
print(len(df_enr))
df_enr.head(3)

In [None]:
enr.degs_in_pathways_random, enr.degs_not_in_pathways_random

In [None]:
enr.n_degs_in_pathways_random, enr.n_degs_not_in_pathways_random

In [None]:
enr.degs_in_pathways_best, enr.degs_not_in_pathways_best

In [None]:
enr.n_degs_in_pathways_best, enr.n_degs_not_in_pathways_best

### Prepare for chi-square test
  - best cutoff
  - default cutoffs (fdr=0.05,  abs_lfc=1)
  - case[0]=g2a_male, as example

In [None]:
# One chi-square comparison between the best cutoff and the default cutoff
# complemented with random genes (the 2x2 table described in the introduction).
i=0
# Re-runs the random-gene enrichment, which also refreshes the
# enr.n_degs_*_random counters read below.
# NOTE(review): no random seed is set in this notebook, so this result presumably
# changes from run to run - confirm whether the library seeds internally.
df_enr=enr.calc_enriched_pathways_random_genes(i, case,abs_lfc_cutoff_default,
                                                 fdr_lfc_cutoff_default, pathway_fdr_cutoff_default)

# Arguments in order: best (in / not in pathways), then random (in / not in pathways).
dfmat, ret_chi, dof, stat, pvalue, stri_stat=enr.build_matrix_calc_chi_square(enr.n_degs_in_pathways_best, 
                                                                                enr.n_degs_not_in_pathways_best,
                                                                                enr.n_degs_in_pathways_random, 
                                                                                enr.n_degs_not_in_pathways_random)

print(f">>> chi-square statistics: {ret_chi} {stri_stat}, dof={dof}")
dfmat

### 100 simulations

In [None]:
case_list

In [None]:
j=5
case=case_list[j]
case

In [None]:
print(">>>", case)
n_sim=100
# default cutoffs: abs LFC, FDR of the LFC table, FDR of the enriched pathways
abs_lfc_cutoff_default, fdr_lfc_cutoff_default, pathway_fdr_cutoff_default=1., 0.05, 0.05

force=False

dff=enr.run_n_simulations(n_sim, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default,
                          pathway_fdr_cutoff_default, force=force, verbose=False)
# make sure the p-values are numeric before filtering on them in the next cells
dff['pvalue']=dff['pvalue'].astype(float)
print(len(dff))
dff.head(10)

In [None]:
dff[dff.pvalue < 0.1]

In [None]:
pvalue_cutoff=0.1
len(dff[dff.pvalue < pvalue_cutoff]), len(dff[dff.pvalue >= pvalue_cutoff])

In [None]:
mu=dff.stat_sig.mean()
mu

In [None]:
mini, maxi=dff.pvalue.min(), dff.pvalue.max()
mini,maxi

### All 100 random tests are statistically different from the best cutoffs for g2a_male

In [None]:
enr.case

In [None]:
cutoff_pvalue=0.25

In [None]:
n_sim=100
# default cutoffs: abs LFC, FDR of the LFC table, FDR of the enriched pathways
abs_lfc_cutoff_default, fdr_lfc_cutoff_default, pathway_fdr_cutoff_default=1., 0.05, 0.05

force=False

# For every case: open it, run the simulations and count how many p-values fall
# below / at-or-above cutoff_pvalue (set in the previous cell).
for case in case_list:
    ret, _, _, _=enr.open_case(case, verbose=False)

    dff=enr.run_n_simulations(n_sim, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default,
                              pathway_fdr_cutoff_default, force=force, verbose=False)
    dff['pvalue']=dff['pvalue'].astype(float)

    n=len(dff)
    n_below=(dff['pvalue'] < cutoff_pvalue).sum()
    n_above=(dff['pvalue'] >= cutoff_pvalue).sum()
    # len(dff[dff.stat_sig == False])

    print(f">>> {case} - tested {n} simulations, cutoff pvalue={cutoff_pvalue}")
    print(f"{n} comparisons {n_below}/{n_above}")
    print("")

### Running all cases

In [None]:
enr.set_db(0, verbose=False)
text=enr.degs_to_text_all_cases_summary(verbose=False)
print(text)

In [None]:
# Optional full pass over all cases (off by default): echoes the parameters of
# each case and collects every DEG/DAP symbol, in or out of pathways.
want_to_run=False

if want_to_run:
    enr.cfg.open_best_ptw_cutoff()
    
    enr.echo_default()
    print("")
    
    all_degs=[]
    for case in case_list:
        print(">>>", case)
        ret, degs, degs_ensembl, dfdegs=enr.open_case(case, verbose=False)
        
        # a falsy `ret` is treated as "case could not be opened": report and move on
        if not ret:
            print(f"\nError?? case {case}")
            enr.echo_degs()
            print("")
            continue
    
        enr.echo_parameters(want_echo_default=False, jump_line=True)
        print("")
        all_degs += enr.degs_in_pathways + enr.degs_not_in_pathways
        # NOTE(review): second echo for the same case, this time with default arguments
        enr.echo_parameters()
        print("")
        
    # np.unique removes symbols shared between cases and returns them sorted
    all_degs=np.unique(all_degs)
    print(f"There are {len(all_degs)} {enr.s_gene_protein}s in all cases ")
    print("\nall degs:", "; ".join(all_degs))
    print("\n\n")

In [None]:
cols2=['case', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff',  'n_pathways', 'n_degs_in_pathways']

dfbest=enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest2=dfbest[dfbest.med_max_ptw == 'median']
# print([True if x in dfbest.columns else False for x in cols2])
dfbest2[cols2]

In [None]:
# For every case draw the selected index against genes first, then against pathways.
for case in case_list:
    print(">>>", case)
    for _plot in ('genes', 'pathways'):
        print(_plot)
        fig=enr.plot_index_versus_genes_and_pathways(case=case, selected_toi_col='toi4_median',
                                                     _plot=_plot, width=1100, height=450,
                                                     plot_all_dfi=False)
        fig.show()