In [3]:
from platform import python_version
print(python_version())

3.12.0


### Calc best cutoffs
### Simulate many indexes
### Calc if random genes can be equal or better than best cutoffs

### Definitions:
  - LFC table has:
    - abs_LFC (the absolute LFC cutoff value)
    - FDR_LFC, its FDR or p-value adjusted
  - The enriched pathway table has:
    - FDR_pathway cutoff value
   
### Default values for LFC table:
  - abs_LFC = 1
  - FDR_LFC = 0.05
  - therefore, a DEG/DAP is abs(LFC) >= 1 and FDR < 0.05

### Default values for Enriched Pathways:
  - FDR_pathway = 0.05
  - therefore, an enriched pathways has FDR < 0.05 and at least 3 DEGs/DAPs

### Calculating the best cutoffs:
  - We proposed and calculated many indexes to define a new statistic that makes the LFC and Enriched Pathway cutoffs more flexible.
    - Indexes are calculated for each case, each cutoff, and each resulting enriched pathway.
  - To find the possible best LFC/FDR expression and FDR pathway cutoffs:
     - We look for a high number in n_pathway and n_DEGs_in_pathway, having a low FDR_LFC and a high absLFC.
       - The default FDR_LFC (0.05):
          - It may have fewer DEGs, resulting in fewer enriched pathways.
          - It may have fewer enriched pathways, even having many DEGs/DAPs.
       - Therefore, a trade-off exists between optimizing (abs_LFC and FDR_LFC cutoffs) and (FDR_pathway cutoffs, n_pathways, and n_DEGs_in_pathways.)

### toi calculates the trade-off between "LFC" and "Enriched Pathways" cutoffs

  - We proposed and calculated four Trade-Off Indexes (toi):

<p style="font-size: 20px; color: yellow;">
$toi1 = \sqrt{-log{_{10}}{FDR_{pathway}} * \frac{n}{N} }$ </p>

<p style="font-size: 20px; color: cyan;">
$toi2 = \sqrt{-log{_{10}}{FDR_{LFC}} * -log{_{10}}{FDR_{pathway}} }$ </p>

<p style="font-size: 20px; color: orange;">
$toi3 = (-log{_{10}}{FDR_{LFC}} * -log{_{10}}{FDR_{pathway}} * \frac{n}{N})^{1/3}$ </p>

<p style="font-size: 20px; color: pink;">
$toi4 = (abs\_LFC * -log{_{10}}{FDR_{LFC}} * -log{_{10}}{FDR_{pathway}} * \frac{n}{N})^{1/4}$ </p>

where,
  - n is the number of DEGs/DAPs found in the pathway
  - N is the total number of annotated DEGs/DAPs in the pathway (it depends on the database; our default database is Reactome 2022)

### Then we searched for the best cutoffs
  - In each 5 percentile of the index histogram, we look for the best abs_LFC, FDR_LFC, FDR_pathway:
  -  We expected that the best cutoff should be in the right tail of the histogram (high index value.)
  -  High index values must have a high number of n_pathways and n DEGs in pathways.

### Testing the best cutoffs (for each case)

  - Is the new set of cutoffs correct? good enough?
  - How to establish that the calculated cutoff is correct?
  - To answer these questions we calculated the chi-square test between the "best cutoff" and the "default"
    - Best cutoff has:
      - n DEGs/DAPs in pathways
      - n DEGs/DAPs not in pathways
    - The Default cutoff may have:
      - n DEGs/DAPs in pathways
      - n DEGs/DAPs not in pathways
      - The DEGs/DAPs can be:
        - greater or equal number of the best cutoff DEGs/DAPs
        - fewer number of the best cutoff DEGs/DAPs:
           - in this case, one complements the number of DEGs/DAPs with random genes not DEGs/DAPs (found in the experiment)

#### Chi-square test:

DEGs/DAPs | # in pathway | # not in pathway
--- | --- | --- 
 Best cutoff |     A      |   B  
 Default cutoff |   C | D 

Chi-square p-value:
  - p-value < 0.05 denotes that both distributions are not similar; therefore, random genes could not reach the best cutoff DEGs/DAPs; in conclusion, the best cutoff was not found randomly.
  - p-value \>= 0.05 denotes that both distributions are similar, and the best cutoff can be achieved randomly.

In [4]:
import json, requests
import os, sys
import pandas as pd

sys.path.insert(1, '../src/')

from Basic import *
from enricher_lib import *
from biopax_lib import *
from config_lib import *
from stat_lib import *
from graphic_lib import *

pd.set_option("display.precision", 3)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0      = '../../colaboracoes/aparecida/'

project = 'Medulloblastoma microarray study'
s_project = 'medulloblastoma'

gene_protein = 'dna'
s_omics = 'microarray'

has_age = False
has_gender = False

want_normalized = False
exp_normalization='quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

abs_lfc_cutoff_inf = 0.80
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr=3

#------------ pathway pseudo-modulation index ------------
tolerance_pathway_index = 0.15
type_sat_ptw_index = 'linear_sat'
saturation_lfc_index = 5

case_list = ['WNT', 'G4']
case = case_list[0]

cfg = Config(project, s_project, case_list, root0)

# Sentinel placeholders; note that n_degs is overwritten by the lookup on the next line.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1, -1, -1, -1, -1
# Pass the `normalization` label computed above instead of a hard-coded string,
# so the lookup stays consistent when `want_normalized` is changed.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

G/P LFC cutoffs: lfc=1.000; fdr=0.050
Pathway cutoffs: pval=0.050; fdr=0.050; num of genes=3


In [7]:
pathway_name_id = 'Sensory Processing Of Sound By Inner Hair Cells Of Cochlea - R-HSA-9662360'
pathway_name_id = 'Cardiac Conduction - R-HSA-5576891'
pathway_name_id = 'RHOB GTPase Cycle - R-HSA-9013026'
pathway_name_id = 'Gap Junction Assembly - R-HSA-190861'
pathway_name_id = 'Opioid Signaling - R-HSA-111885'
pathway_name_id = 'Neuronal System - R-HSA-112316'

enr = enricheR(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, clone_objects=False,
             exp_normalization=exp_normalization, geneset_num=0, 
             num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr, 
             tolerance_pathway_index=tolerance_pathway_index, 
             s_pathw_enrichm_method = s_pathw_enrichm_method,
             abs_lfc_cutoff_inf = abs_lfc_cutoff_inf, 
             type_sat_ptw_index=type_sat_ptw_index, saturation_lfc_index=saturation_lfc_index)

case = case_list[0]

enr.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
print("\nEcho Parameters:")
enr.echo_parameters()

geneset_num = enr.geneset_num

Start opening tables ....
Building synonym dictionary ...


Echo Parameters:
For case 'WNT', there are 1043/766 DEGs/DEGs with ensembl_id
DEG's cutoffs: abs(LFC)=1.000; FDR=0.130
	1043/766 DEGs/ensembl.
		Up 340/218 DEGs/ensembl.
		Dw 703/548 DEGs/ensembl.

Found 68 (best=68) pathways for geneset num=0 'Reactome_2022'
Pathway cutoffs p-value=0.050 fdr=0.150 min genes=3
DEGs found in enriched pathways:
	There are 766 DEGs found in pathways
	279 (best=279) DEGs in pathways and 764/487 DEGs/ensembl not in pathways

	74 DEGs ensembl Up in pathways
	144 DEGs Up ensembl not in pathways

	205 DEGs ensembl Dw in pathways
	343 DEGs Dw ensembl not in pathways


In [None]:
enr.case, enr.group, enr.gender, enr.age, enr.geneset_num, enr.abs_lfc_cutoff_inf

### BCA parameters

In [None]:
enr.abs_lfc_cutoff, enr.fdr_lfc_cutoff, enr.pathway_fdr_cutoff

### Reference database

In [None]:
enr.geneset_num, enr.geneset_lib, enr.dbs_list

### Define database: 0 is Reactome

In [None]:
want = False

if want:
    geneset_num = 0
    enr.set_db(geneset_num, verbose=True)
enr.geneset_num, enr.geneset_lib

### The second case

In [None]:
i=1
case = case_list[i]

ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
print(f"G/P cutoff: lfc={enr.abs_lfc_cutoff:.3f}; lfc_fdr={enr.fdr_lfc_cutoff:.3f}")
len(degs), len(degs_ensembl)

### BCA parameters

In [None]:
enr.abs_lfc_cutoff, enr.fdr_lfc_cutoff, enr.pathway_fdr_cutoff

In [None]:
enr.n_degs_in_pathways, enr.n_degs_ensembl_in_pathways

In [None]:
degs_diff = [x for x in enr.degs_in_pathways if x not in enr.degs_ensembl_in_pathways]
np.array(degs_diff)

In [None]:
# dflfc_ori.columns

In [None]:
gene = 'ADAMTSL3'

dflfc_ori = enr.dflfc_ori
cols = ['probe', 'symbol', 'symbol_prev', 'symb_or_syn', 'accession', 'ensembl_id', 'biotype', '_type',
       'lfc', 'abs_lfc', 'fdr',  'description',  'desc_gff', 'description_prev',  ]

df2 = dflfc_ori[dflfc_ori.symbol == gene][cols]
df2

In [None]:
ensembl_id = df2.iloc[0].ensembl_id
print(ensembl_id)
gene in enr.degs_ensembl

In [None]:
dflfc = enr.dflfc
dflfc = dflfc.sort_values('abs_lfc', ascending=False)
dflfc.head(6)

In [None]:
gene = 'SOX2'
df2 = dflfc_ori[dflfc_ori.symbol == gene][cols]
df2

In [None]:
df2 = dflfc[dflfc.symbol == gene][cols]
df2

In [None]:
gene in enr.degs_ensembl

### build_all_cutoffs_table(col)
  - loop case_list
    - best_cutoff_quantiles()
      - calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib

### Next, calc_best_cutoffs_params()

In [None]:
case = 'WNT'
dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
df2 = dfi[ (dfi.case == case)]
maxi = df2.toi4_median.max()
median = df2.toi4_median.median()
maxi, median

In [None]:
case = 'G4'
dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
df2 = dfi[ (dfi.case == case)]
maxi = df2.toi4_median.max()
median = df2.toi4_median.median()
maxi, median

In [None]:
cols = ['case', 'geneset_num', 'normalization', 'med_max_ptw', 'parameter', 'quantile',
        'quantile_val', 'quantile_val_inf', 'quantile_val_sup', 
        'abs_lfc_cutoff', 'fdr_lfc_cutoff', 
        'pathway_pval_cutoff', 'pathway_fdr_cutoff', 'num_of_genes_cutoff',
        'n_pathways', 'n_degs_in_pathways', 
        'n_degs_in_pathways_mean', 'n_degs_in_pathways_median', 'n_degs_in_pathways_std', 
        'toi1_mean', 'toi1_median', 'toi1_std',
        'toi2_mean', 'toi2_median', 'toi2_std',
        'toi3_mean', 'toi3_median', 'toi3_std',
        'toi4_mean', 'toi4_median', 'toi4_std',]


cols = ['case', 'med_max_ptw', 'quantile', 'toi4_median', 'abs_lfc_cutoff', 'fdr_lfc_cutoff',
        'pathway_pval_cutoff', 'pathway_fdr_cutoff', 'n_pathways', 'n_degs_in_pathways',
        'toi1_median', 'toi2_median', 'toi3_median']

### Look for different approaches (sorting)

In [None]:
case = 'WNT'

In [None]:
col = 'toi4_median'
dfcut = enr.build_all_cutoffs_table(selected_toi_col=col, force=False, verbose=False)
df2 = dfcut[(dfcut.case == case) & (dfcut.n_degs_in_pathways > 3) & (dfcut.med_max_ptw == 'median')]
df2 = df2.sort_values(col, ascending=False)
df2[cols].head(6)

In [None]:
colors=['navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
        'magenta', 'darkturquoise', 'orange', 'darkred', 'indigo', 'magenta', 'maroon', 'black',
        'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'olivedrab', 'navy'] + plotly_colors_proteins

cols = ['case', 'toi4_median', 'med_max_ptw',  'quantile', 'quantile_val_inf', 'quantile_val_sup',
        'abs_lfc_cutoff', 'fdr_lfc_cutoff', 'pathway_fdr_cutoff',  'n_pathways',
       'n_degs_in_pathways']  #  'toi1_median', 'toi2_median',  'toi3_median',

In [None]:
selected_toi_col = 'toi4_median'

In [None]:
fig = enr.plot_degs_in_pathways_vs_toi_per_case(selected_toi_col=selected_toi_col, title=None, plot_all_dfi=False,
                                                  width=1100, height=600, sel_colors=None, plot_bgcolor='lightgray', verbose=False)

fig.show()

### Open best cutoffs (config)

In [None]:
dfbest = enr.cfg.open_best_ptw_cutoff(verbose=False)
dfbest2 = dfbest[dfbest.med_max_ptw == 'median']
dfbest2[cols]

### Is this cut statistically correct?
  - Chi-square test - confusion matrix:
    - the best defined cutoff
    - against the default cutoff (LFC=1, FDR=0.05) + random genes (all other genes from the microarray experiment)
   
### Test retrieve best param

In [None]:
case = case_list[0]
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
enr.case

In [None]:
enr.case, enr.quantile, enr.abs_lfc_cutoff, enr.fdr_lfc_cutoff, enr.pathway_pval_cutoff, enr.pathway_fdr_cutoff

### Prepare for chi-square test
  - best cutoff
  - default cutoffs (fdr=0.05, abs_lfc=1)
  - case_list[0] = WNT, as example

### Default cutoff values x BCA

In [None]:
for case in case_list:
    print(">>> case", case, "\n")
    
    print("Default params:")
    
    ret, degs_default, degs_ensembl_default, dflfc_default = enr.open_case_params(case, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05, verbose=False)
    # enr.echo_parameters(want_echo_default=False, jump_line=True)
    degs_in_pathways_default = enr.degs_in_pathways
    degs_not_in_pathways_default = enr.degs_not_in_pathways

    print(f"abs_lfc_cutoff={enr.abs_lfc_cutoff}, fdr_lfc_cutoff={enr.fdr_lfc_cutoff}, pathway_fdr_cutoff={enr.pathway_fdr_cutoff}")
    print(f"degs {len(degs_default)}, degs with ensembl_id {len(degs_ensembl_default)}")
    
    if enr.n_pathways == 0:
        print("No pahtway enriched.")
    else:
        print(f"degs in pathways={len(degs_in_pathways_default)}, degs not in pathways={len(degs_not_in_pathways_default)}")
        print(f"n pathways {enr.n_pathways}")

    print("")
    ret, degs_bca, degs_ensembl_bca, dflfc_best = enr.open_case(case)
    print("BCA params:")
    # enr.echo_parameters(want_echo_default=False, jump_line=True)
    
    degs_in_pathways_bca = enr.degs_in_pathways
    degs_not_in_pathways_bca = enr.degs_not_in_pathways

    print(f"abs_lfc_cutoff={enr.abs_lfc_cutoff}, fdr_lfc_cutoff={enr.fdr_lfc_cutoff}, pathway_fdr_cutoff={enr.pathway_fdr_cutoff}")
    print(f"degs {len(degs_bca)}, degs with ensembl_id {len(degs_ensembl_bca)}")

    if enr.n_pathways == 0:
        print("No pahtway enriched.")
    else:
        print(f"degs in pathways={len(degs_in_pathways_bca)}, degs not in pathways={len(degs_not_in_pathways_bca)}")
        print(f"n pathways {enr.n_pathways}")  # {len(enr.df_enr)}

    print("\n")

### Getting random GENES = #(FOUND_BEST DEGS - FOUND_IN_DEFAULT DEGS)

In [None]:
i = 0
case = case_list[i]
ret, degs_default, degs_ensembl_default, dflfc_default = enr.open_case_params(case, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05, verbose=False)
# enr.echo_parameters(want_echo_default=False, jump_line=True)
degs_in_pathways_default = enr.degs_in_pathways
degs_not_in_pathways_default = enr.degs_not_in_pathways

ret, degs_best, degs_ensembl_best, dflfc_best = enr.open_case(case, verbose=False)

degs_in_pathways_best = enr.degs_in_pathways
degs_not_in_pathways_best = enr.degs_not_in_pathways

mat = np.array([ [len(degs_in_pathways_default), len(degs_not_in_pathways_default)], 
                 [len(degs_in_pathways_best), len(degs_not_in_pathways_best)]])
mat

### Are the new discovery DEGs FP or TP?

In [None]:
new_degs = [x for x in degs_in_pathways_best if x not in (degs_in_pathways_default) ]
print(f"Default DEGs in pahtways {len(degs_in_pathways_default)} x new discovered DEGs {len(new_degs)}")

In [None]:
enr.get_best_ptw_cutoff_biopax()
f"case = {enr.case}, abs_lfc_cutoff = {enr.abs_lfc_cutoff}, fdr_lfc_cutoff = {enr.fdr_lfc_cutoff}, pathway_fdr_cutoff = {enr.pathway_fdr_cutoff}"

In [None]:
abs_lfc_cutoff_default = 1
fdr_lfc_cutoff_default = 0.05
pathway_fdr_cutoff_default = 0.05
print(">>>", enr.case)
i = 0
df_enr = enr.calc_enriched_pathways_random_genes(i, case, abs_lfc_cutoff_default,
                                                 fdr_lfc_cutoff_default, pathway_fdr_cutoff_default, prompt_verbose=True)

if df_enr is None:
    df_enr = pd.DataFrame()

print(len(df_enr))
df_enr.head(3)

In [None]:
enr.n_degs_in_pathways_bca, enr.n_degs_not_in_pathways_bca

In [None]:
enr.n_degs_in_pathways_random, enr.n_degs_not_in_pathways_random

In [None]:
enr.degs_in_pathways_random[:5], enr.degs_not_in_pathways_random[:5]

### Prepare for chi-square test
  - best cutoff
  - default cutoffs (fdr=0.05, abs_lfc=1)
  - case_list[0] = WNT, as example

In [None]:
i = 0
case = case_list[i]
abs_lfc_cutoff_default = 1
fdr_lfc_cutoff_default = 0.05
pathway_fdr_cutoff_default = 0.05

df_enr = enr.calc_enriched_pathways_random_genes(i, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default,
                                                 pathway_fdr_cutoff_default, prompt_verbose=True)

dfmat, ret_chi, dof, stat, pvalue, stri_stat = enr.build_matrix_calc_chi_square(n_degs_in_pathways_bca=enr.n_degs_in_pathways_bca, 
                                                                                n_degs_not_in_pathways_bca=enr.n_degs_not_in_pathways_bca,
                                                                                n_degs_in_pathways_default=enr.n_degs_in_pathways_random, 
                                                                                n_degs_not_in_pathways_random=enr.n_degs_not_in_pathways_random)

print(f">>> chi-square statistics: {ret_chi} {stri_stat}, dof={dof}")
dfmat

In [None]:
i = 1
case = case_list[i]
abs_lfc_cutoff_default = 1
fdr_lfc_cutoff_default = 0.05
pathway_fdr_cutoff_default = 0.05

df_enr = enr.calc_enriched_pathways_random_genes(i, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default,
                                                 pathway_fdr_cutoff_default, prompt_verbose=True)

dfmat, ret_chi, dof, stat, pvalue, stri_stat = enr.build_matrix_calc_chi_square(n_degs_in_pathways_bca=enr.n_degs_in_pathways_bca, 
                                                                                n_degs_not_in_pathways_bca=enr.n_degs_not_in_pathways_bca,
                                                                                n_degs_in_pathways_default=enr.n_degs_in_pathways_random, 
                                                                                n_degs_not_in_pathways_random=enr.n_degs_not_in_pathways_random)

print(f">>> chi-square statistics: {ret_chi} {stri_stat}, dof={dof}")
dfmat

### 100 simulations

In [None]:
case_list

In [None]:
n_sim = 100
abs_lfc_cutoff_default = 1.
fdr_lfc_cutoff_default = 0.05
pathway_fdr_cutoff_default = 0.05

want = True
force = True

In [None]:
case = 'WNT'
print(">>>", case)

if want:
    dff_wnt = enr.run_n_simulations(n_sim, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default, pathway_fdr_cutoff_default,
                                    force=force, verbose=False)
    print(len(dff_wnt))
else:
    dff_wnt = pd.DataFrame()

dff_wnt.head(10)

In [None]:
dff_wnt[dff_wnt.stat_sig != True]

In [None]:
dff_wnt.stat_sig.mean()

In [None]:
case = 'G4'

if want:
    dff_g4 = enr.run_n_simulations(n_sim, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default, pathway_fdr_cutoff_default,
                                   force=force, verbose=False)
    print(len(dff_g4))
else:
    dff_g4 = pd.DataFrame()

dff_g4.head(10)

In [None]:
dff_g4[dff_g4.stat_sig != True]

In [None]:
dff_g4.stat_sig.mean()

### All 100 random tests are statistically different from the best cutoffs for g2a_male

  - To rerun the simulations, set force = True

In [None]:
want = True
dic = {}

if want:
    n_sim = 100
    abs_lfc_cutoff_default = 1.
    fdr_lfc_cutoff_default = 0.05
    pathway_fdr_cutoff_default = 0.05
    
    force = False
    
    for case in case_list:
        print(">>>", case, end = ' ')
        dff = enr.run_n_simulations(n_sim, case, abs_lfc_cutoff_default, fdr_lfc_cutoff_default, pathway_fdr_cutoff_default, force=force, verbose=False)
        dff['FDR'] = fdr(dff.pvalue)
        print(len(dff), "/", len(dff[dff.stat_sig == False]))
        dic[case] = dff
    

In [None]:
dfchi_wnt = dic['WNT']
print(len(dfchi_wnt))
dic2 = dfchi_wnt[dfchi_wnt.FDR > 0.05]
dic2

In [None]:
dfchi_g4 = dic['G4']
print(len(dfchi_g4))
dic2 = dfchi_g4[dfchi_g4.FDR > 0.05]
dic2

### Running all cases

In [None]:
enr.cfg.open_best_ptw_cutoff()

enr.echo_default()
print("")

all_degs = []
for case in case_list:
    print(">>>", case)
    ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)
    
    if not ret:
        print(f"\nError?? case {case}")
        enr.echo_degs()
        print("")
        continue

    enr.echo_parameters(want_echo_default=False, jump_line=True)
    print("")
    all_degs += enr.degs_in_pathways + enr.degs_not_in_pathways
    print("")

print_all = False

all_degs = np.unique(all_degs)
print(f"There are {len(all_degs)} {enr.s_deg_dap}s in all cases ")
if print_all: print("\n\nall degs:", "; ".join(all_degs))
print("\n")

### Enriched Pathways

### WNT

In [None]:
case = case_list[0]
ret, degs, degs_ensembl, dflfc = enr.open_case(case)
print(len(enr.df_enr))
enr.df_enr.head(50)

### Group 4

In [None]:
case = case_list[1]
ret, degs, degs_ensembl, dflfc = enr.open_case(case)
print(len(enr.df_enr))
enr.df_enr.head(50)

### Development & tests

### Genes stat for one Case

In [None]:
dfa = enr.calc_only_genes_in_pathway_per_case(force=False, verbose=False)
print(len(dfa))
dfa.head()

In [None]:
case = case_list[0]
verbose=False

dfq = enr.calc_pathway_gene_index_per_case(case, verbose=verbose)
print(len(dfq))
dfq.head()

### Summary genes

In [None]:
force=False
prompt_verbose=False
save_up_down = False

dfa = enr.calc_all_genes_in_pubmed_per_case(force=force, prompt_verbose=prompt_verbose, verbose=False)

print(len(dfa))
dfa.head(3)

### Summary pathways and quantiles

In [None]:
force=False
prompt_verbose=False

enr.calc_pathway_and_quantile_summary(quantile_list=[0.5, 0.75, 0.9],force=force, prompt_verbose=prompt_verbose, verbose=False)

df_enr_summ = enr.df_enr_summ
df_enr_anal = enr.df_enr_anal
df_quant_summ = enr.df_quant_summ

print(len(df_enr_summ))
df_enr_summ

In [None]:
dfqq = df_enr_summ[df_enr_summ.case == 'g2a_female']
dfqq

In [None]:
# Count the rows of df_enr_anal per quantile for one case and keep the first
# two columns (the group key and one count column).
# NOTE(review): 'g2a_male' is not one of this notebook's cases
# (case_list = ['WNT', 'G4']), so this filter presumably selects nothing here -
# confirm which case was intended.
dfqq = (
    df_enr_anal[df_enr_anal.case == 'g2a_male']
    .groupby('quantile')
    .count()          # was `.coundegs_bestt()`, which is not a GroupBy method
    .reset_index()
    .iloc[:, :2]
)
dfqq.columns = ['quantile', 'n_pathways']
dfqq

In [None]:
print(len(df_enr_anal))
df_enr_anal.head(2)

In [None]:
df_quant_summ = enr.df_quant_summ
print(len(df_quant_summ))
df_quant_summ.head(3)

In [None]:
width=800; height=400
col = 'mean_toi1'

# Read the quantile levels from dfcut itself: `quantile_list` is only defined in
# a later cell, so relying on it breaks a top-to-bottom run of the notebook.
plot_quantiles = sorted(dfcut['quantile'].dropna().unique())

# One bar chart for the gene counts and one for the pathway counts,
# with one trace per quantile level.
for _plot in ['genes', 'pathways']:
    fig = go.Figure()

    for quantile in plot_quantiles:
        df2 = dfcut[dfcut['quantile'] == quantile]
        if df2.empty: continue

        if _plot == 'genes':
            # dfcut exposes the gene count as 'n_degs_in_pathways'
            # (the same column is used when dfcut is filtered earlier in the notebook).
            fig.add_trace(go.Bar(x=df2.case, y=df2.n_degs_in_pathways, name=f'{quantile:.3f}'))
        else:
            fig.add_trace(go.Bar(x=df2.case, y=df2.n_pathways, name=f'{quantile:.3f}'))

    fig.update_layout(title=f"{_plot} frequency per quantile cutoff",
                      width=width,
                      height=height,
                      xaxis_title='cases',
                      yaxis_title=f"{_plot} counts",
                      legend_title="Quantiles",
                      showlegend=True)


    fig.show()

In [None]:
width=1100; height=700
col = 'mean_toi1'

# Two stacked panels: gene counts on top, pathway counts below.
fig = make_subplots(rows=2, cols=1, subplot_titles=['genes', 'pathways'])

# Read the quantile levels from dfcut itself: `quantile_list` is only defined in
# a later cell, so relying on it breaks a top-to-bottom run of the notebook.
plot_quantiles = sorted(dfcut['quantile'].dropna().unique())

nrow = 0; ncol=1
for subplot in ['genes', 'pathways']:
    nrow += 1

    for quantile in plot_quantiles:
        df2 = dfcut[dfcut['quantile'] == quantile]
        if df2.empty: continue

        if subplot == 'genes':
            # dfcut exposes the gene count as 'n_degs_in_pathways'
            # (the same column is used when dfcut is filtered earlier in the notebook).
            fig.add_trace(go.Bar(x=df2.case, y=df2.n_degs_in_pathways, name=f'genes-{quantile:.3f}'), row=nrow, col=ncol)
        else:
            fig.add_trace(go.Bar(x=df2.case, y=df2.n_pathways, name=f'pathways-{quantile:.3f}'), row=nrow, col=ncol)

fig.update_layout(title=f"{enr.project}",
                  width=width,
                  height=height,
                  xaxis_title='cases',
                  yaxis_title="counts",
                  showlegend=True)


fig.show()

In [None]:
pd.DataFrame(dfcut.iloc[0]).T

### Build best params

In [None]:
# For every case, rank the cutoff rows (most pathways first, then most DEGs in
# pathways, then the smallest pathway FDR cutoff) and keep the rows that share
# the quantile of the top-ranked row.
df_list = []
for case in case_list:
    # The sort key must be a column kept by `cols`; the gene-count column is
    # 'n_degs_in_pathways' ('n_genes_in_pahtways' is not selected and raised KeyError).
    dfq = dfcut[dfcut.case == case][cols].sort_values(['n_pathways', 'n_degs_in_pathways', 'pathway_fdr_cutoff'], ascending=[False, False, True])
    best_qtl = dfq.iloc[0]['quantile']
    print(">>>", case, best_qtl)
    dfa = dfq[dfq['quantile'] == best_qtl]
    df_list.append(dfa)

dfconfig = pd.concat(df_list)
dfconfig.index = np.arange(0, len(dfconfig))
dfconfig

### Save best params

In [None]:
enr.cfg.save_best_ptw_cutoff(dfconfig, force=True, verbose=True)

```
def get_any_ptw_cutoff(enr, case:str, normalization:str, geneset_num:int,
                            verbose:bool=False) -> (float, float, int, int, int, float):
        if enr.dfbest_ptw_cutoff is None:
            _ = enr.open_best_ptw_cutoff()
            if enr.dfbest_ptw_cutoff is None:
                if verbose:
                    print("Houston we have a problem: No best parameter file for Pathways was found.")
                    print(">>> run: new06_enricher_statistics_and_save_config_table.ipynb")
                return enr.param_ptw_defaults

        dfa = enr.dfbest_ptw_cutoff[(enr.dfbest_ptw_cutoff.case == case) & (enr.dfbest_ptw_cutoff.normalization == normalization) &
                                     (enr.dfbest_ptw_cutoff.geneset_num == geneset_num) & (enr.dfbest_ptw_cutoff['quantile'] == quantile) ]
        if dfa.empty:
            return enr.param_ptw_defaults
```            

In [None]:
files = [x for x in os.listdir(enr.root_enrichment) if '_pathway_pval_' in x \
                 and case in x and enr.geneset_lib in x and enr.normalization in x and not '~lock' in x]
len(files)

In [None]:
" - ".join(files[0].split("_"))

In [None]:
df_best = enr.cfg.dfbest_ptw_cutoff
df_best.head(2)

In [None]:
dfq = df_best[(df_best.case == case) & (df_best.geneset_num == geneset_num) & \
              (df_best.normalization == normalization) ]

row = dfq.iloc[0]
row

In [None]:
force=False; verbose=False

dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=force, verbose=verbose)
print(dfi.columns)
print(len(dfi))

In [None]:
col='mean_toi1'

quantiles = np.quantile(dfi[col], quantile_list)
quantiles

In [None]:
perc_tolerance=0.05

# Walk through the quantile values and derive the [lim_inf, lim_sup] interval
# for each one. After the loop, lim_inf/lim_sup hold the last interval, which
# the following cells use to filter dfi.
n_quantiles = len(quantiles)
for i in range(n_quantiles):
    lim_inf = quantiles[i]

    # The upper limit is the next quantile value when there is one. The last
    # entry has no neighbour (indexing i+1 would raise IndexError), so the
    # interval is left open with +inf, which still works in `<=` comparisons
    # (None does not).
    if quantiles[i] < 0.95 and i + 1 < n_quantiles:
        lim_sup = quantiles[i+1]
    else:
        lim_sup = float('inf')

    # quantile_list[i] is the level that produced quantiles[i]
    # (quantiles was computed from quantile_list in the previous cell).
    print(lim_inf, quantile_list[i], lim_sup)

In [None]:
df2 = dfi[ (dfi[col] >= lim_inf) & (dfi[col] <= lim_sup) ].copy()
df2 = df2.sort_values(['lfc_cutoff', 'fdr_cutoff', 'pathway_fdr_cutoff'], ascending=[False, True, True])
df2.head(6)

In [None]:
df2 = dfi[ (dfi[col] >= lim_inf) & (dfi[col] <= lim_sup) ].copy()
df2 = df2.sort_values(['pathway_fdr_cutoff', 'fdr_cutoff', 'lfc_cutoff'], ascending=[True, True, False])
df2.head(6)

In [None]:
# case_list holds only two cases in this notebook, so index 4 raised IndexError;
# use the first case instead.
case = case_list[0]
print(">>>", case)
# NOTE(review): other cells name the index columns 'toi1_mean' / 'toi1_median';
# confirm that 'mean_toi1' is still a valid column for best_cutoff_quantiles.
col='mean_toi1'
quantile_list=(0.50, 0.75, .90)
force = False
verbose = False

dic_quant = enr.best_cutoff_quantiles(case, col, quantile_list, force=force, verbose=verbose)
print(len(dic_quant))

In [None]:
print(len(df_enr_mod))
df_enr_mod.head(6)

In [None]:
df_enr0 = enr.df_enr0
print(len(df_enr0))
df_enr0.head(6)

### Development & tests

In [None]:
def all_equal_list(cols1, cols2):
    """Tell whether two sequences hold equal items in the same order.

    Parameters
    ----------
    cols1, cols2 : sequence or None
        The sequences to compare (e.g. lists of column names).

    Returns
    -------
    bool
        True when both arguments are None, or when both have the same length
        and every pair of items at the same position compares equal.
        False otherwise, including when exactly one argument is None
        (that case used to raise TypeError on ``len(None)``).
    """
    if cols1 is None or cols2 is None:
        return cols1 is None and cols2 is None

    if len(cols1) != len(cols2):
        return False

    # Two empty sequences fall through here and compare equal.
    return all(a == b for a, b in zip(cols1, cols2))

cols1 = list(enr.dflfc_all.columns)

cols2 = ['probe', 'symbol', 'geneid', 'description', 'logFC', 'meanExpr', \
        't.stat', 'p-value', 'fdr', 'B', 'chr.range', 'org.chromosome', \
        'forward.reverse', 'nuc.sequence', 'gemmaid', 'go.term']

all_equal_list(cols1, cols2)

In [None]:
all_genes = []
for i in range(len(dfsig)):
    genes = eval(dfsig.iloc[i].genes)
    # print(i, len(genes))
    all_genes += genes
    
all_genes = np.unique(all_genes)
all_genes.sort()
all_genes

not_found_genes = np.array([x for x in enr.deg_list if not x in all_genes])
not_found_genes

In [None]:
def find_hugo_symbol(gene):
    """Return the first synonym stored for ``gene``, or ``gene`` itself on failure.

    The gene is looked up in ``enr.dic_genes`` to get a row position (the first
    one when a list is stored); the 'synonyms' entry of that row of
    ``enr.df_synonyms`` is read and its first element is returned.
    Presumably the first synonym is the official (HUGO) symbol - confirm
    against the code that builds ``enr.df_synonyms``.

    Relies on the notebook-level ``enr`` object.

    Parameters
    ----------
    gene : str
        Gene symbol to resolve.

    Returns
    -------
    The first element of the synonyms entry, or ``gene`` unchanged when any
    step of the lookup fails.
    """
    try:
        i = enr.dic_genes[gene]
        if isinstance(i, list):
            i = i[0]

        mat = enr.df_synonyms.iloc[i]['synonyms']
        print(">>>", gene, i, mat, type(mat))
        if isinstance(mat, str):
            # NOTE(review): eval() executes arbitrary code; if the synonyms table can
            # come from an untrusted file, parse it with ast.literal_eval instead.
            mat = eval(mat)

        gene0 = mat[0]
    except Exception:
        # Best-effort lookup: any failure falls back to the input symbol.
        # (A bare `except:` would also swallow KeyboardInterrupt/SystemExit.)
        gene0 = gene

    return gene0

In [None]:
gene = 'SEPP1'
find_hugo_symbol(gene)

In [None]:
enr.dic_genes[gene]

In [None]:

def calc_enriched_pathways_random_genes(degs_best, degs_default, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05,
                                        fdr_pathway_cutoff=0.05, verbose=False):
    """Pad the default DEG list with random genes and re-run the enrichment.

    When ``degs_best`` has more genes than ``degs_default``, the default list is
    completed with randomly drawn symbols from ``enr.dflfc_ori`` that are not in
    ``degs_default``, so that both lists have the same size. The padded list is
    passed to ``enr.calc_EA_dataset_symbol`` and the table left in
    ``enr.df_enr0`` is filtered and split into genes inside/outside the
    retained pathways.

    Relies on the notebook-level names ``enr``, ``case`` and ``force``.

    Parameters
    ----------
    degs_best : sequence of str
        DEGs obtained with the best cutoff.
    degs_default : sequence of str
        DEGs obtained with the default cutoff.
    abs_lfc_cutoff, fdr_lfc_cutoff : float
        Cutoffs forwarded to ``enr.open_case_params``.
    fdr_pathway_cutoff : float
        Upper bound (exclusive) on the pathway ``fdr`` column.
    verbose : bool
        Forwarded to the enrichment call; also prints the number of pathways.

    Returns
    -------
    (df_enr, degs_in_pathways_random, degs_not_in_pathways_random)
        ``(None, [], [])`` when ``degs_best`` is not longer than ``degs_default``
        (nothing to complement). Otherwise the filtered enrichment table and the
        padded gene list split by membership in the retained pathways.
    """
    n_diff = len(degs_best) - len(degs_default)
    if n_diff <= 0:
        # Equal sizes, or the default list is already the larger one.
        return None, [], []

    degs_default = list(degs_default)

    # Candidate pool: unique string symbols that are not default DEGs.
    # Drawing from unique symbols WITHOUT replacement avoids the duplicates and
    # the silently dropped non-string symbols that sampling table rows with
    # replacement produced, which left the padded list shorter than intended.
    candidates = {x for x in enr.dflfc_ori.symbol if isinstance(x, str)}
    pool = sorted(candidates - set(degs_default))

    n_draw = min(n_diff, len(pool))
    picked = np.random.permutation(len(pool))[:n_draw]
    random_genes = sorted(pool[k] for k in picked)

    degs_random = degs_default + random_genes

    # The returned values are not needed here; presumably the call is kept for
    # the state it sets on `enr` before the enrichment - confirm.
    enr.open_case_params(case, abs_lfc_cutoff=abs_lfc_cutoff, fdr_lfc_cutoff=fdr_lfc_cutoff)
    enr.calc_EA_dataset_symbol(degs_random, force=force, calc_many_sig=False, verbose=verbose)

    # Keep pathways with p-value < 0.05, FDR below the cutoff and at least 3 genes.
    df_enr = enr.df_enr0.copy()
    df_enr = df_enr[(df_enr.pval < 0.05) & (df_enr.fdr < fdr_pathway_cutoff) & (df_enr.num_of_genes >= 3)]

    if df_enr.empty:
        print("No enrichment analysis wwere found.")
        if verbose: print(f"There are {len(df_enr)} enriched pathways.")
        return df_enr, [], []

    df_enr = df_enr.sort_values('fdr', ascending=True)
    df_enr.index = np.arange(0, len(df_enr))

    if verbose: print(f"There are {len(df_enr)} enriched pathways.")

    # Collect every gene listed in the retained pathways (a set gives O(1) lookups).
    enriched_genes = set()
    for genes in df_enr['genes']:
        if isinstance(genes, str):
            # NOTE(review): eval() executes arbitrary code; if the enrichment table
            # can come from an untrusted file, parse it with ast.literal_eval instead.
            genes = eval(genes)
        enriched_genes.update(genes)

    degs_in_pathways_random     = [x for x in degs_random if x     in enriched_genes]
    degs_not_in_pathways_random = [x for x in degs_random if x not in enriched_genes]

    return df_enr, degs_in_pathways_random, degs_not_in_pathways_random


In [None]:
def build_matrix_calc_chi_square(degs_in_pathways_best, degs_not_in_pathways_best,
                                 degs_in_pathways_random, degs_not_in_pathways_random):
    """Assemble the 2x2 count table (best cutoff vs. random genes) and test it.

    Rows are 'enriched' (counts taken from the *_best lists) and 'random'
    (counts taken from the *_random lists); columns are 'degs_in' and
    'degs_out'. Only the lengths of the four input lists are used.

    Returns:
        (dfmat, ret_chi, dof, stat, pvalue, stri_stat): the count table followed
        by the five values returned by chisquare_2by2(dfmat), passed through
        unchanged.
    """
    counts_by_column = {
        'degs_in':  [len(degs_in_pathways_best), len(degs_in_pathways_random)],
        'degs_out': [len(degs_not_in_pathways_best), len(degs_not_in_pathways_random)],
    }
    dfmat = pd.DataFrame(counts_by_column, index=['enriched', 'random'])

    # chisquare_2by2 is defined elsewhere in the notebook; its five outputs are
    # unpacked explicitly so a change in its return arity fails loudly here.
    significant, dof, stat, pvalue, summary = chisquare_2by2(dfmat)

    return dfmat, significant, dof, stat, pvalue, summary



### 30 simulations

In [None]:
# Run the random-gene enrichment 30 times and compare each run against the
# 'best' cutoff result with a 2x2 chi-square test.
# NOTE(review): fdr_lfc_cutoff=0.5 is passed below, while the notebook header
# documents 0.05 as the default FDR_LFC - confirm that 0.5 is intended.
dic = {}
for i in range(30):
    (df_enr,
     degs_in_pathways_random,
     degs_not_in_pathways_random) = calc_enriched_pathways_random_genes(
        degs_best, degs_default,
        abs_lfc_cutoff=1, fdr_lfc_cutoff=0.5,
        fdr_pathway_cutoff=0.05)

    (dfmat, ret_chi, dof,
     stat, pvalue, stri_stat) = build_matrix_calc_chi_square(
        degs_in_pathways_best, degs_not_in_pathways_best,
        degs_in_pathways_random, degs_not_in_pathways_random)

    # One record per simulation; the p-value is stored as formatted text.
    dic2 = {
        'stat_sig': ret_chi,
        'dof': dof,
        'stat': stat,
        'pvalue': f'{pvalue:.3e}',
        'stri_stat': stri_stat,
        'dfmat': dfmat,
    }
    dic[i] = dic2

# Simulations become rows; show the ones whose stat_sig flag is False.
dff = pd.DataFrame(dic).T
dff[dff.stat_sig == False]

In [None]:
def filter_my_gene():
    """Return a filtered copy of enr.gene.df_my_gene with a fresh 0..n-1 index.

    Rows are dropped when their symbol starts with 'LOC', 'LINC', 'LNC' or
    'MIR', or when the symbol contains a '-' character.
    """
    genes = enr.gene.df_my_gene.copy()

    # Apply the prefix exclusions one after another, in the original order.
    for prefix in ('LOC', 'LINC', 'LNC', 'MIR'):
        genes = genes[~genes.symbol.str.startswith(prefix)]

    # Keep only symbols without a hyphen.
    genes = genes[['-' not in sym for sym in genes.symbol]]

    genes.index = np.arange(0, len(genes))
    return genes

# Filtered gene table and its row count.
df_mg = filter_my_gene()
n_genes = df_mg.shape[0]

print(n_genes)

In [None]:
# Build the table of cutoffs for the chosen trade-off index column.
selected_toi_col = 'toi4_median'
dfcut = enr.build_all_cutoffs_table(selected_toi_col,
                                    force=False,
                                    verbose=False)
print(dfcut.shape[0])
dfcut.head(3)

In [None]:
# Distinct cases present in the cutoff table.
dfcut['case'].unique()

In [None]:
# Distinct (sorted) FDR_LFC cutoffs available for the WNT case.
df2 = dfcut[dfcut['case'] == 'WNT']
fdr_list = np.unique(df2['fdr_lfc_cutoff'])
fdr_list

In [None]:
# Distinct (sorted) FDR_LFC cutoffs available for the C4 case.
df2 = dfcut[dfcut['case'] == 'C4']
fdr_list = np.unique(df2['fdr_lfc_cutoff'])
fdr_list

In [None]:
from graphic_lib import *

# Plot, per case and per FDR_LFC cutoff, the selected trade-off index against
#   (1) the number of DEGs/DAPs found in pathways, and
#   (2) the number of enriched pathways.
# One figure is produced for each of the two quantities. Solid traces come from
# the cutoff table (dfcut); dashed '_all' traces come from the per-case table
# returned by enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib.
width = 1100
height = 500
plot_all_dfi = True   # also draw the dashed '_all' traces

colors = ['olivedrab', 'navy', 'red', 'darkcyan', 'darkgreen', 'orange', 'brown', 'darksalmon',
          'magenta', 'darkturquoise', 'orange', 'darkred', 'indigo', 'magenta', 'maroon', 'black',
          "darkblue", "darkcyan", "darkgoldenrod", "darkgray", "darkgrey", "darkgreen", 'navy']

selected_toi_col = 'toi4_median'
print(">>>", selected_toi_col)

dfcut = enr.build_all_cutoffs_table(selected_toi_col, force=False, verbose=False)

for _plot in ['genes', 'pathways']:

    # The colour counter restarts for every figure, so a given (case, fdr)
    # series gets the same colour in both plots. Previously the counter ran on
    # across figures and could index past the end of `colors`.
    i = -1

    if _plot == 'genes':
        plot_name = f"'Best' number of {enr.s_deg_dap}s in pathways"   # not used by the layout below
        yaxis_title = f"# {enr.s_deg_dap}s in pathways"
        y_col = 'n_degs_in_pathways'
    else:
        plot_name = f"'Best' number of enriched pathways"              # not used by the layout below
        yaxis_title = "# of pathways"
        y_col = 'n_pathways'

    fig = go.Figure()

    for case in enr.case_list:
        df2 = dfcut[(dfcut.case == case)]
        fdr_list = np.unique(df2.fdr_lfc_cutoff)

        if plot_all_dfi:
            dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case)

        for fdr in fdr_list:
            i += 1

            df2 = dfcut[(dfcut.case == case) & (dfcut.fdr_lfc_cutoff == fdr)]
            if df2.empty:
                continue

            name1 = f"{case} fdr={fdr:.3f} for {_plot}"
            name2 = name1 + '_all'

            # Wrap around the palette instead of raising IndexError when there
            # are more (case, fdr) series than colours.
            color = colors[i % len(colors)]

            df2 = df2.sort_values(selected_toi_col, ascending=True)
            fig.add_trace(go.Scatter(x=df2[selected_toi_col], y=df2[y_col],
                                     marker_color=color, name=name1))

            if plot_all_dfi:
                dfi2 = dfi[(dfi.case == case) & (dfi.fdr_lfc_cutoff == fdr)]
                dfi2 = dfi2.sort_values(selected_toi_col, ascending=True)
                fig.add_trace(go.Scatter(x=dfi2[selected_toi_col], y=dfi2[y_col],
                                         line=dict(dash='dash'),
                                         marker_color=color, name=name2))

    fig.update_layout(title=f"{selected_toi_col} x {_plot}",
                      width=width,
                      height=height,
                      xaxis_title=selected_toi_col,
                      yaxis_title=yaxis_title,
                      legend_title="cases",
                      showlegend=True)

    fig.show()
    print("")

In [None]:
# Build a one-row-per-symbol copy of the LFC table. Rows are sorted by symbol
# and, within a symbol, by abs_lfc descending, so the first row of each symbol
# is the one that sorts first on abs_lfc.
dflfc = enr.dflfc_ori
dflfc = dflfc.sort_values(['symbol', 'abs_lfc'], ascending=[True, False]).copy()
dflfc.index = np.arange(0, len(dflfc))

# Vectorised replacement for the former row-by-row `.iloc[i]` scan.
# A row is kept when its symbol is a non-empty string and it is the first
# occurrence of that symbol in the sorted table. Non-string symbols (e.g. NaN)
# and empty-string symbols are dropped, exactly as the original loop did.
symbol_col = dflfc.symbol
has_symbol = np.array([isinstance(s, str) and s != '' for s in symbol_col], dtype=bool)
is_first_row = ~symbol_col.duplicated(keep='first').to_numpy()
goods = (has_symbol & is_first_row).tolist()

# Keep the original symbol next to the synonym/HUGO-normalised one.
# NOTE(review): the keep-mask above is computed on the original symbols; if two
# different original symbols are mapped to the same normalised symbol, both rows
# survive. Confirm whether a second de-duplication is wanted.
dflfc['symbol_ori'] = dflfc.symbol
dflfc.loc[:, 'symbol'] = [enr.gene.replace_synonym_and_hugo_symbol(x) for x in dflfc.symbol]

print(len(dflfc))
dflfc = dflfc[goods]
print(len(dflfc))

fname = "medulloblastoma_NO_DUP_LFC_%s_x_CTRL_%s.tsv"%(enr.case, enr.normalization)
# Writing is intentionally disabled; uncomment to persist the table.
# pdwritecsv(dflfc, fname, enr.root_result, verbose=True)

In [None]:
# Flag the rows whose symbol differs from the value saved in symbol_ori.
diff = [bool(new_symbol != old_symbol)
        for new_symbol, old_symbol in zip(dflfc.symbol, dflfc.symbol_ori)]

cols = ['symbol', 'symbol_ori', 'probe', 'description']
dflfc.loc[diff, cols]