In [None]:
# Report which Python interpreter is running this notebook.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

### Bayesian approach

Gene expression or protein abundance is measured in transcriptomics and proteomics experiments, usually by calculating the log fold change (LFC) of case samples relative to control samples. Since there are multiple comparisons, the p-value of each LFC is usually corrected using the False Discovery Rate (FDR) method. In standard statistical practice, Differentially Expressed Genes (DEGs) or Differentially Abundant Proteins (DAPs) are called with hard cutoffs: absolute LFC (aLFC) >= 1 and $FDR_{LFC}$ < 0.05. However, many experiments find no DEGs, and others find DEGs but no Enriched Pathways (EPs), for which another hard cutoff applies: $FDR_{pathway}$ < 0.05. If one applies Bayesian techniques, each hard cutoff turns into a posterior distribution with a credible interval. The cutoff may therefore be displaced, and it is described by an interval rather than by a single hard value. In this first study, we apply the Best Cutoff Algorithm to find ...

#### Applying Bayes Theorem  

$p(cutoffs | DEG) ∝ p(DEG | cutoffs) * p(cutoffs)$

then,  

$p( LFC_{abs}, FDR_{LFC} | DEG) ∝ p(DEG | LFC_{abs}, FDR_{LFC}) * p(LFC_{abs}, FDR_{LFC})$

#### fixing FDR  

$p( LFC_{abs} | DEG) ∝ p(DEG | LFC_{abs}) * p(LFC_{abs})$, for each $FDR_{LFC}$

where,  

$p(DEG | LFC_{abs}) = n_{DEGS} / n_{Genes}$

defining,  

$p(LFC_{abs}) = N(LFC_{\text{default cutoff}}, STD_{\text{default cutoff}})$

In [None]:
# Standard library
import os, sys, pickle

# Numerical / tabular stack
import numpy as np
import pandas as pd

# Console display limits for DataFrames. Full option names are used because
# pandas resolves abbreviated names by pattern matching, which can become
# ambiguous if a future release adds a similarly named option.
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', 80)

# scikit-learn text-vectorisation and matrix-factorisation classes.
# NOTE(review): neither name is used in the cells visible here — confirm they are still needed.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# seaborn: use the "notebook" plotting context with font elements scaled by 1.4.
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

# matplotlib, with figures rendered inline in the notebook output.
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Put the project's source directory on the import path.
# NOTE(review): the path is relative, so this presumably assumes the notebook
# is launched from its own directory — confirm.
sys.path.insert(1, '../src/')

# Project-local modules, star-imported. Presumably this is where Config and
# Biopax (used below) come from — verify. The order matters: a later star
# import silently shadows same-named symbols from an earlier one.
from Basic import *
from biopax_lib import *
from config_lib import *
from stat_lib import *

from IPython.display import display, HTML
# Inject CSS that sets the notebook's maximum width to 100%.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# NOTE(review): hard-coded contact e-mail; it is not referenced in the cells
# visible here — confirm where it is consumed before sharing the notebook.
email = "flalix@gmail.com"

### Up and Down DEGs simulation

In [None]:
# ------------ data locations (relative to the working directory) ------------
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0      = '../../colaboracoes/aparecida/'

# ------------ project identification ------------
project = 'Medulloblastoma microarray study'
s_project = 'medulloblastoma'

gene_protein = 'dna'
s_omics = 'microarray'

has_age = False
has_gender = False

# ------------ expression normalization ------------
# `normalization` is the text label derived from `exp_normalization`; every
# call below that needs the label uses this variable so the two cannot diverge.
want_normalized = False
exp_normalization='quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

# ------------ DEG / enrichment settings ------------
# NOTE(review): `abs_lfc_cutoff_inf` looks like a lower bound for the |LFC| cutoff — confirm.
abs_lfc_cutoff_inf = 0.80
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr=3

#------------ pathway pseudo-modulation index ------------
tolerance_pathway_index = 0.15
type_sat_ptw_index = 'linear_sat'
saturation_lfc_index = 5

# ------------ cases ------------
case_list = ['WNT', 'G4']
case = case_list[0]

cfg = Config(project, s_project, case_list, root0)

# Placeholder values for the DEG/pathway counters; `n_degs` is overwritten by
# the call on the next line.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1
# Look up the best LFC cutoffs using the derived `normalization` label rather
# than a hard-coded 'not_normalized', so flipping `want_normalized` above is
# honoured here as well.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# ------------ pathway enrichment cutoffs ------------
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Pathways (name - R-HSA identifier) inspected in this analysis. They are kept
# in a list so the alternatives stay visible without repeatedly reassigning
# the same variable.
pathway_candidates = [
    'Sensory Processing Of Sound By Inner Hair Cells Of Cochlea - R-HSA-9662360',
    'Cardiac Conduction - R-HSA-5576891',
    'RHOB GTPase Cycle - R-HSA-9013026',
    'Gap Junction Assembly - R-HSA-190861',
    'Opioid Signaling - R-HSA-111885',
    'Neuronal System - R-HSA-112316',
]
# Active selection: the last candidate.
pathway_name_id = pathway_candidates[-1]

# Build the Biopax analysis object from the settings defined earlier in the notebook.
bpx = Biopax(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

# Work on the first case of the list.
case = case_list[0]

# Register the default cutoffs (|LFC| = 1, FDR = 0.05) for this normalization
# label, then open the case.
bpx.cfg.set_default_best_lfc_cutoff(
    normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05
)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(
    case, prompt_verbose=True, verbose=False
)

print("\nEcho Parameters:")
bpx.echo_parameters()

# Keep the gene-set number chosen by the Biopax object at notebook level.
geneset_num = bpx.geneset_num

In [None]:
# Display the case, group, gender, age and gene-set attributes currently held by bpx.
(
    bpx.case,
    bpx.group,
    bpx.gender,
    bpx.age,
    bpx.geneset_num,
    bpx.geneset_lib,
)

In [None]:
# FDR grid: 0.05, 0.10, ..., 0.75 (15 values).
# linspace states the number of points explicitly; arange with a fractional
# step can gain or lose its last element through floating-point rounding.
fdr_lista = np.round(np.linspace(0.05, 0.75, 15), 2)

# Figure dimensions and verbosity for the cells that follow.
width=1100
height=600
verbose=False

# Run the Bayesian cutoff calculation once per case and keep each result keyed
# by case name.
# NOTE(review): `ndraws` suggests random sampling and no seed is set in the
# visible cells — confirm whether results are reproducible across runs.
dic_case = {}
for case in case_list:
    dic = bpx.calc_bayesian_cutoffs(
        case,
        ndraws=1000,
        xaxis_title='lfc',
        yaxis_title='p',
        fdr_lista=fdr_lista,
        perc_delta=0.01,
        width=width,
        height=height,
        plot_bgcolor='lightgray',
        verbose=verbose,
    )
    dic_case[case] = dic

In [None]:
# Keys of the result for the last case. The entry is looked up explicitly in
# dic_case instead of relying on a variable left over from a previous loop.
list(dic_case[case_list[-1]].keys())

In [None]:
# For every case, print a one-line summary of each stored result and show its figure.
for case in case_list:
    print(">>>", case)
    dic = dic_case[case]

    for dic2 in dic.values():
        # Pull the reported fields out of the per-entry dictionary in one step.
        case2, fdr, ci, lfc_max, fig = (
            dic2[field] for field in ('case', 'fdr', 'CI_0.9', 'lfc_max', 'fig')
        )
        print(f"case {case2}, fdr={fdr:.2e}, lfc_max={lfc_max:.3f}, ci={ci}")
        fig.show()
        print("")

In [None]:
# Figure dimensions and verbosity for the per-case series plots.
width=900
height=600
verbose=False

# Turn each case's result dictionary into a table and plot the cutoff series.
for case in case_list:
    print(">>>", case)
    dic = dic_case[case]
    # Transpose so that each key of `dic` becomes a row.
    dfc = pd.DataFrame(dic).transpose()

    fig = bpx.plot_bayesian_cutoff_series(
        case, dfc,
        width=width, height=height,
        plot_bgcolor='lightgray', verbose=verbose,
    )
    fig.show()
    print("")

In [None]:
# Columns shown when displaying the best-pathway-cutoff table, in display order.
cols2 = [
    'case',
    'med_max_ptw',
    'quantile',
    'toi4_median',
    'toi4_mean',
    'abs_lfc_cutoff',
    'fdr_lfc_cutoff',
    'pathway_fdr_cutoff',
    'n_pathways',
    'n_degs_in_pathways',
    'n_degs_in_pathways_mean',
    'n_degs_in_pathways_median',
    'n_degs_in_pathways_std',
    'toi1_median',
    'toi2_median',
    'toi3_median',
]

In [None]:
# Load the best-pathway-cutoff table and keep only the rows whose
# `med_max_ptw` value is 'median'.
dfbest = bpx.cfg.open_best_ptw_cutoff(verbose=False)
is_median_row = dfbest['med_max_ptw'] == 'median'
dfbest = dfbest.loc[is_median_row]

# Show the selected columns.
dfbest[cols2]