In [None]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version

print(python_version())

### Enriched Pathway Statistics

In [None]:
import json, requests
import os, sys
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.subplots import make_subplots

from scipy.stats import shapiro

sys.path.insert(1, '../src/')

from Basic import *
from enricher_lib import *
from nlp_cluster_lib import *
from config_lib import *

from IPython.display import display, HTML

# pandas: floating point output precision of 3 in displayed frames.
pd.set_option("display.precision", 3)

# Override the --jp-notebook-max-width CSS variable so it is set to 100%.
notebook_width_css = "<style>:root { --jp-notebook-max-width: 100% !important; }</style>"
display(HTML(notebook_width_css))

In [None]:
# --- Data roots, given as relative paths ---
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0       = '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/'

# --- Project identification ---
project = 'Taubate COVID-19'
s_project = 'taubate_covid19'

gene_protein = 'protein'
s_omics = 'proteomics'

has_age = True
has_gender = True

# --- Expression normalization ---
# `exp_normalization` is None when normalization is switched off;
# `normalization` is the matching string label passed to the config object.
want_normalized = False
exp_normalization='quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

# --- Enrichment settings ---
abs_lfc_cutoff_inf = 0.40
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr=3
tolerance_gene_reg_index = 0.15

# --- Cases analysed in this notebook ---
case_list = ['g2a_male', 'g2a_female', 
             'g2b_male', 'g2b_female', 
             'g3_male_adult',   'g3_male_elder',
             'g3_female_adult', 'g3_female_elder']

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders (-1); n_degs is overwritten by the cutoff lookup right below.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1

# Use the `normalization` label derived above instead of a hard-coded
# 'not_normalized', so the lookup follows the `want_normalized` switch.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# --- Pathway-level cutoffs ---
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Pathways of interest, written as "<name> - <R-HSA id>".
# Only the LAST entry is the active one; the others are kept for quick switching.
pathway_candidates = [
    'Hemostasis - R-HSA-109582',
    'Regulation Of IGF Transport And Uptake By IGFBPs - R-HSA-381426',
    'Platelet degranulate - R-HSA-114608',
    'Platelet Activation, Signaling And Aggregation - R-HSA-76002',
    'Integrin Cell Surface Interactions - R-HSA-216083',
    'Neutrophil Degranulation - R-HSA-6798695',
    'Regulation of Complement cascade - R-HSA-977606',
    'Response To Elevated Platelet Cytosolic Ca2+ - R-HSA-76005',
]
pathway_name_id = pathway_candidates[-1]

# Build the enrichment driver from the configuration defined earlier.
enricher_options = dict(
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_gene_reg_index=tolerance_gene_reg_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
)
enr = enricheR(gene_protein, s_omics, project, s_project, root0,
               case_list, has_age, has_gender, **enricher_options)

case = case_list[0]

# Register the default LFC/FDR cutoffs for this normalization label, then open the first case.
enr.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=False)

print("\nEcho Parameters:")
enr.echo_parameters()

geneset_num = enr.geneset_num

In [None]:
# Row count of the gene table held by the enricher, followed by a 2-row preview.
gene_table = enr.gene.df_my_gene
print(len(gene_table))
gene_table.head(2)

In [None]:
# Files under enr.root_result whose name contains '_DAP_', skipping any name
# that contains '~lock'.
lista = sorted(
    fname
    for fname in os.listdir(enr.root_result)
    if '_DAP_' in fname and '~lock' not in fname
)
print(len(lista))
lista[:5]

In [None]:
# Files under enr.root_enrichment whose name contains 'g2a', skipping any name
# that contains '~lock'.
lista = sorted(
    fname
    for fname in os.listdir(enr.root_enrichment)
    if 'g2a' in fname and '~lock' not in fname
)
print(len(lista))
lista[:5]

In [None]:
# Fetch the enrichment table from the enricher. It may be None (other cells in
# this notebook test `enr.df_enr is None`), so check for that explicitly rather
# than relying on a bare `except:` that would also hide unrelated errors and
# keyboard interrupts. A missing attribute is treated like None.
df_enr = getattr(enr, 'df_enr', None)

if df_enr is None:
    print("df_enr is None")
    # Fall back to an empty frame so the preview below still works.
    df_enr = pd.DataFrame()
else:
    print(len(df_enr))

df_enr.head(3)

In [None]:
# Load the stored pathway cutoffs and echo the enricher defaults.
enr.cfg.open_best_ptw_cutoff()

enr.echo_default()
print("")

# Open every case in turn and echo its enriched pathways; a case that fails to
# open is reported and skipped.
for case in case_list:
    ret, degs, degs_ensembl, dflfc = enr.open_case(case)

    if ret:
        print(f"Case {case}")
        print('--------')

        enr.echo_enriched_pathways()
        print("")
    else:
        print(f"\nError?? case {case}")

In [None]:
# Display enr.df_enr as the cell output (presumably the enrichment table of the
# most recently opened case -- TODO confirm).
enr.df_enr

### Reactome

In [None]:
# Load the Reactome abstract table through the enricher's reactome helper,
# report its row count and preview the first 3 rows.
dfreact = enr.reactome.open_reactome_abstract()
n_reactome_rows = len(dfreact)
print(n_reactome_rows)
dfreact.head(3)

### Define case

In [None]:
# Select the case by its position in case_list and open it.
i = 1
verbose = False

case = case_list[i]
ret, degs, degs_ensembl, dfdegs = enr.open_case(case, verbose=verbose)

print(f"LFC cutoff: lfc={enr.abs_lfc_cutoff:.3f} fdr={enr.fdr_lfc_cutoff}")

# Report the number of DEGs and the size of the enrichment table (or a note
# when the table is missing).
enrichment_summary = 'df_enr is None' if enr.df_enr is None else len(enr.df_enr)
print(len(degs), enrichment_summary)

In [None]:
# Cutoff parameter table for the current case (force=False, verbose=False).
# Earlier variant of this call, kept for reference:
# dfi = enr.calc_best_cutoff_parameters_by_case_geneset(case, force=False, verbose=False)
dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(
    case,
    force=False,
    verbose=False,
)
print(len(dfi))
dfi.tail(3)

### Plot test

In [None]:
# Ceiling of len(case_list) / 4 (presumably the row count of a 4-column grid).
# Done with integer arithmetic: numpy is never imported explicitly in this
# notebook, so `np.ceil` only worked if `np` leaked in through a star-import.
_cases_per_row = 4
_n_rows = (len(case_list) + _cases_per_row - 1) // _cases_per_row
_n_rows

In [None]:
# List the column names of dfi.
dfi.columns

In [None]:
width, height = 1100, 700

# Columns to plot: the four toi*_median columns first, then the pathway/DEG counts.
cols = [f'toi{k}_median' for k in range(1, 5)] + [
    'n_pathways',
    'n_degs_in_pathways',
    'n_degs_in_pathways_mean',
    'n_degs_in_pathways_median',
]

# One histogram figure per column; note the figure size is fixed at 1100 x 270 here.
for col in cols:
    fig = enr.plot_cutoff_simulation_histograms(col, width=1100, height=270)
    fig.show()

In [None]:
# Columns shown in the summary tables: cutoffs, pathway/DEG counts, then the
# mean and median of each of the four toi columns.
cols = [
    'cutoff',
    'pathway_fdr_cutoff',
    'n_pathways',
    'all_genes_annotatted_in_pathway',
    'n_degs_in_pathways',
    'n_degs_in_pathways_mean',
    'n_degs_in_pathways_median',
]
cols += [f'toi{k}_{stat}' for k in (1, 2, 3, 4) for stat in ('mean', 'median')]

### Barplots

In [None]:
dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
print(dfi.case.unique())

# Keep rows with fdr_lfc_cutoff below 0.7 and rank them by number of pathways,
# then by number of DEGs in pathways (both descending).
dfi2 = (
    dfi[dfi.fdr_lfc_cutoff < 0.7]
    .sort_values(['n_pathways', 'n_degs_in_pathways'], ascending=[False, False])
)
dfi2[cols].head(3)

In [None]:
# Distinct values of the fdr_lfc_cutoff column in dfi.
dfi.fdr_lfc_cutoff.unique()

### Cutoff curves for every case

In [None]:
# One figure per case: n_pathways against the abs_LFC cutoff, one line per
# FDR_LFC cutoff, restricted to rows where pathway_fdr_cutoff == 0.05.
width = 1100
height = 700
yaxis_title = "num of pathways"
xaxis_title = "abs_LFC cutoff"

for case in case_list:
    print(">>>", case)

    title = f'num of Pathways cutoff curve per LFC cutoff for {case}'

    dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)

    # First 8 distinct FDR_LFC cutoffs, in order of appearance BEFORE the sort below.
    fdrs = dfi['fdr_lfc_cutoff'].unique()[:8]

    dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

    fig = go.Figure()

    for fdr in fdrs:
        keep = (dfi['fdr_lfc_cutoff'] == fdr) & (dfi['pathway_fdr_cutoff'] == 0.05)
        dfi2 = dfi[keep]

        name = f'{fdr:.2e}'
        fig.add_trace(go.Scatter(x=dfi2['abs_lfc_cutoff'], y=dfi2['n_pathways'], name=name))

    text_font = dict(family="Arial", size=14, color="Black")
    fig.update_layout(
        autosize=True,
        title=title,
        width=width,
        height=height,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        legend_title='FDR_LFC cutoff',
        showlegend=True,
        font=text_font,
    )

    fig.show()

In [None]:
# One figure per case: n_pathways against the FDR_LFC cutoff, one line per
# abs_LFC cutoff, restricted to rows where pathway_fdr_cutoff == 0.05.
width, height = 1100, 700
yaxis_title = "num of pathways"
xaxis_title = "FDR_LFC cutoff"

for case in case_list:
    print(">>>", case)

    title = f'num of Pathways cutoff curve per FDR_LFC for {case}'

    dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)

    # abs_LFC cutoffs drawn as separate lines.
    lfcs = [1, 0.9, 0.8, 0.7, 0.6, 0.5]

    dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

    fig = go.Figure()

    for lfc in lfcs:
        keep = (dfi['abs_lfc_cutoff'] == lfc) & (dfi['pathway_fdr_cutoff'] == 0.05)
        dfi2 = dfi[keep]

        name = f'{lfc:.2e}'
        fig.add_trace(go.Scatter(x=dfi2['fdr_lfc_cutoff'], y=dfi2['n_pathways'], name=name))

    text_font = dict(family="Arial", size=14, color="Black")
    fig.update_layout(
        autosize=True,
        title=title,
        width=width,
        height=height,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        showlegend=True,
        legend_title='LFC cutoff',
        font=text_font,
    )

    fig.show()

In [None]:
# One scatter figure per case: n_pathways against n_degs_in_pathways, with one
# marker colour per FDR_LFC cutoff.
width, height = 1100, 700
xaxis_title = "num of DEGs in pathws"
yaxis_title = "n pathways"

# colors[i] is paired with fdrs[i]; the colour list is longer than needed.
colors = ['red', 'orange', 'brown', 'green',  'blue', 'darkgreen', 'darkcyan']
fdrs = [0.05, 0.10, 0.20, 0.30, 0.50]

for case in case_list:
    print(">>>", case)

    title = f'Split diagram: nDegs x nPtws for {case}'

    dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
    dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

    fig = go.Figure()

    for i, fdr in enumerate(fdrs):
        color = colors[i]
        name = f"{fdr:.2e}"

        dfi2 = dfi[dfi['fdr_lfc_cutoff'] == fdr]

        points = go.Scatter(
            x=dfi2['n_degs_in_pathways'],
            y=dfi2['n_pathways'],
            mode='markers',
            marker={'color': color},
            name=name,
        )
        fig.add_trace(points)

    text_font = dict(family="Arial", size=14, color="Black")
    fig.update_layout(
        autosize=True,
        title=title,
        width=width,
        height=height,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        showlegend=True,
        legend_title='FDR_LFC cutoff',
        font=text_font,
    )

    fig.show()

### Cutoff curves for a single case

In [None]:
# Select the case used by the following cells (second entry of case_list).
case = case_list[1]

In [None]:
# Single-case figure: n_pathways against the abs_LFC cutoff, one line per
# FDR_LFC cutoff, restricted to rows where pathway_fdr_cutoff == 0.05.
width, height = 1100, 700
yaxis_title = "num of pathways"
xaxis_title = "abs_LFC cutoff"

title = f'num of Pathways cutoff curve per LFC cutoff for {case}'

dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)

# First 8 distinct FDR_LFC cutoffs, in order of appearance BEFORE the sort below.
fdrs = dfi['fdr_lfc_cutoff'].unique()[:8]

dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

fig = go.Figure()

for fdr in fdrs:
    keep = (dfi['fdr_lfc_cutoff'] == fdr) & (dfi['pathway_fdr_cutoff'] == 0.05)
    dfi2 = dfi[keep]

    name = f'{fdr:.2e}'
    fig.add_trace(go.Scatter(x=dfi2['abs_lfc_cutoff'], y=dfi2['n_pathways'], name=name))

text_font = dict(family="Arial", size=14, color="Black")
fig.update_layout(
    autosize=True,
    title=title,
    width=width,
    height=height,
    xaxis_title=xaxis_title,
    yaxis_title=yaxis_title,
    legend_title='FDR_LFC cutoff',
    showlegend=True,
    font=text_font,
)

fig.show()

In [None]:
# Single-case figure: n_pathways against the FDR_LFC cutoff, one line per
# abs_LFC cutoff, restricted to rows where pathway_fdr_cutoff == 0.05.
width, height = 1100, 700
yaxis_title = "num of pathways"
xaxis_title = "FDR_LFC cutoff"

title = f'num of Pathways cutoff curve per FDR_LFC for {case}'

dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)

# abs_LFC cutoffs drawn as separate lines.
lfcs = [1, 0.9, 0.8, 0.7, 0.6, 0.5]

dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

fig = go.Figure()

for lfc in lfcs:
    keep = (dfi['abs_lfc_cutoff'] == lfc) & (dfi['pathway_fdr_cutoff'] == 0.05)
    dfi2 = dfi[keep]

    name = f'{lfc:.2e}'
    fig.add_trace(go.Scatter(x=dfi2['fdr_lfc_cutoff'], y=dfi2['n_pathways'], name=name))

text_font = dict(family="Arial", size=14, color="Black")
fig.update_layout(
    autosize=True,
    title=title,
    width=width,
    height=height,
    xaxis_title=xaxis_title,
    yaxis_title=yaxis_title,
    showlegend=True,
    legend_title='LFC cutoff',
    font=text_font,
)

fig.show()

In [None]:
# Single-case figure: n_degs_in_pathways against the FDR_LFC cutoff, one line
# per abs_LFC cutoff, restricted to rows where pathway_fdr_cutoff == 0.05.
width, height = 1100, 700
yaxis_title = "num of DEGs in pathws"
xaxis_title = "FDR_LFC cutoff"

# This cell re-selects its case explicitly.
case = case_list[1]
title = f'num of DEGs cutoff curve per FDR_LFC for {case}'

dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)

# abs_LFC cutoffs drawn as separate lines.
lfcs = [1, 0.9, 0.8, 0.7, 0.6, 0.5]

dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

fig = go.Figure()

for lfc in lfcs:
    keep = (dfi['abs_lfc_cutoff'] == lfc) & (dfi['pathway_fdr_cutoff'] == 0.05)
    dfi2 = dfi[keep]

    name = f'{lfc:.2e}'
    fig.add_trace(go.Scatter(x=dfi2['fdr_lfc_cutoff'], y=dfi2['n_degs_in_pathways'], name=name))

text_font = dict(family="Arial", size=14, color="Black")
fig.update_layout(
    autosize=True,
    title=title,
    width=width,
    height=height,
    xaxis_title=xaxis_title,
    yaxis_title=yaxis_title,
    showlegend=True,
    legend_title='LFC cutoff',
    font=text_font,
)

fig.show()

In [None]:
# Single-case scatter: n_pathways against n_degs_in_pathways, with one marker
# colour per FDR_LFC cutoff.
width, height = 1100, 700
xaxis_title = "num of DEGs in pathws"
yaxis_title = "n pathways"

title = f'Split diagram: nDegs x nPtws for {case}'

dfi = enr.calc_enrichment_cutoff_params_and_ndxs_per_case_and_geneset_lib(case, force=False, verbose=False)
dfi = dfi.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

# colors[i] is paired with fdrs[i].
colors = ['red', 'orange', 'brown', 'green',  'blue']
fdrs = [0.05, 0.10, 0.20, 0.30, 0.50]

fig = go.Figure()

for i, fdr in enumerate(fdrs):
    color = colors[i]
    name = f"{fdr:.2e}"

    dfi2 = dfi[dfi['fdr_lfc_cutoff'] == fdr]

    points = go.Scatter(
        x=dfi2['n_degs_in_pathways'],
        y=dfi2['n_pathways'],
        mode='markers',
        marker={'color': color},
        name=name,
    )
    fig.add_trace(points)

text_font = dict(family="Arial", size=14, color="Black")
fig.update_layout(
    autosize=True,
    title=title,
    width=width,
    height=height,
    xaxis_title=xaxis_title,
    yaxis_title=yaxis_title,
    showlegend=True,
    legend_title='FDR_LFC cutoff',
    font=text_font,
)

fig.show()

### Searching for the best cutoffs - looking inside quantiles

In [None]:
# Frequency figures for the 'n_pathways' column; the call returns a list and
# the first two figures are shown, separated by a blank line.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    'n_pathways',
    width=1100,
    height=700,
)

fig_list[0].show()
print("")
fig_list[1].show()

In [None]:
# Frequency figures for the 'toi4_median' column; the call returns a list and
# the first two figures are shown, separated by a blank line.
fig_list = enr.plot_genes_and_pathways_frequecies_per_cases(
    'toi4_median',
    width=1100,
    height=700,
)

fig_list[0].show()
print("")
fig_list[1].show()

### Having defined the best cutoffs

In [None]:
# Up/down bar plot per case with every option switched off.
# `before_best_cutoff` set here is also read by the two cells that follow.
per_biotype, ensembl, before_best_cutoff = False, False, False

fig, dfa = enr.barplot_up_down_genes_per_case(
    per_biotype=per_biotype,
    ensembl=ensembl,
    before_best_cutoff=before_best_cutoff,
    width=1100,
    height=700,
    verbose=False,
)
fig.show()

In [None]:
# Same bar plot with per_biotype switched on; `before_best_cutoff` keeps the
# value assigned in an earlier cell.
per_biotype, ensembl = True, False

fig, dfa = enr.barplot_up_down_genes_per_case(
    per_biotype=per_biotype,
    ensembl=ensembl,
    before_best_cutoff=before_best_cutoff,
    width=1100,
    height=700,
    verbose=False,
)
fig.show()

In [None]:
# Same bar plot with both per_biotype and ensembl switched on;
# `before_best_cutoff` keeps the value assigned in an earlier cell.
per_biotype, ensembl = True, True

fig, dfa = enr.barplot_up_down_genes_per_case(
    per_biotype=per_biotype,
    ensembl=ensembl,
    before_best_cutoff=before_best_cutoff,
    width=1100,
    height=700,
    verbose=False,
)
fig.show()