In [None]:
# Print the interpreter version this notebook runs on.
from platform import python_version

running_python = python_version()
print(running_python)

## 1) Comparing Default Cutoff x BCA: TP, FP, TN, FN

  - Given the default enriched table (tab1)
  - Given the BCA enriched table with the default pathways removed, i.e. only the extra pathways (tab2)

### TP and FP
   - tab1
     - Gemini Yes --> TP
     - Gemini False --> FP
    
### TN and FN
   - tab2
     - Gemini Yes --> FN
     - Gemini False --> TN 

## 2) Comparing G0 x G1 pathways: TP, FP, TN, FN

  - Given the G0 table (tab1)
  - Given the G1 table (tab2)

### TP and FP
   - tab1
     - Gemini Yes --> TP
     - Gemini False --> FP
    
### TN and FN
   - tab2
     - Gemini Yes --> FN
     - Gemini False --> TN

In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

sys.path.insert(1, '../src/')

from Basic import *
from biopax_lib import *
from gemini_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Read the notebook parameters from params.yml (looked up relative to the
# current working directory) into dic_yml; safe_load is used for parsing.
with open('params.yml', 'r') as params_file:
    dic_yml = yaml.safe_load(params_file)

In [None]:
# Unpack every parameter read from params.yml into a notebook-level name.

# --- locations and contact ---
root0 = dic_yml['root0']
root_data_aux = dic_yml['root_data_aux']
email = dic_yml['email']

# --- project identification ---
project = dic_yml['project']
s_project = dic_yml['s_project']

# --- omics description ---
gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

# --- sample metadata flags ---
has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

# --- pathway enrichment settings ---
abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

# --- pathway index settings ---
tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

# --- cases ---
case_list = dic_yml['case_list']
case_sel_list = dic_yml['case_sel_list']

# --- pathway cutoffs ---
pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

# --- runs and models ---
run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# exp_normalization is None when normalization is not wanted; `normalization`
# is the matching string label.
if want_normalized:
    exp_normalization = 'quantile_norm'
    normalization = exp_normalization
else:
    exp_normalization = None
    normalization = 'not_normalized'

cfg = Config(project, s_project, case_list, root0)

# The first case of the list is the working case.
case = case_list[0]

# Placeholders set to -1; n_degs is overwritten by the call just below.
n_genes_annot_ptw = n_degs = n_degs_in_ptw = n_degs_not_in_ptw = degs_in_all_ratio = -1

# NOTE(review): the literal 'not_normalized' is passed here instead of the
# `normalization` label computed above -- confirm this is intentional.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = \
    cfg.get_best_lfc_cutoff(case, 'not_normalized')

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the Biopax object from the parameters unpacked earlier, then open the
# first case.
bpx = Biopax(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

case = case_list[0]

# NOTE(review): fixed cutoffs (abs LFC 1, FDR 0.05) are set on bpx.cfg here;
# confirm they are meant to differ from the values printed by the parameter cell.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)

ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

print("\nEcho Parameters:")
bpx.echo_parameters()

# Keep the geneset number held by bpx available at notebook level.
geneset_num = bpx.geneset_num

### Ensemble: is_seldata=False

In [None]:
# Selection flags used by the Gemini cells below.
is_seldata = False

# Gender stratification follows whatever bpx reports.
with_gender = bpx.has_gender
if with_gender:
    with_gender_list = [False, True]
else:
    with_gender_list = [False]

print(f"with_gender = {with_gender} because has_gender = {bpx.has_gender}")

## Instantiating Gemini

In [None]:
# Gemini settings read from the parameter dictionary.
# NOTE(review): the API key comes from params.yml -- make sure that file is
# not committed or shared with the key in it.
API_KEY = dic_yml['API_KEY']

disease = dic_yml['disease']
context_disease = dic_yml['context_disease']
n_sentences = dic_yml['n_sentences']
chosen_model_sampling = dic_yml['chosen_model_sampling']

# Create the Gemini helper bound to the Biopax object.
gem = Gemini(
    bpx=bpx,
    is_seldata=is_seldata,
    disease=disease,
    context_disease=context_disease,
    API_KEY=API_KEY,
    n_sentences=n_sentences,
    root0=root0,
    chosen_model_list=chosen_model_list,
    i_dfp_list=i_dfp_list,
    chosen_model_sampling=chosen_model_sampling,
)

# Echo the main settings held by the new object.
print("\n")
print(gem.disease, gem.is_seldata, gem.i_dfp_list, gem.chosen_model_list)
print("Context:", context_disease)

In [None]:
# Display the notebook-level run settings as one tuple for a quick visual check.
is_seldata, case_list, run_list, chosen_model_list, i_dfp_list, chosen_model_sampling

In [None]:
# Pass the case currently opened in bpx, together with bpx.df_enr and
# bpx.df_enr0 (presumably the enrichment tables -- TODO confirm), to gem.
gem.set_case(bpx.case, bpx.df_enr, bpx.df_enr0)

In [None]:
# Display the same settings as stored on the gem object, to compare with the
# notebook-level values shown above.
gem.is_seldata, gem.bpx.case_list, gem.chosen_model_list, gem.i_dfp_list, gem.chosen_model_sampling

In [None]:
verbose = True

run = 'run01'
case = case_list[0]
print(">>>", case, '\n')

# Open the table of all models for this run.
dfpiva = gem.open_gemini_dfpiva_all_models_one_run(
    run=run,
    chosen_model_list=chosen_model_list,
    verbose=verbose,
)

# Keep only the rows of the current case with i_dfp == 0.
is_case = dfpiva.case == case
is_idfp0 = dfpiva.i_dfp == 0
dfpiv = dfpiva[is_case & is_idfp0]

# Distinct consensus values present in that slice.
dfpiv.consensus.unique()

## 1) Statistics Default x BCA: confusion table

In [None]:
verbose = True
force = False

run = 'run01'
case = case_list[0]

# Confusion table (default enrichment x BCA) for a single case.
df, conf_list = gem.confusion_table_fp_fn_enriched_bca(
    run=run,
    case=case,
    chosen_model_list=chosen_model_list,
    force=force,
    verbose=verbose,
)

print(case, conf_list, '\n')
print(len(df))
df.head(3)

In [None]:
# Per-group counts; only the first two columns of the result are shown.
group_counts = df.groupby('group').count().reset_index()
group_counts.iloc[:, :2]

In [None]:
verbose = False
force = False

run = 'run01'

# Build the confusion table (default enrichment x BCA) for every case.
# The summary is printed inside the loop so each case is reported; printing
# after the loop would only show the last case.
for case in case_list:
    df, conf_list = gem.confusion_table_fp_fn_enriched_bca(
        run=run,
        case=case,
        chosen_model_list=chosen_model_list,
        force=force,
        verbose=verbose,
    )

    print(case, conf_list, '\n')
    print(len(df))

# `case`, `df` and `conf_list` hold the last case after the loop; preview it.
df.head(3)

In [None]:
# Per-group counts for the table currently held in df; only the first two
# columns of the result are shown.
group_counts = df.groupby('group').count().reset_index()
group_counts.iloc[:, :2]

### stats

In [None]:
force = False
verbose = True
prompt = False
param_perc = 0.9

# Confusion statistics (default enrichment x BCA) over all cases.
# `run` is the value set by a previous cell.
dfa = gem.calc_confusion_stats_enriched_bca_per_run_case(
    run=run,
    case_list=case_list,
    chosen_model_list=chosen_model_list,
    param_perc=param_perc,
    prompt=prompt,
    force=force,
    verbose=verbose,
)
dfa

## 2) Statistics G0 x G1: confusion table
  - Positive control: i_dfp==0
  - fuzzy negative1: i_dfp==1
  - fuzzy negative2: i_dfp==2
  - negative control: i_dfp==3
  - calc:
    - TP, FP: positive control
    - TN1, FN1: fuzzy negative1
    - TN2, FN2: fuzzy negative2
    - TN, FN: negative control
    - calc: Sensitivity, Specificity, Accuracy, and F1-score

In [None]:
# Open the consensus summary; the call returns a pair unpacked into `text` and
# `dfcons`. `verbose` is the value left by a previous cell.
text, dfcons = gem.open_gemini_summary_consensus_statitics_idfp(chosen_model_list=chosen_model_list, verbose=verbose)

In [None]:
# print(text)

In [None]:
# Preview the first two rows of the consensus table.
dfcons.head(2)

In [None]:
verbose = True
force = False

# Columns shown for each comparison table.
cols = ['case', 'which', 'n', 'npos', 'nneg', 'TP', 'FP', 'TN', 'FN', 'sens', 'spec', 'accu', 'prec', 'f1_score']

# Three confusion tables are returned; the cells below label them
# G0 x G1, G0 x G2 and G0 x G3.
dfconf1, dfconf2, dfconf3 = \
    gem.calc_gemini_4groups_confusion_table(run_list=run_list, case_list=case_list,
                                            chosen_model_list=chosen_model_list,
                                            force=force, verbose=verbose)

if dfconf1 is None:
    print("Could not calcualte")
    # Define all three frames, empty but with the expected columns, so the
    # following cells (df1.f1_score, df2.f1_score, df3.f1_score) do not fail
    # with NameError or AttributeError.
    df1 = pd.DataFrame(columns=cols)
    df2 = pd.DataFrame(columns=cols)
    df3 = pd.DataFrame(columns=cols)
else:
    # Keep only the first run of the list.
    run = run_list[0]

    df1 = dfconf1[dfconf1.run == run][cols]
    df2 = dfconf2[dfconf2.run == run][cols]
    df3 = dfconf3[dfconf3.run == run][cols]

    print("Ok")
  

In [None]:
# Show floats with three decimals (global pandas display option).
pd.options.display.float_format = '{:,.3f}'.format

print(">>>", run, '\n')

# Mean and standard deviation of the F1-score in df1.
f1_values = df1.f1_score
mu_f1, std_f1 = f1_values.mean(), f1_values.std()
print(f"G0 x G1: mu_f1 {100*mu_f1:.1f}% ({100*std_f1:.1f}%)\n")
df1

In [None]:
print(">>>", run, '\n')

# Mean and standard deviation of the F1-score in df2.
f1_values = df2.f1_score
mu_f1, std_f1 = f1_values.mean(), f1_values.std()
print(f"G0 x G2: mu_f1 {100*mu_f1:.1f}% ({100*std_f1:.1f}%)\n")
df2

In [None]:
print(">>>", run, '\n')

# Mean and standard deviation of the F1-score in df3.
f1_values = df3.f1_score
mu_f3, std_f3 = f1_values.mean(), f1_values.std()
print(f"G0 x G3: mu_f3 {100*mu_f3:.1f}% ({100*std_f3:.1f}%)\n")
df3

In [None]:
# (column name, printed label) for each metric compared below.
mat = [
    ('sens', 'Sensitivity'),
    ('spec', 'Specificity'),
    ('accu', 'Accuracy'),
    ('prec', 'Precision'),
    ('f1_score', 'F1-score'),
]

# Each entry: (label, first table, second table).
comparisons = [
    ('G0xG1 x G0xG2', df1, df2),
    ('G0xG1 x G0xG3', df1, df3),
    ('G0xG2 x G0xG3', df2, df3),
]

for i, (compare, dfa, dfb) in enumerate(comparisons):
    for col, test in mat:
        values_a = dfa[col]
        values_b = dfb[col]

        mua, stda = values_a.mean(), values_a.std()
        mub, stdb = values_b.mean(), values_b.std()

        # calc_ttest is a project helper; it returns three values, of which
        # only the p-value is printed here.
        s_stat, stat, pval = calc_ttest(values_a, values_b)
        print(f"{compare}: {100*mua:.1f}% ({100*stda:.1f}%) x {100*mub:.1f}% ({100*stdb:.1f}%) -> {test:12} pval {pval:.2e}")
    print("")

In [None]:
# Name the metric explicitly instead of relying on the loop variable `col`
# left over from the previous cell. 'f1_score' is the value that loop ends
# on, so the result is unchanged under Restart & Run All.
col = 'f1_score'

# Compare df1 against df3 on that metric; show the first returned value.
s_stat, stat, pval = calc_ttest(df1[col], df3[col])
s_stat

In [None]:
# Name the metric explicitly instead of relying on a `col` value left over
# from an earlier cell. 'f1_score' is the value `col` holds here under
# Restart & Run All, so the result is unchanged.
col = 'f1_score'

# Compare df2 against df3 on that metric; show the first returned value.
s_stat, stat, pval = calc_ttest(df2[col], df3[col])
s_stat

In [None]:
# Columns selected from the confusion tables in the cells below.
# (The identical list used to be assigned twice; one definition is enough.)
cols = ['run', 'which', 'case', 'n', 'npos', 'nneg', 'TP', 'FP', 'TN', 'FN',
        'sens', 'spec', 'accu', 'prec', 'f1_score']


In [None]:
# List the columns available in the first confusion table.
# NOTE(review): dfconf1 may be None (an earlier cell checks for it), in which
# case this attribute access fails.
dfconf1.columns

In [None]:
# Same slices as before, now for the second run of the list.
# NOTE(review): requires run_list to hold at least two runs.
run = run_list[1]

df11 = dfconf1.loc[dfconf1.run == run, cols]
df12 = dfconf2.loc[dfconf2.run == run, cols]
df13 = dfconf3.loc[dfconf3.run == run, cols]

print(">>>", run, '\n')
df13

In [None]:
run = 'run01'
verbose = True
# NOTE(review): force stays True for later cells that pass force=force,
# until another cell resets it.
force = True

# Statistics over the four-group confusion tables. This rebinds df, df1, df2
# and df3 to the four values returned by the call.
df, df1, df2, df3 = gem.calc_gemini_4groups_confusion_stats(
    run=run,
    run_list=run_list,
    case_list=case_list,
    chosen_model_list=chosen_model_list,
    alpha=0.05,
    force=force,
    verbose=verbose,
)

print(len(df))
df.head(2)

In [None]:
col = 'sens'
compare = 'G1xG0'  # defined but not applied: the compare filter is commented out

# Rows of the stats table for the current run and the chosen test.
is_run = df.run == run
is_test = df.test == col
df[is_run & is_test]  #  & (df['compare'] == compare)

In [None]:
show_bar_errors = True

# Layout and annotation options for the bar plot, collected in one place.
plot_options = dict(
    width=1100,
    height=500,
    fontsize=14,
    fontcolor='black',
    margin=dict(l=20, r=20, b=100, t=120, pad=4),
    plot_bgcolor="whitesmoke",
    xaxis_title="parameters",
    yaxis_title='percentage (%)',
    minus_test=-.2,
    minus_group=-0.1,
    annot_fontfamily="Arial, monospace",
    annot_fontsize=12,
    annot_fontcolor='black',
    show_bar_errors=show_bar_errors,
)

# `force` and `verbose` are the values left by a previous cell.
fig = gem.barplot_comparing_confusion_groups(
    run=run,
    run_list=run_list,
    case_list=case_list,
    chosen_model_list=chosen_model_list,
    savePlot=True,
    force=force,
    verbose=verbose,
    **plot_options,
)

fig.show()

In [None]:
print(len(df))

# Columns to display for the selected test.
cols = [
    'test_desc', 'compare', 'n', 'mu', 'std', 'error', 'SEM',
    'cinf', 'csup', 'pval', 'pval_bonf', 'asteristics', 'stat', 's_stat',
]

# Rows describing the Sensitivity test.
df.loc[df.test_desc == 'Sensitivity', cols]

In [None]:
# df.columns

### Comparing runs: stats

In [None]:
verbose = True
force = False

# Statistics comparing the runs of run_list on the four-group confusion tables.
dfstat = gem.calc_stats_gemini_4groups_confusion_compare_runs(
    run_list=run_list,
    case_list=case_list,
    chosen_model_list=chosen_model_list,
    force=force,
    verbose=verbose,
)

# Distinct test labels present in the result.
dfstat.test_ext.unique()

In [None]:
# Columns displayed from dfstat in the cells below.
cols = [
    'which', 'test_ext',
    'mu_param0', 'std_param0',
    'mu_param1', 'std_param1',
    'stat_test', 'pvalue', 'stat', 'text_ext',
]

# Test labels, in the order they are shown below.
tests = [
    'Sensitivity',
    'Specificity',
    'Accuracy',
    'Precision',
    'F1-score',
]

In [None]:
# Run-comparison rows for the first entry of `tests`.
dfstat.loc[dfstat.test_ext == tests[0], cols]

In [None]:
# Run-comparison rows for the second entry of `tests`.
dfstat.loc[dfstat.test_ext == tests[1], cols]

In [None]:
# Run-comparison rows for the third entry of `tests`.
dfstat.loc[dfstat.test_ext == tests[2], cols]

In [None]:
# Run-comparison rows for the fourth entry of `tests`.
dfstat.loc[dfstat.test_ext == tests[3], cols]

In [None]:
# Run-comparison rows for the fifth entry of `tests`.
dfstat.loc[dfstat.test_ext == tests[4], cols]

In [None]:
run = 'run01'
case = case_list[0]

# Confusion table (default enrichment x BCA) for the first case again.
# `force` and `verbose` are the values left by a previous cell.
df, conf_list = gem.confusion_table_fp_fn_enriched_bca(
    run=run,
    case=case,
    chosen_model_list=chosen_model_list,
    force=force,
    verbose=verbose,
)
df

In [None]:
# Unpack the four entries of conf_list into tp, fp, tn, fn and display them.
tp, fp, tn, fn = conf_list
tp, fp, tn, fn

In [None]:
# Re-open the table of all models for the current run.
dfpiva = gem.open_gemini_dfpiva_all_models_one_run(
    run=run,
    chosen_model_list=chosen_model_list,
    verbose=verbose,
)

# Distinct i_dfp values present in that table.
dfpiva.i_dfp.unique()