In [None]:
# Print the Python version of the running kernel.
from platform import python_version
print(python_version())

### Calculating DEGs statistics

### For each LFC/FDR cutoff set we get a different set of DEGs
  - LFC: LFC cutoff and FDR_LFC cutoff
  - Pathway: fdr and pval pathway cutoff and min num of genes

### Up and Down
  - Up and Down DEGs/DEPs
  - Up and Down in pathways

### there are 2 statistical tables
  - pval/fdr cutoff x degs
  - pval/fdr/geneset/quantile degs_in_pathway, num_pathways

In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

sys.path.insert(1, '../src/')

from Basic import *
from biopax_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# !pip3 install pyyaml
# Load the run parameters from params.yml into dic_yml.
# The encoding is given explicitly so that reading the file does not depend on
# the platform's default locale encoding.
with open('params.yml', 'r', encoding='utf-8') as file:
    dic_yml = yaml.safe_load(file)

In [None]:
# Unpack the settings loaded from params.yml into notebook-level variables.

# Root directory and contact e-mail
root0 = dic_yml["root0"]
email = dic_yml["email"]

# Project identifiers
project = dic_yml["project"]
s_project = dic_yml["s_project"]

# Omics description
gene_protein = dic_yml["gene_protein"]
s_omics = dic_yml["s_omics"]

# Covariate flags
has_age = dic_yml["has_age"]
has_gender = dic_yml["has_gender"]

want_normalized = dic_yml["want_normalized"]

# LFC / pathway-enrichment settings
abs_lfc_cutoff_inf = dic_yml["abs_lfc_cutoff_inf"]
s_pathw_enrichm_method = dic_yml["s_pathw_enrichm_method"]
num_min_degs_for_ptw_enr = dic_yml["num_min_degs_for_ptw_enr"]

# Index / sampling settings
tolerance_pathway_index = dic_yml["tolerance_pathway_index"]
type_sat_ptw_index = dic_yml["type_sat_ptw_index"]
saturation_lfc_index = dic_yml["saturation_lfc_index"]
chosen_model_sampling = dic_yml["chosen_model_sampling"]

case_list = dic_yml["case_list"]

# Pathway cutoffs
pval_pathway_cutoff = dic_yml["pval_pathway_cutoff"]
fdr_pathway_cutoff = dic_yml["fdr_pathway_cutoff"]
num_of_genes_cutoff = dic_yml["num_of_genes_cutoff"]

run_list = dic_yml["run_list"]
chosen_model_list = dic_yml["chosen_model_list"]
i_dfp_list = dic_yml["i_dfp_list"]

# exp_normalization is what gets passed to Biopax (None means no normalization);
# normalization is the matching text label.
if want_normalized:
    exp_normalization = 'quantile_norm'
else:
    exp_normalization = None

if exp_normalization is None:
    normalization = 'not_normalized'
else:
    normalization = exp_normalization

# Config object for this project and case list.
cfg = Config(project, s_project, case_list, root0)

# Work with the first case of the list.
case = case_list[0]

# Pathway statistics initialised to -1 as placeholders.
# n_degs is not initialised here because the next statement assigns it.
n_genes_annot_ptw, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1, -1, -1, -1

# Look up the best LFC/FDR cutoffs with the notebook's normalization label,
# the same label later passed to set_default_best_lfc_cutoff, instead of a
# hard-coded 'not_normalized'. The result is identical when want_normalized is False.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

### Deleting or Renaming config files --> the default cutoffs are defined

In [None]:
# Instantiate Biopax with the settings read from params.yml.
bpx = Biopax(
    gene_protein,
    s_omics,
    project,
    s_project,
    root0,
    case_list,
    has_age,
    has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

# Start from the first case of the list.
case = case_list[0]

# Set the default cutoffs (abs LFC = 1, FDR = 0.05) for this normalization,
# then open the case.
bpx.cfg.set_default_best_lfc_cutoff(
    normalization,
    abs_lfc_cutoff=1,
    fdr_lfc_cutoff=0.05,
)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

print("\nEcho Parameters:")
bpx.echo_parameters()

# Keep the geneset number chosen by bpx for later filtering.
geneset_num = bpx.geneset_num

In [None]:
# Display the LFC table file-name attributes currently set on bpx.
bpx.fname_lfc_table0, bpx.fname_final_lfc_table0

In [None]:
# Display the case-related attributes currently set on bpx.
bpx.case, bpx.group, bpx.gender, bpx.age, bpx.s_omics

In [None]:
# Display the geneset number and geneset library currently set on bpx.
bpx.geneset_num, bpx.geneset_lib

In [None]:
# Preview the first two rows of the gene table held by bpx.gene.
bpx.gene.df_my_gene.head(2)

In [None]:
# NOTE(review): index 6 is hard-coded, so case_list must hold at least 7 entries.
case = case_list[6]
print(">>>", case)
# open_case returns a status flag first; the remaining values are presumably the
# DEG/DAP list, its Ensembl counterpart and a table (inferred from the names — confirm in biopax_lib).
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

# Report the file name, title and cutoffs bpx is using for this case.
fname_nodup, fname_ori, title = bpx.set_lfc_names()
print(f"fname '{fname_nodup}' and title '{title}'")
print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")

print(f"{bpx.s_deg_dap}s = {len(degs)}\n")

In [None]:
# Call split_case on the current case, then display the case/gender/age attributes of bpx.
print(case)
bpx.split_case(case)
bpx.case, bpx.gender, bpx.age

In [None]:
# Display the LFC and enrichment table file-name attributes of bpx.
bpx.fname_lfc_table0, bpx.fname_final_lfc_table0, bpx.fname_enrich_table0

In [None]:
# Get the two enrichment file names and show, for each, whether the file exists under bpx.root_enrich.
fname, fname_cutoff = bpx.set_enrichment_name()
fname, os.path.exists(os.path.join(bpx.root_enrich, fname)), fname_cutoff, os.path.exists(os.path.join(bpx.root_enrich, fname_cutoff))

In [None]:
# Optional maintenance step: rename files containing '_DEP' to '_DAP' in the
# figure, summary, enrichment and result folders. Disabled by default.
want_rename = False

if not want_rename:
    print("No rename needed.")
else:
    pattern_src, pattern_dst = '_DEP', '_DAP'

    for root in (bpx.root_figure, bpx.root_ressum, bpx.root_enrich, bpx.root_result):
        for _type in ('.tsv', '.txt', '.xlsx'):
            rename_files(root, pattern_src, pattern_dst, _type=_type, verbose=False)

    print("ok")

In [None]:
# Fetch the original LFC table from bpx. If it cannot be read, fall back to an
# empty DataFrame so the following cells still run.
try:
    dflfc_ori = bpx.dflfc_ori
    print(len(dflfc_ori))
except Exception as err:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and the reason for the fallback is reported instead of hidden.
    print(f"Could not read bpx.dflfc_ori ({type(err).__name__}: {err}); using an empty table.")
    dflfc_ori = pd.DataFrame()

dflfc_ori.head(3)

### Proteomics COVID-19 with 272 proteins

In [None]:
# Collect the unique symbols of the original LFC table. If the table or its
# `symbol` column is unavailable, fall back to an empty list.
try:
    symbols = np.unique(dflfc_ori.symbol)
except Exception as err:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and the reason for the fallback is reported instead of hidden.
    print(f"Could not read dflfc_ori.symbol ({type(err).__name__}: {err}); using an empty list.")
    symbols = []

len(symbols)

In [None]:
# Fetch the LFC table from bpx. If it cannot be read, fall back to an empty
# DataFrame so the following cells still run.
try:
    dflfc = bpx.dflfc
    print(len(dflfc))
except Exception as err:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and the reason for the fallback is reported instead of hidden.
    print(f"Could not read bpx.dflfc ({type(err).__name__}: {err}); using an empty table.")
    dflfc = pd.DataFrame()

dflfc.head(3)

In [None]:
# Load the table returned by cfg.open_best_ptw_cutoff() and display it.
dfbest = bpx.cfg.open_best_ptw_cutoff()
dfbest

In [None]:
want_see_best_cutoff = False

# Show the best-cutoffs table only on request; otherwise display an empty frame.
dfbest = bpx.cfg.dfbest_cutoffs if want_see_best_cutoff else pd.DataFrame()
dfbest

In [None]:
# When requested, keep only the best-cutoff rows matching the current case,
# normalization label and geneset number; otherwise show an empty frame.
if not want_see_best_cutoff:
    dfa = pd.DataFrame()
else:
    dfbest = bpx.cfg.dfbest_cutoffs
    dfa = dfbest[
        (dfbest.case == case)
        & (dfbest.normalization == normalization)
        & (dfbest.geneset_num == geneset_num)
    ]

dfa

### Deleting or Renaming config files --> the default cutoffs are defined

In [None]:
# Keep the rows of the original LFC table whose fdr is below bpx.fdr_lfc_cutoff.
# If the table cannot be read, fall back to an empty DataFrame.
try:
    dflfc = bpx.dflfc_ori[(bpx.dflfc_ori.fdr < bpx.fdr_lfc_cutoff)]
    print(len(dflfc))
except Exception as err:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and the reason for the fallback is reported instead of hidden.
    print(f"Could not filter bpx.dflfc_ori by fdr ({type(err).__name__}: {err}); using an empty table.")
    dflfc = pd.DataFrame()

dflfc.head(3)

In [None]:
# Open every case in turn and report its file name, title, cutoffs and number of DEGs/DAPs.
for case in case_list:
    print(">>>", case)
    ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

    # Only report on cases that opened successfully.
    if ret:
        fname, fname_ori, title = bpx.set_lfc_names()
        print(f"fname '{fname}' and title '{title}'")
        print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")

        print(f"{bpx.s_deg_dap}s = {len(degs)}\n")
    

### Minimum LFC cutoff

In [None]:
# log2 of a 1.4-fold change (about 0.485), shown as the reference for the minimum LFC cutoff.
np.log2(1.4)

### DEGs simulation: number of DEGs/DAPs per case
### Saving simulation file dfsim in config:
  - all_lfc_cutoffs_taubate_covid19.tsv

#### Sampling

### Cutoff sets to generate the statistical data
  - inf LFC cutoff: 0.40 (for reference, log2(1.4) ≈ 0.485 corresponds to ~40% modulation)
  - sup FDR cutoff: 0.75 --> no FDR cutoff above 0.75 is simulated

In [None]:
# Grid of abs-LFC cutoffs from 1.0 down to 0.0 in steps of 0.05 (21 values).
# np.linspace fixes the number of points and hits both end points exactly, so
# the grid no longer depends on floating-point accumulation in a float-step
# np.arange, and the last element needs no manual patching.
lfc_list = np.round(np.linspace(1.0, 0.0, 21), 3)
lfc_list

In [None]:
# Grid of FDR cutoffs from 0.05 up to 0.75 in steps of 0.05 (15 values).
# np.linspace makes the upper end point explicit, whereas a float-step
# np.arange with an exclusive stop can gain or lose the last element.
# Rounding removes noise such as 0.15000000000000002.
fdr_list = np.round(np.linspace(0.05, 0.75, 15), 3)
fdr_list

In [None]:
# Every (lfc, fdr) combination of the two grids, one row per pair, with the LFC
# value varying slowest; values are rounded to 3 decimals.
cutoff_list = np.round(
    [(lfc_value, fdr_value) for lfc_value in lfc_list for fdr_value in fdr_list],
    3,
)
# Peek at the first and last five pairs.
cutoff_list[:5], cutoff_list[-5:]

In [None]:
# Display the lfc_list attribute held by bpx.
bpx.lfc_list  

### calc_degs_cutoff_simulation()

  - while looping in case_list -> save_file -> save txt files

In [None]:
# Run the DEG/DAP cutoff simulation over every (lfc, fdr) pair in cutoff_list.
# Both flags are off here: force=False, save_file=False.
force = False
save_file = False

dfsim = bpx.calc_degs_cutoff_simulation(
    cutoff_list=cutoff_list,
    force=force,
    save_file=save_file,
    n_echo=-1,
    verbose=False,
)
print(len(dfsim))
dfsim.head(3)

### Did the simulation work?

In [None]:
# Load the simulation table and preview the rows of the current case.
dfsim = bpx.open_simulation_table()
print(len(dfsim))
print(">>>", case)

dfsim2 = dfsim.loc[dfsim.case == case]
dfsim2.head(3)

In [None]:
# Look up the simulation rows of the current case for one (lfc, fdr) pair.
# NOTE: this overwrites the notebook-level abs_lfc_cutoff / fdr_lfc_cutoff values.
abs_lfc_cutoff = 0.95
fdr_lfc_cutoff = 0.05
print(">>>", case)

# The cutoff columns are floats; np.isclose avoids missing a row because of a
# last-digit representation difference that an exact `==` would trip over.
dfsim[
    (dfsim.case == case)
    & np.isclose(dfsim.abs_lfc_cutoff, abs_lfc_cutoff)
    & np.isclose(dfsim.fdr_lfc_cutoff, fdr_lfc_cutoff)
]

In [None]:
# Distinct abs-LFC cutoffs present in the simulation table.
np.unique(dfsim.abs_lfc_cutoff)

In [None]:
# Distinct FDR cutoffs present in the simulation table.
np.unique(dfsim.fdr_lfc_cutoff)

In [None]:
# Range (min, max) of the abs-LFC cutoffs in the simulation table.
(dfsim.abs_lfc_cutoff.min(), dfsim.abs_lfc_cutoff.max())

In [None]:
# Range (min, max) of the FDR cutoffs in the simulation table.
(dfsim.fdr_lfc_cutoff.min(), dfsim.fdr_lfc_cutoff.max())

In [None]:
# Range (min, max) of the abs-LFC cutoffs in the simulation table.
(dfsim.abs_lfc_cutoff.min(), dfsim.abs_lfc_cutoff.max())

In [None]:
# Range (min, max) of the FDR cutoffs in the simulation table.
(dfsim.fdr_lfc_cutoff.min(), dfsim.fdr_lfc_cutoff.max())

In [None]:
# Preview the simulation rows of the current case.
dfsim[ dfsim.case == case ].head(3)

In [None]:
# Report how many simulation rows exist for each case.
for case in case_list:
    dfsim2 = dfsim.loc[dfsim.case == case]
    print(f"{case} \thas {len(dfsim2)} LFC cutoff simulations")

In [None]:
# Optional review of a single case (disabled by default).
want_review_data = False

if want_review_data:
    # NOTE(review): index 3 is hard-coded, so case_list must hold at least 4 entries.
    i=3
    case = case_list[i]
    bpx.open_case(case, verbose=False)
    
    fname, fname_ori, title = bpx.set_lfc_names()
    print(f"fname '{fname}' and title '{title}'")
    print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")
    
    print("")
    bpx.echo_parameters()

In [None]:
# When review is enabled, open every case and echo its parameters.
if want_review_data:

    for case in case_list:
        bpx.open_case(case, verbose=False)
        print(">>>", case)
        bpx.echo_parameters()
        print("\n\n")

### DEGs/DAPs frequency
### Not Normalized

In [None]:
# Load the table returned by cfg.open_all_lfc_cutoff() and show its last rows.
# (The commented-out line below is an earlier way of reading it, kept as found.)
#dfsim = pdreadcsv( bpx.cfg.fname_lfc_cutoff, bpx.cfg.root_config)
dfsim = bpx.cfg.open_all_lfc_cutoff()
print(len(dfsim))
dfsim.tail(3)

In [None]:
# Call set_db(0) on bpx, then take an independent copy of the simulation rows of the first case.
bpx.set_db(0)

i=0
case = case_list[i]
print(">>>", case)
df2 = dfsim[dfsim.case == case].copy()
print(len(df2))
df2.head(2)

In [None]:
# Reload the simulation table (replaces the dfsim loaded above) and preview it.
dfsim = bpx.open_simulation_table()
print(len(dfsim))
dfsim.head(2)

In [None]:
# Display the current abs_lfc_cutoff_inf value held by bpx.
bpx.abs_lfc_cutoff_inf 

In [None]:
# Set abs_lfc_cutoff_inf on bpx to 0 (presumably the lower abs-LFC bound — confirm in biopax_lib).
bpx.abs_lfc_cutoff_inf = 0

## Calc all Spearman Correlations - filter the 5 best not repeated fdrs
#### Plot abs_LFC x num of DEG/DAPs
#### corr_cutoff = -0.90
#### calc correlation with bpx.abs_lfc_cutoff_inf = 0.4

In [None]:
# !pip3 install -U kaleido

In [None]:
# Settings for the correlation step: correlation cutoff and the number of FDR
# records passed as nregs_fdr.
want_calc = False
corr_cutoff = -0.90
nregs_fdr = 5

# Use 0.4 as abs_lfc_cutoff_inf for this calculation.
bpx.abs_lfc_cutoff_inf = 0.4
force = False
verbose = False

df_all_fdr = bpx.calc_all_LFC_FDR_cutoffs(
    cols2=['n_degs', 'abs_lfc_cutoff'],
    corr_cutoff=corr_cutoff,
    nregs_fdr=nregs_fdr,
    force=force,
    verbose=verbose,
)
print(len(df_all_fdr))

In [None]:
# Preview the correlation rows of the case at index 2.
i = 2
df_all_fdr[df_all_fdr.case == case_list[i]].head(3)

In [None]:
# Preview the rows of the next case; relies on `i` set in the previous cell.
df_all_fdr[df_all_fdr.case == case_list[i+1]].head(3)

In [None]:
# Cases present in the correlation table.
df_all_fdr.case.unique()

In [None]:
# All correlation rows of the case at index 6 (requires at least 7 cases).
i = 6
df_all_fdr[df_all_fdr.case == case_list[i]]

In [None]:
# All correlation rows of the case at index 7 (requires at least 8 cases).
i = 7
df_all_fdr[df_all_fdr.case == case_list[i]]

### Plot abs_LFC x num of DEP/DEGs
  - set abs_lfc_cutoff_inf

In [None]:
# Display the current correlation settings and the case list.
corr_cutoff, nregs_fdr, case_list

In [None]:
# Select the case at index 2 for the cells below.
case  = case_list[2]

In [None]:
# Compute the nDEG-vs-LFC curve per FDR for the selected case using Spearman correlation.
cols2 = ['n_degs', 'abs_lfc_cutoff']
limit_fdr = -1
method = 'spearman'

ret, dic_return = bpx.calc_nDEG_curve_per_LFC_FDR(
    case=case,
    cols2=cols2,
    corr_cutoff=corr_cutoff,
    nregs_fdr=nregs_fdr,
    method=method,
    verbose=verbose,
)

len(dic_return)

In [None]:
# Keys available in the returned dictionary.
list(dic_return.keys())

In [None]:
# Number of rows in the 'df_fdr' entry.
len(dic_return['df_fdr'])

In [None]:
# Size and content of the 'name_list' entry.
len(dic_return['name_list']), dic_return['name_list']

In [None]:
# Size and content of the 'fdrs' entry.
len(dic_return['fdrs']), dic_return['fdrs']

In [None]:
# Pull the 'df_fdr' table out of the result dictionary and display it.
df_fdr = dic_return['df_fdr']
df_fdr

In [None]:
# Column names of df_fdr.
df_fdr.columns

In [None]:
# The 'corr' column of df_fdr.
df_fdr['corr']

In [None]:
# Plot with abs_lfc_cutoff_inf set to 0.
bpx.abs_lfc_cutoff_inf = 0.
verbose = False

# Open the case at index 2 of bpx.case_list.
case = bpx.case_list[2]
bpx.open_case(case)

# Returns a status flag, a dict of figures (iterated below) and an FDR table.
ret, dic_fig, df_fdr = bpx.plot_nDEG_curve_per_LFC_FDR(case, width=1100, height=700, title=None, 
                                                       corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, verbose=verbose)

# Show every figure returned for this case, labelled by its key.
for key, fig in dic_fig.items():
    print(key)
    fig.show()

In [None]:
# Plot with abs_lfc_cutoff_inf set to 0.
bpx.abs_lfc_cutoff_inf = 0.

# Pass the notebook's `verbose` flag to `verbose`; the original passed `force`
# there by mistake. Both are False at this point, so the output is unchanged.
dic_fig = bpx.plot_all_LFC_FDR_cutoffs(width=1100, height=700, title=None, 
                                       corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, verbose=verbose)

# Show the first figure available for each case.
for case in case_list:
    print(">>>", case)
    try:
        dic2 = dic_fig[case]
    except (KeyError, TypeError):
        # No figures for this case (missing key, or nothing was returned at all).
        # Catching only these keeps real errors and Ctrl-C visible.
        continue

    for key, fig in dic2.items():
        print("\t", key)
        fig.show()
        # Only the first figure of each case is displayed.
        break
    print("")

In [None]:
# Recompute the correlation table with abs_lfc_cutoff_inf = 0.4.
bpx.abs_lfc_cutoff_inf = 0.4
force=False

# Pass the notebook's `verbose` flag to `verbose`; the original passed `force`
# there by mistake. Both are False at this point, so the output is unchanged.
df_all_fdr = bpx.calc_all_LFC_FDR_cutoffs(cols2=cols2, corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, force=force, verbose=verbose)
print(len(df_all_fdr))
df_all_fdr.head(3)

In [None]:
# Load the FDR/LFC correlation table of the case at index 2, using the
# abs_lfc_cutoff_inf currently set on bpx. This replaces the all-cases df_all_fdr.
abs_lfc_cutoff_inf = bpx.abs_lfc_cutoff_inf
case = case_list[2]

df_all_fdr = bpx.open_fdr_lfc_correlation(case, abs_lfc_cutoff_inf)
print(len(df_all_fdr))

# df_all_fdr[df_all_fdr['first'] == True]

df_all_fdr

### Summary DEG/DEPs + Up and Down (pre-best cutoff)

In [None]:
# Display the abs_lfc_cutoff_inf value in effect for the summaries below.
bpx.abs_lfc_cutoff_inf

In [None]:
# Up/down summary with per_biotype=False and ensembl=False.
verbose=False
per_biotype= False
ensembl = False

dfa = bpx.summary_degs_up_down(per_biotype=per_biotype, ensembl=ensembl, verbose=verbose)
print(len(dfa))
dfa

In [None]:
# Up/down summary with per_biotype=True and ensembl=False.
verbose=False
per_biotype= True
ensembl = False

dfa = bpx.summary_degs_up_down(per_biotype=per_biotype, ensembl=ensembl, verbose=verbose)
print(len(dfa))
dfa

In [None]:
# Bar plot of up/down genes per case, called with per_biotype=True,
# ensembl=True and before_best_cutoff=True.
per_biotype = True
ensembl = True
before_best_cutoff = True

fig, dfa = bpx.barplot_up_down_genes_per_case(per_biotype=per_biotype, ensembl=ensembl, before_best_cutoff=before_best_cutoff, width=1100, height=700, verbose=False)
fig.show()

In [None]:
# Figure produced by plot_all_degs_up_down_per_cutoffs (verbose output enabled).
fig = bpx.plot_all_degs_up_down_per_cutoffs(width=1100, height=450, title=None, y_anchor=1.05, verbose=True)
fig.show()

In [None]:
# Summary table with per_biotype=False and ensembl=False.
dfa = bpx.summary_degs_up_down(per_biotype=False, ensembl=False, verbose=False)
dfa

In [None]:
# Summary table with per_biotype=True and ensembl=False.
dfa = bpx.summary_degs_up_down(per_biotype=True, ensembl=False, verbose=False)
dfa

In [None]:
# Summary table with per_biotype=True and ensembl=True.
dfa = bpx.summary_degs_up_down(per_biotype=True, ensembl=True, verbose=False)
dfa

In [None]:
# Final review: for every case, print its file name, title, cutoffs and parameters.
want_review_data = True

if want_review_data:

    for case in case_list:
        # open_case returns a status flag first. Skip cases that fail to open,
        # as the earlier per-case loop does, so no stale state from the
        # previous case is printed under this case's name.
        ret = bpx.open_case(case, verbose=False)[0]
        if not ret:
            print(f">>> {case}: could not be opened, skipping.")
            continue

        fname, fname_ori, title = bpx.set_lfc_names()
        print(f"fname '{fname}' and title '{title}'")
        print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")

        print("")
        bpx.echo_parameters()
        print("\n\n---------------------------\n\n")