In [None]:
from platform import python_version
print(python_version())

### Calculating DEGs statistics

### For each LFC/FDR cutoff set we get a different set of DEGs
  - LFC: LFC cutoff and FDR_LFC cutoff
  - Pathway: fdr and pval pathway cutoff and min num of genes

### Up and Down
  - Up and Down DEGs/DAPs
  - Up and Down in pathways

### there are 2 statistical tables
  - pval/fdr cutoff x degs
  - pval/fdr/geneset/quantile degs_in_pathway, num_pathways

In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

sys.path.insert(1, '../src/')

from Basic import *
from biopax_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# !pip3 install pyyaml
with open('params.yml', 'r') as file:
    dic_yml = yaml.safe_load(file)

In [None]:
root0 = dic_yml['root0']
email = dic_yml['email']

project = dic_yml['project']
s_project = dic_yml['s_project']

gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

case_list = dic_yml['case_list']

pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# Normalization passed to Biopax: 'quantile_norm' when want_normalized is set, otherwise None.
exp_normalization='quantile_norm' if want_normalized else None
# String label for the same choice; None is spelled 'not_normalized'.
normalization='not_normalized' if exp_normalization is None else exp_normalization

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders initialised to -1; n_degs is overwritten by the call on the next line.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1
# NOTE(review): 'not_normalized' is hard-coded here instead of using `normalization` —
# confirm this is intended when want_normalized is True.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, 'not_normalized')


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
bpx = Biopax(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, clone_objects=False,
             exp_normalization=exp_normalization, geneset_num=0, 
             num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr, 
             tolerance_pathway_index=tolerance_pathway_index, 
             s_pathw_enrichm_method = s_pathw_enrichm_method,
             abs_lfc_cutoff_inf = abs_lfc_cutoff_inf, 
             type_sat_ptw_index=type_sat_ptw_index, saturation_lfc_index=saturation_lfc_index)

case = case_list[0]

bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, prompt_verbose=True, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()
geneset_num = bpx.geneset_num

In [None]:
case = case_list[1]
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, prompt_verbose=True, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()

In [None]:
bpx.fname_lfc_table0, bpx.fname_final_lfc_table0, bpx.fname_enrich_table0

In [None]:
fname, fname_cutoff = bpx.set_enrichment_name()
fname, os.path.exists(os.path.join(bpx.root_enrich, fname)), fname_cutoff, os.path.exists(os.path.join(bpx.root_enrich, fname_cutoff))

In [None]:
bpx.case, bpx.group, bpx.gender, bpx.age, bpx.s_omics

In [None]:
bpx.geneset_num, bpx.geneset_lib

In [None]:
# bpx.gene.df_my_gene.head(2)

### Removing or renaming config files: only the default cutoffs are defined

In [None]:
# For every case: open it, then report the LFC file name, title, the LFC/FDR
# cutoffs in use and the number of DEGs/DAPs. Cases that fail to open are skipped.
for case in case_list:
    print(">>>", case)
    # The third returned value is unpacked as `degs_ensembl`, as in the other
    # open_case calls in this notebook; it used to be unpacked into
    # `fname_final_ori`, which is reassigned by set_lfc_names() below.
    ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

    if not ret:
        continue

    fname_final_ori, fname_ori, title = bpx.set_lfc_names()
    print(f"fname '{fname_final_ori}' and title '{title}'")
    print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")

    print(f"{bpx.s_deg_dap}s = {len(degs)}\n")

In [None]:
print(case)
bpx.split_case(case)
bpx.case, bpx.gender, bpx.age

In [None]:
fname_final_ori, fname_ori, title = bpx.set_lfc_names()
fname_final_ori, fname_ori, title

In [None]:
fname, fname_cutoff = bpx.set_enrichment_name()
fname, os.path.exists(os.path.join(bpx.root_enrich, fname)), fname_cutoff, os.path.exists(os.path.join(bpx.root_enrich, fname_cutoff))

In [None]:
# Read bpx.dflfc_ori; fall back to an empty DataFrame so the cell still
# renders when the table is not available.
try:
    dflfc_ori = bpx.dflfc_ori
    print(len(dflfc_ori))
except Exception as err:
    # Report the reason instead of hiding it. Unlike a bare `except:`, this
    # lets KeyboardInterrupt / SystemExit propagate.
    print(f"Could not read bpx.dflfc_ori: {err!r}")
    dflfc_ori = pd.DataFrame()

dflfc_ori.head(3)

In [None]:
# Keep only the rows whose biotype is one of the lncRNA labels listed below.
lista = ['lncRNA', 'LNC']
dflfc_lnc = dflfc_ori.loc[dflfc_ori.biotype.isin(lista)]
print(len(dflfc_lnc))
dflfc_lnc.tail(3)

In [None]:
# Keep only the rows of the LFC table that have a non-null `symbol`.
dflfc_ori = bpx.dflfc_ori
print(len(dflfc_ori))

try:
    # Mask on the `symbol` column. Masking with pd.isnull() of the whole frame
    # keeps every row and only blanks individual cells, so it filtered nothing.
    dflfc_ori_symb = dflfc_ori[pd.notnull(dflfc_ori.symbol)]
except Exception as err:
    # Best-effort fallback, but report why (e.g. missing `symbol` column).
    print(f"Could not filter dflfc_ori by symbol: {err!r}")
    dflfc_ori_symb = pd.DataFrame()

print(len(dflfc_ori_symb))
dflfc_ori_symb.head(3)

### Microarray with 28,232 unique symbols

In [None]:
# Count the distinct gene symbols in the LFC table.
try:
    # Drop nulls first: np.unique sorts its input, and a column mixing strings
    # with NaN raises TypeError, which previously ended up as a count of 0.
    symbols = np.unique(dflfc_ori.symbol.dropna())
except Exception as err:
    # Best-effort fallback, but report why (e.g. missing `symbol` column).
    print(f"Could not list unique symbols: {err!r}")
    symbols = []

len(symbols)

In [None]:
# Read bpx.dflfc; fall back to an empty DataFrame so the cell still renders
# when the table is not available.
try:
    dflfc = bpx.dflfc
    print(len(dflfc))
except Exception as err:
    # Report the reason instead of hiding it. Unlike a bare `except:`, this
    # lets KeyboardInterrupt / SystemExit propagate.
    print(f"Could not read bpx.dflfc: {err!r}")
    dflfc = pd.DataFrame()

dflfc.head(3)

In [None]:
dfbest = bpx.cfg.open_best_ptw_cutoff()
dfbest

In [None]:
want_see_best_cutoff = False

if want_see_best_cutoff:
    dfbest = bpx.cfg.dfbest_cutoffs
else:
    dfbest = pd.DataFrame()
dfbest    

In [None]:
if want_see_best_cutoff:
    dfbest = bpx.cfg.dfbest_cutoffs
    dfa = dfbest[(dfbest.case == case) & (dfbest.normalization == normalization) & (dfbest.geneset_num == geneset_num) ]
else:
    dfa = pd.DataFrame()

dfa

In [None]:
# Rows of bpx.dflfc_ori whose FDR is below the current FDR cutoff of the case.
try:
    dflfc = bpx.dflfc_ori[(bpx.dflfc_ori.fdr < bpx.fdr_lfc_cutoff)]
    print(len(dflfc))
except Exception as err:
    # Best-effort fallback, but report why instead of silently returning an
    # empty table. KeyboardInterrupt / SystemExit are no longer swallowed.
    print(f"Could not filter bpx.dflfc_ori by FDR: {err!r}")
    dflfc = pd.DataFrame()

dflfc.head(3)

In [None]:
for case in case_list:
    print(">>>", case)
    ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

    if not ret: continue
    
    fname_final_ori, fname_ori, title = bpx.set_lfc_names()
    print(f"fname '{fname_final_ori}' and title '{title}'")
    print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")
    
    print(f"{bpx.s_deg_dap}s = {len(degs)}\n")

### Minimum LFC cutoff

### DEGs simulation: number of DEGs/DAPs per case
### Saving simulation file dfsim in config:
  - all_lfc_cutoffs_taubate_covid19.tsv

#### Sampling

### Cutoff sets to generate the statistical data
  - inf lfc cutoff: 0.40 --> 0.48 ~ 40% modulation  --> 0.65
  - sup fdr cutoff: 0.75 --> no more than

In [None]:
# Descending grid of |LFC| cutoffs, 1.0 down to 0.0 in steps of 0.025,
# rounded to 3 decimals.
lfc_list = np.arange(1.0, -0.01, -.025).round(3)
bpx.lfc_list = lfc_list
# Pin the last grid point to exactly 0.0. The patch is done in place after the
# assignment above, so bpx.lfc_list sees it too as long as it refers to this
# same array (assumes a plain attribute — TODO confirm).
lfc_list[-1] = 0.0
lfc_list

In [None]:
# Grid of FDR cutoffs from 0.05 up to about 0.75 in steps of 0.01.
# np.arange with a float step yields values such as 0.060000000000000005, so
# the grid is rounded to 3 decimals. That matches cutoff_list (also rounded to
# 3 decimals) and the exact `== fdr` lookups done on the simulation table.
fdr_list = np.round(np.arange(0.05, 0.76, .01), 3)
bpx.fdr_list = fdr_list
fdr_list

In [None]:
# Cartesian product of the |LFC| grid and the FDR grid: one (lfc, fdr) row per
# combination, LFC varying slowest, rounded to 3 decimals.
cutoff_pairs = [(lfc, fdr) for lfc in lfc_list for fdr in fdr_list]
cutoff_list = np.round(cutoff_pairs, 3)
print(len(cutoff_list))
cutoff_list[:5], cutoff_list[-5:]

### Saving simulations

config/all_lfc_cutoffs_medulloblastoma.tsv

In [None]:
# Run the DEG/DAP cutoff simulation over every (lfc, fdr) pair in cutoff_list,
# then order the result by case, FDR cutoff and |LFC| cutoff.
force = False
save_file = False

# save_file (original author's note):
# in list_of_degs_set_params ... save excel files

# Pass `save_file` itself: the call used `save_file=force`, which left the
# save_file switch above without any effect.
dfsim = bpx.calc_degs_cutoff_simulation(cutoff_list=cutoff_list, force=force, save_file=save_file, n_echo=-1, verbose=False)
dfsim = dfsim.sort_values(['case', 'fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[False, True, False])
print(dfsim.columns)
print(len(dfsim))

In [None]:
dfsim.head(3)

In [None]:
dfsim.tail(3)

### Did the simulation work?

In [None]:
dfsim = bpx.open_simulation_table()
print(len(dfsim))

dfsim2 = dfsim[dfsim.case == case]
dfsim2.head(3)

In [None]:
bpx.lfc_list

In [None]:
bpx.fdr_list

In [None]:
# Sanity check: simulation rows at the lowest |LFC| cutoff (0.0) and FDR 0.05.
abs_lfc_cutoff = 0.0
fdr_lfc_cutoff = 0.05

# (dfsim.case == case) &
# np.isclose instead of `==`: the cutoff columns are floats built from grids
# and read back from file, so an exact comparison can silently match no rows.
dfsim[ np.isclose(dfsim.abs_lfc_cutoff, abs_lfc_cutoff) & np.isclose(dfsim.fdr_lfc_cutoff, fdr_lfc_cutoff)]

In [None]:
# Sanity check: simulation rows at |LFC| cutoff 0.95 and FDR 0.05.
abs_lfc_cutoff = 0.95
fdr_lfc_cutoff = 0.05

# (dfsim.case == case) &
# np.isclose instead of `==`: the cutoff columns are floats built from grids
# and read back from file, so an exact comparison can silently match no rows.
dfsim[ np.isclose(dfsim.abs_lfc_cutoff, abs_lfc_cutoff) & np.isclose(dfsim.fdr_lfc_cutoff, fdr_lfc_cutoff)]

### Simulations

In [None]:
for case in case_list:
    dfsim2 = dfsim[ dfsim.case == case ]
    print(f"{case} \thas {len(dfsim2)} LFC cutoff simulations")

## Calc all Spearman Correlations - filter the 5 best not repeated fdrs
#### Plot abs_LFC x num of DEG/DAPs
#### corr_cutoff = -.90
#### calc correlation with bpx.abs_lfc_cutoff_inf = 0.4

In [None]:
df2 = dfsim[dfsim.case == 'WNT']
len(df2)

In [None]:
bpx.abs_lfc_cutoff_inf = 0.2; fdr = 0.05
bpx.abs_lfc_cutoff_inf, fdr

In [None]:
df2 = df2.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

dfsim2 = df2[ (df2.fdr_lfc_cutoff == fdr) & (df2.abs_lfc_cutoff >= bpx.abs_lfc_cutoff_inf) ]

cols2=['n_degs', 'fdr_lfc_cutoff', 'abs_lfc_cutoff']

dfsim2[cols2]

In [None]:
fdr_list

### Plot all dfsim

In [None]:
# !pip3 install -U kaleido

In [None]:
verbose=False

for case in case_list:
    print(">>>", case)
    dic_fig = bpx.plot_all_dfsim(dfsim, case=case, fdr_list=fdr_list, width=1100, height=700, title=None, verbose=verbose)
        
    for key, fig in dic_fig.items():
        print("\t", key)
        fig.show()
        break # remove to see Up and Dw

    print("\n")
    

In [None]:
# dfsim.columnscutoff_list

### Restricting the best fdr by Spearman's Correlation

### Must calc for each abs_lfc_cutoff_inf

In [None]:
# Parameters for the FDR selection, with the lower |LFC| bound set to 0.
corr_cutoff = -0.90
nregs_fdr = 10
bpx.abs_lfc_cutoff_inf = 0

verbose = False
force = False

# calc_all_LFC_FDR_cutoffs:
#     for case_list
#         call calc_nDEG_curve_per_LFC_FDR()
df_all_fdr = bpx.calc_all_LFC_FDR_cutoffs(
    cols2=['n_degs', 'abs_lfc_cutoff'],
    corr_cutoff=corr_cutoff,
    nregs_fdr=nregs_fdr,
    force=force,
    verbose=verbose,
)
print(len(df_all_fdr))

### Medulloblastoma abs_lfc_cutoff_inf = 0.65

In [None]:
dfsim = bpx.dfsim[bpx.dfsim.case == case]
dfsim = dfsim.sort_values(['fdr_lfc_cutoff', 'abs_lfc_cutoff'], ascending=[True, False])

dfsim.fdr_lfc_cutoff.unique(), dfsim.abs_lfc_cutoff.unique()

### For FDR == 0.05 (default cutoff) - there is no correlation, is a horizontal flat line for 0.05

In [None]:
fdr = 0.05
dfsim2 = dfsim[ (dfsim.fdr_lfc_cutoff == fdr) & (dfsim.abs_lfc_cutoff >= bpx.abs_lfc_cutoff_inf) ]
len(dfsim2)

In [None]:
dfsim2[cols2].head(5)

In [None]:
dfsim2[cols2].tail(5)

In [None]:
# Spearman correlation between the number of DEGs and the |LFC| cutoff for the
# rows selected in dfsim2.
method = 'spearman'
# Name the two columns explicitly. The `cols2` left over from an earlier cell
# also contains 'fdr_lfc_cutoff', so `.iloc[0, 1]` picked the correlation of
# n_degs with the FDR cutoff, which is constant in dfsim2 and always gives NaN.
corr = dfsim2[['n_degs', 'abs_lfc_cutoff']].corr(method=method).iloc[0, 1]
corr

In [None]:
pd.isnull(corr)

### bpx.abs_lfc_cutoff_inf = 0.80

In [None]:
# Same FDR selection, now with the lower |LFC| bound raised to 0.80.
# corr_cutoff is reused from the earlier cell that defines it.
nregs_fdr = 10
bpx.abs_lfc_cutoff_inf = 0.80

verbose = True
force = False

# calc_all_LFC_FDR_cutoffs:
#     for case_list
#         call calc_nDEG_curve_per_LFC_FDR()
df_all_fdr = bpx.calc_all_LFC_FDR_cutoffs(
    cols2=['n_degs', 'abs_lfc_cutoff'],
    corr_cutoff=corr_cutoff,
    nregs_fdr=nregs_fdr,
    force=force,
    verbose=verbose,
)
print(len(df_all_fdr))

### WNT - Spearman starts 1714 DEGs !!!

In [None]:
case = case_list[0]

df2 = df_all_fdr[ (df_all_fdr.case == case) & ( pd.notnull(df_all_fdr['corr'])  ) ]
print(len(df2))
df2.head(6)

### G4 Spearman starts 1555 DEGs

In [None]:
case = case_list[1]

df2 = df_all_fdr[ (df_all_fdr.case == case) & ( pd.notnull(df_all_fdr['corr'])  ) ]
print(len(df2))
df2.head(6)

In [None]:
df2.tail(5)

### Plot abs_LFC x num of DEGs/DAPs
  - set abs_lfc_cutoff_inf

In [None]:
corr_cutoff, nregs_fdr

In [None]:
case = case_list[0]

cols2=['n_degs', 'abs_lfc_cutoff']
method='spearman'
verbose = True

ret, dic_return = bpx.calc_nDEG_curve_per_LFC_FDR(case=case, cols2=cols2, 
                                                  corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr,
                                                  method=method, force=False, verbose=verbose)

In [None]:
list(dic_return.keys())

In [None]:
len(dic_return['df_fdr'])

In [None]:
len(dic_return['name_list']), dic_return['name_list']

In [None]:
len(dic_return['fdrs']), np.array(dic_return['fdrs'])

In [None]:
bpx.abs_lfc_cutoff_inf

In [None]:
df_fdr = dic_return['df_fdr']
df_fdr.head(3)

In [None]:
corr_cutoff

In [None]:
verbose = False

case = case_list[0]
bpx.open_case(case)

ret, dic_fig, df_fdr = bpx.plot_nDEG_curve_per_LFC_FDR(case, width=1100, height=700, title=None, 
                                                       corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, verbose=verbose)

for key, fig in dic_fig.items():
    print(key)
    fig.show()

In [None]:
verbose = False

case = case_list[1]
bpx.open_case(case)

ret, dic_fig, df_fdr = bpx.plot_nDEG_curve_per_LFC_FDR(case, width=1100, height=700, title=None, 
                                                       corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, verbose=verbose)

for key, fig in dic_fig.items():
    print(key)
    fig.show()

### Testing calc_nDEG_curve_per_LFC_FDR()

In [None]:
corr_cutoff, nregs_fdr, method

In [None]:
cols2 = ['n_degs', 'abs_lfc_cutoff']
verbose = True

case = case_list[0]

ret, dic_return = bpx.calc_nDEG_curve_per_LFC_FDR(case=case, cols2=cols2,
                                                  corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr,
                                                  method=method, verbose=verbose)

In [None]:
dic_return.keys(), len(dic_return['df_fdr'])

In [None]:
dic_return['df_fdr'].head(3)

## abs_lfc_cutoff_inf = 0.80

In [None]:
bpx.abs_lfc_cutoff_inf

### Plotting only Spearman's threshold curves

In [None]:
# Build the LFC/FDR cutoff figures for all cases and show the first figure of
# each case that has any.
verbose = False

dic_fig = bpx.plot_all_LFC_FDR_cutoffs(width=1100, height=700, title=None,
                                       corr_cutoff=corr_cutoff, nregs_fdr=nregs_fdr, verbose=verbose)

for case in case_list:
    print(">>>", case)
    try:
        dic2 = dic_fig[case]
    except Exception as err:
        # Skip cases without figures, but say why. Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        print(f"\tno figures for '{case}': {err!r}")
        continue

    for key, fig in dic2.items():
        print("\t", key)
        fig.show()
        break  # only the first figure is shown; remove to show all of them
    print("")

In [None]:
abs_lfc_cutoff_inf = .80

In [None]:
case = case_list[0]

df_all_fdr = bpx.open_fdr_lfc_correlation(case, abs_lfc_cutoff_inf)
df2 = df_all_fdr[ pd.notnull(df_all_fdr['corr']) ]
print(len(df2))
df2.head(6)

In [None]:
case = case_list[1]

df_all_fdr = bpx.open_fdr_lfc_correlation(case, abs_lfc_cutoff_inf)
df2 = df_all_fdr[ pd.notnull(df_all_fdr['corr']) ]
print(len(df2))
df2.head(6)

### Summary DEG/DAPs + Up and Down (pre-best cutoff)

In [None]:
verbose=False
per_biotype= False
ensembl = False

dfa = bpx.summary_degs_up_down(per_biotype=per_biotype, ensembl=ensembl, verbose=verbose)
print(len(dfa))
dfa.T

In [None]:
verbose=False
per_biotype= True
ensembl = False

dfa = bpx.summary_degs_up_down(per_biotype=per_biotype, ensembl=ensembl, verbose=verbose)
print(len(dfa))
dfa

In [None]:
per_biotype = True
ensembl = True
before_best_cutoff = True
fig, dfa = bpx.barplot_up_down_genes_per_case(per_biotype=per_biotype, ensembl=ensembl, before_best_cutoff=before_best_cutoff, width=1100, height=700, verbose=False)
fig.show()

In [None]:
width = 1700

fig = bpx.plot_all_degs_up_down_per_cutoffs(width=width, height=600, title=None, y_anchor=1.05, verbose=True)
fig.show()

In [None]:
dfa = bpx.summary_degs_up_down(per_biotype=False, ensembl=False, verbose=False)
dfa

In [None]:
dfa = bpx.summary_degs_up_down(per_biotype=True, ensembl=False, verbose=False)
dfa

In [None]:
dfa = bpx.summary_degs_up_down(per_biotype=True, ensembl=True, verbose=False)
dfa

In [None]:
want_review_data = True

if want_review_data:
    i=0
    case = case_list[i]
    bpx.open_case(case, verbose=False)
    
    fname, fname_ori, title = bpx.set_lfc_names()
    print(f"fname '{fname}' and title '{title}'")
    print(f"LFC cutoff: lfc={bpx.abs_lfc_cutoff:.3f} fdr={bpx.fdr_lfc_cutoff}")
    
    print("")
    bpx.echo_parameters()

In [None]:
if want_review_data:

    for case in case_list:
        bpx.open_case(case, verbose=False)
        print(">>>", case)
        bpx.echo_parameters()
        print("\n\n")

### LNCs

In [None]:
# Keep only the rows whose biotype is one of the lncRNA labels listed below.
lista = ['lncRNA', 'LNC']
dflfc_lnc = dflfc_ori.loc[dflfc_ori.biotype.isin(lista)]
print(len(dflfc_lnc))
dflfc_lnc.tail(3)

In [None]:
np.unique(dflfc_lnc._type)

In [None]:
np.unique(dflfc_lnc.biotype)

In [None]:
# Wider column selection that used to be assigned here and was overwritten on
# the very next statement (dead code); kept as a comment for reference:
#   'probe', 'symbol', 'symbol_prev', 'symb_or_syn', 'biotype', '_type', 'lfc',
#   'abs_lfc', 'pval', 'fdr', 'mean_exp', 't', 'B', 'description', 'desc_gff',
#   'description_prev', 'accession', 'ensembl_id', 'ensembl_transc_id', 'geneid',
#   'cytoband', 'symbol_pipe', 'seqname', 'start', 'end', 'go_id', 'seq'

# Columns actually shown in the lncRNA report.
cols = ['probe', 'symbol', 'biotype', '_type', 'lfc', 'fdr', 'desc_gff', 'accession', 'ensembl_id', 'ensembl_transc_id', 'cytoband', 'seqname', 'start', 'end', 'seq']
print(len(dflfc_lnc))

# Sort by 'abs_lfc' descending, then keep only the report columns.
dflfc_lnc = dflfc_lnc.sort_values('abs_lfc', ascending=False)
df2 = dflfc_lnc[cols]
df2.head(30)

In [None]:
fname = 'microarray_ncRNAs.tsv'
pdwritecsv(df2, fname, bpx.root_result, verbose=True)

### Havana is the Ensembl curation project

In [None]:
dfgff = bpx.gene.prepare_final_gff(force=False, verbose=True)
print(len(dfgff))
dfgff.head(3)

### DEGs/DAPs frequency
### Not Normalized

In [None]:
#dfsim = pdreadcsv( bpx.cfg.fname_lfc_cutoff, bpx.cfg.root_config)
dfsim = bpx.cfg.open_all_lfc_cutoff()
print(len(dfsim))
dfsim.tail(3)

### WNT

In [None]:
bpx.set_db(0)

i=0
case = case_list[i]
print(">>>", case)
df2 = dfsim[dfsim.case == case]
print(len(df2))
df2.head(3)

### G4

In [None]:
i=1
case = case_list[i]
print(">>>", case)
df2 = dfsim[dfsim.case == case]
print(len(df2))
df2.head(3)