## Sampling pathways

In [None]:
from platform import python_version

# Print the interpreter version so the runtime is recorded with the notebook output.
print(f"{python_version()}")

In [None]:
# Standard-library helpers used throughout the notebook.
import os, sys, pickle

# Numerical / tabular stack.
import numpy as np
import pandas as pd
# pandas display options: wrap output at 100 characters, limit the displayed
# column width to 80 characters and show floats with 3 decimal places.
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

# YAML parser used to read the parameter file.
import yaml

# Plot styling: seaborn "notebook" context with fonts scaled by 1.4.
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

# Plotting backends.
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Put the project source directory on the import path (at index 1, i.e. right
# after the first entry) so the project-local modules below can be imported.
sys.path.insert(1, '../src/')

# Project-local modules; the star-imports pull their public names into the
# notebook namespace. A later star-import can shadow a name brought in by an
# earlier one, so keep this order.
# NOTE(review): presumably Config, Biopax and Gemini used further down come
# from these modules - confirm which module defines each.
from Basic import *
from biopax_lib import *
from gemini_lib import *

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries above.
import warnings
warnings.filterwarnings("ignore")

# Inject CSS that sets the --jp-notebook-max-width variable to 100%
# (presumably so the notebook uses the full browser width).
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Load the run parameters from params.yml (relative to the working directory)
# into the dict `dic_yml`. Requires PyYAML:
# !pip3 install pyyaml
#
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which can fail or garble non-ASCII text in the
# YAML file on systems whose default is not UTF-8.
# yaml.safe_load is used, so only plain YAML data types are constructed.
# NOTE: an API key is read from this same file later in the notebook, so keep
# params.yml out of version control.
with open('params.yml', 'r', encoding='utf-8') as file:
    dic_yml=yaml.safe_load(file)

In [None]:
# Copy the run parameters out of the parsed YAML mapping into notebook-level
# names. The group labels below follow the key names only; params.yml is the
# authoritative description of each entry.

# Root location and contact
root0 = dic_yml['root0']
email = dic_yml['email']

# Project identifiers
project = dic_yml['project']
s_project = dic_yml['s_project']

# Omics description
gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

# Covariate flags
has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

# Whether a normalized expression table is requested
want_normalized = dic_yml['want_normalized']

# Enrichment settings
abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

# Pathway-index and sampling settings
tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

# Case lists (all cases and the selected subset)
case_list = dic_yml['case_list']
case_sel_list = dic_yml['case_sel_list']

# Pathway cutoffs
pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

# Run / model selections
run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# Normalization passed to the analysis objects: 'quantile_norm' when requested,
# otherwise None.
if want_normalized:
    exp_normalization = 'quantile_norm'
else:
    exp_normalization = None

# String label for the same choice; None is spelled 'not_normalized'.
normalization = exp_normalization if exp_normalization is not None else 'not_normalized'

cfg = Config(project, s_project, case_list, root0)

# Work on the first configured case.
case = case_list[0]

# Counters start at the placeholder -1; n_degs is overwritten just below.
n_genes_annot_ptw = n_degs = n_degs_in_ptw = n_degs_not_in_ptw = degs_in_all_ratio = -1

# NOTE(review): the label is hard-coded to 'not_normalized' here although
# `normalization` was derived above - confirm this is intended when
# want_normalized is True.
(abs_lfc_cutoff, fdr_lfc_cutoff,
 n_degs, n_degs_up, n_degs_dw) = cfg.get_best_lfc_cutoff(case, 'not_normalized')


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the Biopax analysis object from the parameters read above.
bpx = Biopax(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

# Work on the first configured case.
case = case_list[0]

# Register default cutoffs (|LFC| = 1, FDR = 0.05) for the current normalization
# label, then open the case.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()

# Keep the gene-set number held by the object available at notebook level.
geneset_num = bpx.geneset_num

In [None]:
# Ask bpx for the two enrichment names and display them.
fname, fname_cutoff = bpx.set_enrichment_name()
fname, fname_cutoff

### Run all, after finding a case
  - Please open dfr (Reactome) beforehand.

In [None]:
# Length of the enrichment table currently held by bpx.
Nenr = len(bpx.df_enr)
Nenr

In [None]:
# Display the currently opened case next to the length of bpx.df_enr.
bpx.case, len(bpx.df_enr)

### MSD (multiple sources dataset): is_seldata=True

In [None]:
# Flag handed to the Gemini helper below: work on the selected dataset.
is_seldata = True

In [None]:
# Credentials and disease context come from the same YAML parameter mapping.
API_KEY = dic_yml['API_KEY']

disease = dic_yml['disease']
context_disease = dic_yml['context_disease']
n_sentences = dic_yml['n_sentences']
chosen_model_sampling = dic_yml['chosen_model_sampling']

# Build the Gemini helper around the Biopax object created earlier.
gem = Gemini(
    bpx=bpx,
    is_seldata=is_seldata,
    disease=disease,
    context_disease=context_disease,
    API_KEY=API_KEY,
    n_sentences=n_sentences,
    root0=root0,
    chosen_model_list=chosen_model_list,
    i_dfp_list=i_dfp_list,
    chosen_model_sampling=chosen_model_sampling,
)
print("\n")
print(gem.disease, gem.is_seldata, gem.i_dfp_list, gem.chosen_model_list)
print("Context:", context_disease)

In [None]:
# Hand the currently opened case and the two enrichment objects held by bpx
# (df_enr and df_enr0) over to the Gemini helper.
gem.set_case(bpx.case, bpx.df_enr, bpx.df_enr0)

In [None]:
# Display the case list of the Biopax object referenced by gem.
gem.bpx.case_list

### Settings: selected data

In [None]:
# Run flags for this section.
test = False
force = False
verbose = False

# Model selection.
chosen_model = 3
gemini_model = 'gemini-1.5-flash'

# Query type and total sample size.
query_type = 'strong'
N = 30

# First two entries of the selected-case list.
case_sel0 = case_sel_list[0]
case_sel1 = case_sel_list[1]

with_gender = False

# ---- overrides for the selected-data run ----
# These rebind run_list, chosen_model_list, i_dfp_list and case_list at
# notebook level. Only chosen_model_list is also pushed to gem.
# NOTE(review): gem.i_dfp_list is not updated to the new i_dfp_list - confirm
# that this is intended.
run_list = ['run01']
chosen_model_list = [3]
gem.chosen_model_list = chosen_model_list

i_dfp_list = [0]
case_list = case_sel_list
# ---- end of overrides ----

# Open the yes/no sampling for the first selected case and preview it.
case = case_sel0
print("")
dfsel = gem.open_yes_no_sampling(case=case, N=N, query_type=query_type, verbose=True)
print("")
dfsel.head(3)

### Select random results 

In [None]:
# Display N, the total sample size; with N=30 this is presumably split into
# 15 "Yes" and 15 "No" pathways (TODO confirm in select_random_results).
N

In [None]:
# Work on the first selected case.
case = case_sel0
cols = ['pathway_id', 'pathway', 'fdr', 'curation']

# Results folder of run 'run01' under the Gemini root directory.
root = os.path.join(gem.root_gemini0, 'run01')

verbose = False
df_yes, df_no, df_sel_yes, df_sel_no = gem.select_random_results(
    case=case,
    chosen_model=chosen_model,
    N=N,
    root=root,
    query_type=query_type,
    verbose=verbose,
)

# Show which model gem reports, then the sizes of the two selections.
print(gem.gemini_model)
len(df_sel_yes), len(df_sel_no)

In [None]:
# Preview the first two entries of df_sel_yes (third value returned by select_random_results).
df_sel_yes.head(2)

In [None]:
# Preview the first two entries of df_sel_no (fourth value returned by select_random_results).
df_sel_no.head(2)

### Save randomly selected pathways

In [None]:
verbose = True
force = False

# Merge the two selections and save them; force and verbose are passed through unchanged.
dff = gem.merge_and_save_random_df_yes_no(
    N, case, query_type,
    df_sel_yes, df_sel_no,
    force=force, verbose=verbose,
)
print(len(dff))
dff.head(6)

In [None]:
verbose = False

# Repeat the selection with the case, model, N, root and query type already set in the notebook.
df_yes, df_no, df_sel_yes, df_sel_no = gem.select_random_results(
    case=case,
    chosen_model=chosen_model,
    N=N,
    root=root,
    query_type=query_type,
    verbose=verbose,
)

# Show which model gem reports, then the sizes of the two selections.
print(gem.gemini_model)
len(df_sel_yes), len(df_sel_no)

In [None]:
verbose = True
force = False

# Merge the two selections and save them; force and verbose are passed through unchanged.
dff = gem.merge_and_save_random_df_yes_no(
    N, case, query_type,
    df_sel_yes, df_sel_no,
    force=force, verbose=verbose,
)
print(len(dff))
dff.head(6)