In [1]:
# Print the interpreter version so the recorded outputs can be tied to it.
from platform import python_version
print(python_version())

3.11.9


In [2]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Put the project's ../src/ directory on the import path for the local modules below.
sys.path.insert(1, '../src/')

# NOTE(review): wildcard imports hide where names come from. Config, Biopax,
# Pubmed, pdreadcsv and title_replace used later are presumably provided by
# these modules -- confirm and import them explicitly.
from Basic import *
from entrez_conversion import *
from pubmed_lib import *
from biopax_lib import *

import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Contact e-mail passed to Pubmed(...) further down.
email = "flalix@gmail.com"

### PubMed search

#### scispaCy corpus data

https://allenai.github.io/scispacy/

In [3]:
# ---- Data locations (relative to the notebook directory) ----
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0      = '../../colaboracoes/aparecida/'

# ---- Study identification ----
project = 'Medulloblastoma microarray study'
s_project = 'medulloblastoma'

gene_protein = 'dna'
s_omics = 'microarray'

has_age = False
has_gender = False

# ---- Expression normalization ----
# `exp_normalization` is the value handed to Biopax; `normalization` is the
# string label used when looking up / setting the LFC cutoffs.
want_normalized = False
exp_normalization = 'quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

# ---- Enrichment settings ----
abs_lfc_cutoff_inf = 0.80
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr = 3

#------------ pathway pseudo-modulation index ------------
tolerance_pathway_index = 0.15
type_sat_ptw_index = 'linear_sat'
saturation_lfc_index = 5

# ---- Cases under study; the first one is selected by default ----
case_list = ['WNT', 'G4']
case = case_list[0]

cfg = Config(project, s_project, case_list, root0)

# Placeholders (-1 = not computed yet); n_degs is replaced by the lookup just below.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1

# Use the `normalization` label computed above instead of a hard-coded
# 'not_normalized', so this lookup follows `want_normalized` like the rest
# of the notebook does.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# ---- Pathway-level cutoffs ----
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

G/P LFC cutoffs: lfc=1.000; fdr=0.050
Pathway cutoffs: pval=0.050; fdr=0.050; num of genes=3


In [4]:
# Pathway of interest, written as '<pathway name> - <pathway id>'.
# Only one value can be active; earlier candidates are kept here as a menu
# instead of as assignments that were overwritten immediately:
#   'Sensory Processing Of Sound By Inner Hair Cells Of Cochlea - R-HSA-9662360'
#   'Cardiac Conduction - R-HSA-5576891'
#   'RHOB GTPase Cycle - R-HSA-9013026'
#   'Gap Junction Assembly - R-HSA-190861'
#   'Opioid Signaling - R-HSA-111885'
pathway_name_id = 'Neuronal System - R-HSA-112316'

# Build the Biopax object from the configuration values defined earlier.
bpx = Biopax(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, clone_objects=False,
             exp_normalization=exp_normalization, geneset_num=0,
             num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
             tolerance_pathway_index=tolerance_pathway_index,
             s_pathw_enrichm_method=s_pathw_enrichm_method)

# Re-select the first case so this cell does not depend on a later cell's choice.
case = case_list[0]

# Set the default cutoffs for this normalization label, then open the case.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()

# Keep the gene-set number chosen by the Biopax object.
geneset_num = bpx.geneset_num

Start opening tables ....
Building synonym dictionary ...


Echo Parameters:
For case 'WNT', there are 1043/766 DEGs/DEGs with ensembl_id
DEG's cutoffs: abs(LFC)=1.000; FDR=0.130
	1043/766 DEGs/ensembl.
		Up 340/218 DEGs/ensembl.
		Dw 703/548 DEGs/ensembl.

Found 68 (best=68) pathways for geneset num=0 'Reactome_2022'
Pathway cutoffs p-value=0.050 fdr=0.150 min genes=3
DEGs found in enriched pathways:
	There are 766 DEGs found in pathways
	279 (best=279) DEGs in pathways and 764/487 DEGs/ensembl not in pathways

	74 DEGs ensembl Up in pathways
	144 DEGs Up ensembl not in pathways

	205 DEGs ensembl Dw in pathways
	343 DEGs Dw ensembl not in pathways


In [5]:
# Prefix and publication-date window handed to Pubmed.
prefix = s_project
inidate = "2019/10/01"
enddate = "2030/12/31"

force_query = False
verbose_query = False

# Two separate statements on purpose: the former one-liner ended in a
# trailing comma (`retmax=100000,`), which bound the tuple (100000,)
# instead of the integer.
sleep_entrez = [30, 90, 300]
retmax = 100000

# Abbreviations passed to Pubmed as remove_synonym_list:
#   CAP: community-acquired pneumonia
#   MV:  mechanical ventilator
remove_synonym_list = ['CAP', 'MV', 'MDB']

gem = None

# NOTE(review): the call uses its own sleep_entrez=[5, 7, 10] rather than the
# `sleep_entrez` list defined above -- confirm which schedule is intended.
pub = Pubmed(bpx, gem, email, prefix, inidate, enddate,
             root0, remove_synonym_list=remove_synonym_list,
             sleep_entrez=[5, 7, 10], retmax=retmax,
             try_all_text=True, text_quote='',
             root_colab=root_colab, dec_ncpus=2)

Start opening tables ....
Building synonym dictionary ...

File ../src/down_pdf_pmid.sh exists True


### PubMed Search

In [10]:
# Run the PubMed search for every case and collect the summary in `dfg`.
verbose=False

# Search-term lists; presumably the leading 'NOT' is the boolean operator
# applied to the terms that follow it -- confirm in pubmed_lib.
terms_not_param = ['NOT', 'COVID', 'SARS-CoV']
terms1_param = ['medulloblastoma']
connective_param = 'AND'

# save_file=False / force=False: nothing is requested to be written or forced here.
dfg = pub.run_all_cases_summarize_pubmed_search(with_gender=False, terms1=terms1_param, 
                                                terms_not=terms_not_param, connective=connective_param, 
                                                test=False, save_file=False, force=False, verbose=verbose)

# Number of summary rows, then a preview of the first three.
print(len(dfg))
dfg.head(3)

>>> WNT
>>> Total pmid's: 59
>>> G4
>>> Total pmid's: 4
12


Unnamed: 0,case,pathway_id,pathway,n
0,WNT,R-HSA-195721,Signaling By WNT,49
1,WNT,R-HSA-162582,Signal Transduction,2
2,WNT,R-HSA-888590,"GABA Synthesis, Release, Reuptake And Degradation",2


In [8]:
# Show the summary rows belonging to the first case in case_list.
i=0
case=case_list[i]
dfg[dfg.case == case]

Unnamed: 0,case,pathway_id,pathway,n
0,WNT,R-HSA-195721,Signaling By WNT,49
1,WNT,R-HSA-162582,Signal Transduction,2
2,WNT,R-HSA-888590,"GABA Synthesis, Release, Reuptake And Degradation",2
3,WNT,R-HSA-112316,Neuronal System,1
4,WNT,R-HSA-1296071,Potassium Channels,1
5,WNT,R-HSA-1630316,Glycosaminoglycan Metabolism,1
6,WNT,R-HSA-3000157,Laminin Interactions,1
7,WNT,R-HSA-419037,NCAM1 Interactions,1
8,WNT,R-HSA-4641265,Repression Of WNT Target Genes,1


In [9]:
# Show the summary rows belonging to the second case in case_list.
i=1
case=case_list[i]
dfg[dfg.case == case]

Unnamed: 0,case,pathway_id,pathway,n
9,G4,R-HSA-112316,Neuronal System,2
10,G4,R-HSA-109582,Hemostasis,1
11,G4,R-HSA-162582,Signal Transduction,1


### Development & tests

In [None]:
# Inline prototype of the per-pathway summary: count `pmid` entries per
# (pathway_id, pathway) pair.
# NOTE(review): `dfs` is not assigned by any cell visible above this one, so
# this cell depends on a later cell having been run first -- confirm the
# intended execution order.
dfg = dfs.groupby(['pathway_id', 'pathway']).pmid.count().reset_index().iloc[:,:3]
# Tag every row with the currently selected case.
dfg['case'] = case
# Rename positionally: the count column (third position) becomes 'n'.
cols = ['pathway_id', 'pathway', 'n', 'case']
dfg.columns = cols
# Reorder so that 'case' is the first column.
cols = ['case', 'pathway_id', 'pathway', 'n']
dfg = dfg[cols]
# Largest counts first, then renumber the rows 0..len-1.
dfg = dfg.sort_values('n', ascending=False)
dfg.index = np.arange(0, len(dfg))
dfg

In [None]:
# For each case: locate its PubMed result file, load it and summarize it.
# NOTE(review): `dfg` is overwritten on every iteration, so after the loop it
# holds only the summary of the last case whose file was found.
for case in case_list:
    # Build the file name from the Pubmed object's template and settings.
    fname_case = pub.fname_no_symb_case%(pub.prefix, case, pub.s_all_text_or_abstract)
    fname_case = title_replace(fname_case)
    fullname = os.path.join(pub.root_pubmed, fname_case)
    
    # Skip cases whose result file is missing instead of failing.
    if not os.path.exists(fullname):
        print(f"Could not find {case}: {fullname}")
        continue

    dfs = pdreadcsv(fname_case, pub.root_pubmed)
    print(f"Found {case}: {len(dfs)} regs")

    # The loop variable `case` is the module-level name, so the summary is
    # labelled with the case being processed.
    dfg = summarize_pubmed_search(dfs)

    # fname = 'summary_per_pathway_%s_case_%s.tsv'%(pub.prefix, case)
    # ret = pdwritecsv(dfg, fname, pub.root_pubmed, verbose=True)


In [None]:
# Read the per-pathway summary file of the first case from pub.root_pubmed.
# NOTE(review): the file must already exist on disk; the only visible code
# that would write it is commented out -- confirm where it is produced.
i = 0
case = case_list[i]

fname = 'summary_per_pathway_%s_case_%s.tsv'%(pub.prefix, case)

dfg = pdreadcsv(fname, pub.root_pubmed, verbose=False)
print(len(dfg))
dfg

In [None]:
# Re-open the currently selected case and take the `df_enr` table from bpx
# (presumably the pathway-enrichment table -- confirm in biopax_lib).
ret, _, _, _ = bpx.open_case(case, prompt_verbose=False, verbose=False)
df_enr = bpx.df_enr
print(len(df_enr))

In [None]:
# Rows of `df_enr` whose pathway_id does not occur in the PubMed summary `dfg`,
# renumbered from 0.
dfg_not = df_enr[~df_enr.pathway_id.isin(dfg.pathway_id)].reset_index(drop=True)
print(len(dfg_not))

# Columns to display. A wider alternative that was previously assigned and
# immediately overwritten:
#   ['pathway', 'pathway_id', 'pval', 'fdr', 'odds_ratio', 'combined_score', 'genes', 'num_of_genes']
cols = ['pathway', 'pathway_id', 'fdr', 'genes', 'num_of_genes']

dfg_not[cols]

In [None]:
# List the columns available in dfg_not.
dfg_not.columns

In [None]:
# Read the per-pathway summary file of the second case from pub.root_pubmed.
# NOTE(review): the file must already exist on disk -- confirm where it is produced.
i = 1
case = case_list[i]

fname = 'summary_per_pathway_%s_case_%s.tsv'%(pub.prefix, case)

dfg = pdreadcsv(fname, pub.root_pubmed, verbose=False)
print(len(dfg))
dfg

In [None]:
# Run the pathway PubMed search for a single case.
verbose=False
test = False

# NOTE(review): these reassign the module-level term lists to COVID/SARS
# terms, whereas the all-cases search earlier in the notebook used
# 'medulloblastoma' -- confirm these are the intended terms for this project.
terms_not_param = ['NOT', 'MERS', 'SARS-CoV-1']
terms1_param = ["OR", 'COVID', 'SARS-CoV-2']
connective_param = 'AND'

i = 0
case = case_list[i]
print(">>>", case)

df_case = pub.run_case_pathway_pubmed_search(case=case, with_gender=False, terms1=terms1_param, 
                                             terms_not=terms_not_param, connective=connective_param, 
                                             test=test, save_file=False, force=False, verbose=verbose)
# Fall back to an empty frame so the len()/head() calls below still work
# when the search returns None.
if df_case is None:
    df_case = pd.DataFrame()
print(len(df_case))
df_case.head(3)

In [None]:
def summarize_pubmed_search(dfs, case=None):
    """Count PubMed hits per pathway and return them ordered by frequency.

    Parameters
    ----------
    dfs : pandas.DataFrame or None
        Search results. Unless empty, it must provide the columns
        'pathway_id', 'pathway' and 'pmid'.
    case : optional
        Label written to the 'case' column. When omitted, the module-level
        variable ``case`` is used (the original behaviour of this function);
        if no such variable exists the column holds None.

    Returns
    -------
    pandas.DataFrame
        Columns ['case', 'pathway_id', 'pathway', 'n'], where 'n' is the
        number of non-null 'pmid' values per (pathway_id, pathway) pair,
        sorted by 'n' in descending order with the index renumbered from 0.
        An empty frame with these columns is returned for None/empty input.
    """
    cols = ['case', 'pathway_id', 'pathway', 'n']

    if case is None:
        # Backward compatibility: earlier callers relied on the global `case`.
        case = globals().get('case')

    # An empty frame (e.g. pd.DataFrame() used as a "no results" fallback)
    # has no 'pathway_id' column, so groupby would raise KeyError.
    if dfs is None or len(dfs) == 0:
        return pd.DataFrame(columns=cols)

    dfg = dfs.groupby(['pathway_id', 'pathway']).pmid.count().reset_index(name='n')
    dfg['case'] = case
    dfg = dfg[cols].sort_values('n', ascending=False).reset_index(drop=True)
    return dfg


# Summarize the single-case search result and show the number of rows and the table.
dfg = summarize_pubmed_search(df_case)
print(len(dfg))
dfg

### With Gender

In [None]:
# Show which case is currently selected.
case

In [None]:
# Summary table currently stored on the Pubmed object.
# NOTE(review): every search call visible in this notebook passes
# with_gender=False -- confirm a with-gender run populated this table.
dfgen = pub.df_summ_pathway 
dfgen

### Without Gender

In [None]:
# Repeat the single-case search without gender and show the summary table
# read from the Pubmed object afterwards.
# Relies on terms1_param, terms_not_param, connective_param, test and verbose
# having been set by an earlier cell.
i = 0
case = case_list[i]

df_case = pub.run_case_pathway_pubmed_search(case=case, with_gender=False, terms1=terms1_param, 
                                             terms_not=terms_not_param, connective=connective_param, 
                                             test=test, save_file=False, force=False, verbose=verbose)

dfwog = pub.df_summ_pathway
dfwog