In [1]:
# Print the interpreter version so the captured output records which Python ran this notebook.
from platform import python_version
print(python_version())

3.12.0


In [2]:
# Standard library, scientific stack, plotting, and project-local modules used by this notebook.
import os, sys, pickle

import numpy as np
import pandas as pd
# pandas display options: 100-character output width, cell text limited to 80 characters.
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Put ../src/ on the module search path; this must run before the local imports below.
sys.path.insert(1, '../src/')

# NOTE(review): wildcard imports hide the origin of names used later in the notebook
# (presumably Config, Biopax, Gemini and Pubmed come from these modules -- confirm and
# consider replacing with explicit imports). Their order matters: a later module's
# names shadow same-named ones from an earlier module.
from Basic import *
from entrez_conversion import *
from pubmed_lib import *
from gemini_lib import *
from biopax_lib import *

from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Contact e-mail; passed to Pubmed(...) in a later cell.
email = "flalix@gmail.com"

### PubMed search

#### scispaCy corpus data

https://allenai.github.io/scispacy/

In [3]:
# ---------------------------------------------------------------------------
# Project configuration: data locations, omics settings, cutoffs, case list.
# ---------------------------------------------------------------------------

# Data roots, relative to the notebook's working directory.
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0       = '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/'

project = 'Taubate COVID-19'
s_project = 'taubate_covid19'

gene_protein = 'protein'
s_omics = 'proteomics'

has_age = True
has_gender = True

# Normalization toggle: `exp_normalization` is None when no normalization is wanted,
# and `normalization` is the matching text label.
want_normalized = False
exp_normalization='quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

abs_lfc_cutoff_inf = 0.40
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr=3
tolerance_pathway_index = 0.15

case_list = ['g2a_male', 'g2a_female', 
             'g2b_male', 'g2b_female', 
             'g3_male_adult',   'g3_male_elder',
             'g3_female_adult', 'g3_female_elder']

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Sentinel values (-1); n_degs is overwritten immediately by the call below.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1

# Use the `normalization` label derived above instead of a hard-coded 'not_normalized',
# so this lookup follows the `want_normalized` toggle (same value while it is False).
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# Pathway-enrichment cutoffs echoed below.
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

G/P LFC cutoffs: lfc=1.000; fdr=0.050
Pathway cutoffs: pval=0.050; fdr=0.050; num of genes=3


In [4]:
# Pathway of interest. Alternatives tried previously are kept as comments;
# only the last assignment was ever effective.
#   'Hemostasis - R-HSA-109582'
#   'Regulation Of IGF Transport And Uptake By IGFBPs - R-HSA-381426'
#   'Platelet degranulate - R-HSA-114608'
#   'Platelet Activation, Signaling And Aggregation - R-HSA-76002'
#   'Integrin Cell Surface Interactions - R-HSA-216083'
#   'Neutrophil Degranulation - R-HSA-6798695'
#   'Regulation of Complement cascade - R-HSA-977606'
pathway_name_id = 'Response To Elevated Platelet Cytosolic Ca2+ - R-HSA-76005'

# Build the Biopax object from the configuration defined in the previous cell.
bpx = Biopax(
    gene_protein,
    s_omics,
    project,
    s_project,
    root0,
    case_list,
    has_age,
    has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
)

# Switch the working case to 'g3_male_elder' (index 5 of case_list).
case = case_list[5]

# Set default cutoffs for this normalization label, then open the case.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

print("\nEcho Parameters:")
bpx.echo_parameters()

# Keep the geneset number reported by the Biopax object for later cells.
geneset_num = bpx.geneset_num

Start opening tables ....
Building synonym dictionary ...


Echo Parameters:
For case 'g3_male_elder', there are 140/140 DAPs/DAPs with ensembl_id
DAP's cutoffs: abs(LFC)=0.600; FDR=0.400
	140/140 DAPs/ensembl.
		Up 40/40 DAPs/ensembl.
		Dw 100/100 DAPs/ensembl.

Found 60 (best=60) pathways for geneset num=0 'Reactome_2022'
Pathway cutoffs p-value=0.050 fdr=0.050 min genes=3
DAPs found in enriched pathways:
	There are 140 DAPs found in pathways
	105 (best=105) DAPs in pathways and 35/35 DAPs/ensembl not in pathways

	34 DAPs ensembl Up in pathways
	6 DAPs Up ensembl not in pathways

	71 DAPs ensembl Dw in pathways
	29 DAPs Dw ensembl not in pathways


In [5]:
# SECURITY: the API key must never be stored in the notebook. It is read from the
# GEMINI_API_KEY environment variable instead. The key that was previously committed
# here should be considered leaked and revoked.
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the GEMINI_API_KEY environment variable before running this cell.")

disease='COVID-19'
context_disease="COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system."

n_sentences=5
chosen_model_list=[1,3]
i_dfp_list=[0,1,2,3]

# Build the Gemini helper on top of the Biopax object created earlier.
gem=Gemini(bpx=bpx, disease=disease, context_disease=context_disease, n_sentences=n_sentences, API_KEY=API_KEY, 
             root0=root0, i_dfp_list=i_dfp_list, chosen_model_list=chosen_model_list)
print("\n",context_disease)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.



 COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system.


In [6]:
# PubMed search window and file prefix.
prefix = s_project
inidate="2019/10/01"
enddate="2030/12/31"

print(prefix, inidate, enddate)

force_query = False
verbose_query=False

# Fixed: the original line ended with a trailing comma (`retmax=100000,`), which
# silently bound `retmax` to the tuple (100000,) instead of the integer 100000.
# NOTE(review): these two notebook-level values are NOT what Pubmed(...) receives
# below -- the call passes its own literals (sleep_entrez=[5, 7, 10]). Confirm which
# sleep schedule is intended.
sleep_entrez = [30, 90, 300]
retmax = 100000

# Glossary for abbreviations listed in remove_synonym_list
# (MDB is not expanded in the original notes):
#   CAP: community-acquired pneumonia
#   MV:  mechanical ventilator
remove_synonym_list =  ['CAP', 'MV', 'MDB']

pub = Pubmed(bpx, gem, email, prefix, inidate, enddate, 
             root0, remove_synonym_list=remove_synonym_list, 
             sleep_entrez = [5, 7, 10], retmax=100000,  
             try_all_text=True, text_quote='',
             root_colab=root_colab, dec_ncpus=2)

taubate_covid19 2019/10/01 2030/12/31
Start opening tables ....
Building synonym dictionary ...

File ../src/down_pdf_pmid.sh exists True


### PubMed Search Selected Pathways

In [7]:
# Full path of the PubMed-to-Reactome terms table; show whether it exists and where.
fullname = os.path.join(pub.root_refseq, pub.fname_pubmed_reatome)
(os.path.exists(fullname), fullname)

(True, '../../colaboracoes/refseq/pubmed_to_reatome_terms_table.tsv')

In [8]:
# Run flags for this cell.
force = False
verbose = False

# Sampling parameters.
query_type = 'strong'
N = 30

# Model selection.
chosen_model = 3
gemini_model = 'gemini-1.5-flash'

# Cases of interest; the first one is opened here (this rebinds the notebook-level `case`).
case_sel_list = ['g3_male_adult', 'g3_female_elder']
case = case_sel_list[0]

# Open the yes/no sampling table for the selected case and preview it.
dfsel = pub.gem.open_yes_no_sampling(
    case=case,
    gemini_model=gemini_model,
    N=N,
    query_type=query_type,
    verbose=verbose,
)
dfsel.head(3)

Unnamed: 0,pathway_id,pathway,fdr
0,R-HSA-8953854,Metabolism Of RNA,0.906
1,R-HSA-194068,Bile Acid And Bile Salt Metabolism,0.255
2,R-HSA-9029569,NR1H3 And NR1H2 Regulate Gene Expression Linked To Cholesterol Transport And...,0.000128


In [9]:
# Get the pathway -> search-term table for the sampled pathways in dfsel and preview it.
df_ptw_terms = pub.calc_reactome_terms_table(dfsel, verbose=True)
df_ptw_terms.head(3)

Table opened ((227, 3)) at '../../colaboracoes/refseq/pubmed_to_reatome_terms_table.tsv'
All pathways already have terms.


Unnamed: 0,pathway_id,pathway,term
0,R-HSA-114608,Platelet Degranulation,Platelet Degranulation
1,R-HSA-76005,Response To Elevated Platelet Cytosolic Ca2+,Platelet Calcium
2,R-HSA-140877,Formation Of Fibrin Clot (Clotting Cascade),Fibrin Clot


In [10]:
# Rows whose search term is missing (null) or an empty string; True below means none exist.
dfa = df_ptw_terms[df_ptw_terms.term.isna() | df_ptw_terms.term.eq('')]
dfa.empty

True

In [11]:
# Keep only the term rows whose pathway_id appears in dfsel (independent copy).
df_ptw_pubmed = df_ptw_terms.loc[
    df_ptw_terms["pathway_id"].isin(dfsel["pathway_id"])
].copy()
df_ptw_pubmed.head(3)

Unnamed: 0,pathway_id,pathway,term
0,R-HSA-114608,Platelet Degranulation,Platelet Degranulation
31,R-HSA-6798695,Neutrophil Degranulation,Neutrophil Degranulation
36,R-HSA-5663202,Diseases Of Signal Transduction By Growth Factor Receptors And Second Messen...,Growth Factor Receptors And Second Messenger


In [12]:
# Run flags.
test=False
force=False
verbose=False

case_sel_list = ['g3_male_adult', 'g3_female_elder']
run_list=['run01', 'run02']

query_type='strong'
N=30

chosen_model=3
gemini_model='gemini-1.5-flash'

i_dfp=0

# Query-term settings, hoisted out of the loops because they never change per iteration.
# NOTE(review): these three names are NOT passed to run_case_pathway_pubmed_search
# below, so the search relies on whatever that method uses internally -- confirm
# whether they were meant to be forwarded.
terms_not_param = ['NOT', 'MERS', 'SARS-CoV-1']
terms1_param = ["OR", 'COVID', 'SARS-CoV-2']
connective_param = 'AND'

# Run the pathway PubMed search for every selected case, with and without gender.
for case in case_sel_list:
    for with_gender in [True, False]:
        print(">>>",  case, with_gender)

        _ = pub.run_case_pathway_pubmed_search(case=case, i_dfp=i_dfp, with_gender=with_gender, 
                                               gemini_model=gemini_model, N=N, query_type=query_type,
                                               force=force, verbose=verbose)

    print("")
print("-------------- end --------------")

>>> g3_male_adult True
# Searching in PubMed crossing terms and pathway terms; no symbols:
 0 0 0 0 3221 - found for for R-HSA-1643685 'Disease': AND '['OR', 'COVID', 'SARS-CoV-2']' '['male', 'severe']' '['Disease']'
 3 - found for for R-HSA-373076 'Class A/1 (Rhodopsin-like Receptors)': AND '['OR', 'COVID', 'SARS-CoV-2']' '['male', 'severe']' '['Class', 'A/1']'
 0 0 0 0 0 2 - found for for R-HSA-112316 'Neuronal System': AND '['OR', 'COVID', 'SARS-CoV-2']' '['male', 'severe']' '['Neuronal', 'System']'
 0 0 0 0 0 0 2 - found for for R-HSA-8953854 'Metabolism Of RNA': AND '['OR', 'COVID', 'SARS-CoV-2']' '['male', 'severe']' '['Metabolism', 'RNA']'
 0 0 0 7 - found for for R-HSA-1280218 'Adaptive Immune System': AND '['OR', 'COVID', 'SARS-CoV-2']' '['male', 'severe']' '['Adaptive', 'Immune', 'System']'
 0 134 - found for for R-HSA-74160 'Gene Expression (Transcription)': AND '['OR', 'COVID', 'SARS-CoV-2']' '['male', 'severe']' '['Transcription']'
 1 - found for for R-HSA-196071 'Metaboli