In [1]:
# Record the interpreter version the notebook was executed with.
from platform import python_version
print(python_version())

3.12.0


In [2]:
# Standard library.
import os, sys, pickle

# Numerics / tables; widen the pandas text display.
import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)

# Plotting.
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Make the project modules under ../src/ importable.
sys.path.insert(1, '../src/')

# NOTE(review): star imports hide where names come from. Config, Biopax, Gemini,
# Pubmed, pdreadcsv, pdwritecsv, isint and title_replace are used further down
# but are not defined in this notebook, so they presumably come from these modules.
from Basic import *
from entrez_conversion import *
from pubmed_lib import *
from gemini_lib import *
from biopax_lib import *

# Stretch the notebook container to the full browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Contact e-mail handed to Pubmed(...) further down.
# NOTE(review): a personal address is committed with the notebook; consider
# reading it from configuration instead.
email = "flalix@gmail.com"

### PubMed search

#### scispaCy corpus data

https://allenai.github.io/scispacy/

In [3]:
# --- Locations (relative to the notebook directory) -------------------------
root_chibe = "../../chibe/"
root_colab = '../../colaboracoes/'
root0       = '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/'

# --- Project identification --------------------------------------------------
project = 'Taubate COVID-19'
s_project = 'taubate_covid19'

gene_protein = 'protein'
s_omics = 'proteomics'

has_age = True
has_gender = True

# --- Normalization -----------------------------------------------------------
# `exp_normalization` is the value passed to Biopax(...); `normalization` is the
# string label passed wherever LFC cutoffs are looked up or set.
want_normalized = False
exp_normalization = 'quantile_norm' if want_normalized else None
normalization = 'not_normalized' if exp_normalization is None else exp_normalization

# --- Enrichment parameters ---------------------------------------------------
abs_lfc_cutoff_inf = 0.40
s_pathw_enrichm_method = 'enricher'
num_min_degs_for_ptw_enr = 3
tolerance_pathway_index = 0.15

case_list = ['g2a_male', 'g2a_female',
             'g2b_male', 'g2b_female',
             'g3_male_adult',   'g3_male_elder',
             'g3_female_adult', 'g3_female_elder']

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders; -1 presumably means "not computed yet" (n_degs is overwritten
# by the cutoff lookup on the next statement).
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1

# Fix: look the cutoffs up with the derived `normalization` label instead of the
# hardcoded 'not_normalized', so that switching `want_normalized` above cannot
# silently read the cutoffs of the other normalization.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

# --- Pathway cutoffs ---------------------------------------------------------
pval_pathway_cutoff = 0.05
fdr_pathway_cutoff = .05
num_of_genes_cutoff = 3

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

G/P LFC cutoffs: lfc=1.000; fdr=0.050
Pathway cutoffs: pval=0.050; fdr=0.050; num of genes=3


In [4]:
# Pathway of interest. Only the last assignment of the original cell took
# effect; the earlier candidates are kept here for reference:
#   'Hemostasis - R-HSA-109582'
#   'Regulation Of IGF Transport And Uptake By IGFBPs - R-HSA-381426'
#   'Platelet degranulate - R-HSA-114608'
#   'Platelet Activation, Signaling And Aggregation - R-HSA-76002'
#   'Integrin Cell Surface Interactions - R-HSA-216083'
#   'Neutrophil Degranulation - R-HSA-6798695'
#   'Regulation of Complement cascade - R-HSA-977606'
pathway_name_id = 'Response To Elevated Platelet Cytosolic Ca2+ - R-HSA-76005'

# Build the Biopax object from the project configuration defined earlier.
bpx = Biopax(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
)

# Open one case with explicit default cutoffs and echo what was loaded.
case = case_list[5]

bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()

geneset_num = bpx.geneset_num

Start opening tables ....
Building synonym dictionary ...


Echo Parameters:
For case 'g3_male_elder', there are 140/140 DAPs/DAPs with ensembl_id
DAP's cutoffs: abs(LFC)=0.600; FDR=0.400
	140/140 DAPs/ensembl.
		Up 40/40 DAPs/ensembl.
		Dw 100/100 DAPs/ensembl.

Found 60 (best=60) pathways for geneset num=0 'Reactome_2022'
Pathway cutoffs p-value=0.050 fdr=0.050 min genes=3
DAPs found in enriched pathways:
	There are 140 DAPs found in pathways
	105 (best=105) DAPs in pathways and 35/35 DAPs/ensembl not in pathways

	34 DAPs ensembl Up in pathways
	6 DAPs Up ensembl not in pathways

	71 DAPs ensembl Dw in pathways
	29 DAPs Dw ensembl not in pathways


In [5]:
# Security fix: the API key used to be hardcoded in this cell. It is now read
# from the GEMINI_API_KEY environment variable (`os` is imported in the setup
# cell) so that no credential is stored in the notebook.
# NOTE(review): the key that was previously committed here should be revoked
# and replaced, since it is present in the notebook history.
API_KEY = os.environ.get('GEMINI_API_KEY')
if not API_KEY:
    raise RuntimeError("GEMINI_API_KEY is not set; export it before running this notebook.")

disease = 'COVID-19'
context_disease = "COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system."

n_sentences = 5
chosen_model_list = [1, 3]
i_dfp_list = [0, 1, 2, 3]

# Gemini helper bound to the Biopax object created above.
gem = Gemini(bpx=bpx, disease=disease, context_disease=context_disease, n_sentences=n_sentences, API_KEY=API_KEY,
             root0=root0, i_dfp_list=i_dfp_list, chosen_model_list=chosen_model_list)
print("\n",context_disease)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.



 COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system.


In [6]:
# File prefix and publication-date window handed to the Pubmed helper.
prefix = s_project
inidate = "2019/10/01"
enddate = "2030/12/31"

print(prefix, inidate, enddate)

force_query = False
verbose_query = False

# Fix: the original line `retmax=100000,` ended with a comma, which bound
# `retmax` to the one-element tuple (100000,) instead of the integer 100000.
sleep_entrez = [30, 90, 300]
retmax = 100000

# Acronyms listed for removal (presumably because they clash with gene/protein
# synonyms - confirm against Pubmed):
#   CAP: community-acquired pneumonia
#   MV:  mechanical ventilator
remove_synonym_list = ['CAP', 'MV', 'MDB']

# NOTE(review): the constructor receives its own sleep_entrez=[5, 7, 10]; the
# `sleep_entrez` variable above is only read by the older search cell near the
# end of the notebook. Confirm which back-off schedule is intended.
pub = Pubmed(bpx, gem, email, prefix, inidate, enddate,
             root0, remove_synonym_list=remove_synonym_list,
             sleep_entrez=[5, 7, 10], retmax=retmax,
             try_all_text=True, text_quote='',
             root_colab=root_colab, dec_ncpus=2)

taubate_covid19 2019/10/01 2030/12/31
Start opening tables ....
Building synonym dictionary ...

File ../src/down_pdf_pmid.sh exists True


### PubMed Search

In [None]:
# Run the pathway PubMed search for every case, once with and once without the
# gender flag.
test = False
force = False
verbose = False

for case in case_list:
    for with_gender in (True, False):
        print(">>>",  case, with_gender)

        # Fresh term lists on every call, exactly as in the original loop.
        terms_not_param = ['NOT', 'MERS', 'SARS-CoV-1']
        terms1_param = ["OR", 'COVID', 'SARS-CoV-2']
        connective_param = 'AND'

        _ = pub.run_case_pathway_pubmed_search(
            case=case,
            with_gender=with_gender,
            terms1=terms1_param,
            terms_not=terms_not_param,
            connective=connective_param,
            test=test,
            force=force,
            verbose=verbose,
        )

    # Blank line between cases.
    print("")
print("-------------- end --------------")

In [None]:
# Scratch check: the base exclusion list, and (as the displayed value) a new
# list that extends it with further age-group stems. `not_list` itself is unchanged.
not_list = ['elder']
[*not_list, 'young', 'child', 'neonat', 'newborn']

In [None]:
# Ask the Pubmed helper for the two term lists it builds (called here without
# arguments) and display the second one, `terms_not2`.
terms_and_or, terms_not2 = pub.case_to_terms()
terms_not2

In [None]:
# Single-case run of the pathway PubMed search, without saving or forcing.
verbose = False

terms_not_param = ['NOT', 'MERS', 'SARS-CoV-1']
terms1_param = ["OR", 'COVID', 'SARS-CoV-2']
connective_param = 'AND'

with_gender = True

i = 0
case = case_list[i]
print(">>>", case)

df_case = pub.run_case_pathway_pubmed_search(
    case=case,
    with_gender=with_gender,
    terms1=terms1_param,
    terms_not=terms_not_param,
    connective=connective_param,
    inidate=inidate,
    enddate=enddate,
    test=False,
    save_file=False,
    force=False,
    verbose=verbose,
)
# The search may return None; fall back to an empty frame so the calls below work.
df_case = pd.DataFrame() if df_case is None else df_case
print(len(df_case))
df_case.head(3)

In [None]:
# Peek at the first rows of the per-PMID summary held by the Pubmed helper.
pub.df_summ_pmid.head(3)

In [None]:
# Peek at the first rows of the per-pathway summary held by the Pubmed helper.
pub.df_summ_pathway.head(3)

In [None]:
# Column names of the single-case search result.
df_case.columns

### PubMed Search Old

In [None]:
# Pathway ids grouped by theme (5 groups).
pathway_name_id_list = [
    [
        'Hemostasis - R-HSA-109582',
        'Platelet degranulate - R-HSA-114608',
        'Platelet Activation, Signaling And Aggregation - R-HSA-76002',
        'Response To Elevated Platelet Cytosolic Ca2+ - R-HSA-76005',
    ],
    ['Regulation Of IGF Transport And Uptake By IGFBPs - R-HSA-381426'],
    ['Integrin Cell Surface Interactions - R-HSA-216083'],
    ['Neutrophil Degranulation - R-HSA-6798695'],
    ['Regulation of Complement cascade - R-HSA-977606'],
]

# Concept names and, position by position, the search stems for each concept
# (6 entries each).
pathway_concept_name_list = [
    'Hemostasis',
    'IGF Transport',
    'Integrins',
    'Neutrophils',
    'thrombosis',
    'vaccination',
]

pathway_concept_list = [
    ['hemost'],
    ['IGF', 'Insulin-Like Growth Factor'],
    ['integrin'],
    ['neutrophil', 'nets'],
    ['thrombo', 'clot'],
    ['vaccin'],
]

# Display both lengths next to the stems to check that they line up.
len(pathway_concept_name_list), len(pathway_concept_list), pathway_concept_list

In [None]:
# Display the search stems per concept.
pathway_concept_list

In [None]:
# Connective passed to the comparison calls further down.
connective = "AND"

# Topics for which a term set is built in the next cell.
choices = [
    'hemostasis', 'platelet', 'inflammation', 'complement', 'lung', 'cardio',
    'gut', 'elder', 'obese', 'brain', 'antibody', 'mab', 'cellular',
    'therapy', 'bad_therapy', 'death', 'sequel', 'severe', 'moderate',
    'biomarker', 'omics',
]

# Acronyms listed for removal:
#   CAP: community-acquired pneumonia
#   MV:  mechanical ventilator
remove_synonyms = ['CAP', 'MV', 'MDB']

In [None]:
# Build `dic_choice`: for every topic in `choices`, the three term lists used by
# the PubMed comparison calls. Each list appears to start with its boolean
# connective ('OR', 'NOT', ...) followed by the terms themselves.

terms_not = ['NOT', 'MERS', 'SARS-CoV-1']
terms1 = ["OR", 'COVID', 'SARS-CoV-2']

# choices = ['hemostasis', 'platelet', 'inflammation', 'death', 'sequel']

# topic -> (terms2, extra terms appended to the shared `terms_not`)
# This table replaces the original if/elif chain, which repeated the same four
# statements for every topic.
_CHOICE_TERMS = {
    'hemostasis':   (['OR', 'hemostasis', 'haemostasis'], []),
    'platelet':     (['OR', 'platelet'], []),
    'inflammation': (['OR', 'inflammation', 'innate'], ['complement']),
    'complement':   (['OR', 'complement'], []),
    'lung':         (['OR', 'lung', 'alveolus', 'trachea', 'bronch'], []),
    # NOTE(review): 'miocardi' looks like a misspelling of 'myocardi' - confirm before changing.
    'cardio':       (['OR', 'cardi', 'miocardi'], []),
    'gut':          (['OR', 'gut'], []),
    'elder':        (['OR', 'elder'], []),
    'obese':        (['OR', 'obese'], []),
    'brain':        (['OR', 'brain', 'nervous system'], []),
    # NOTE(review): 'innactivation' looks like a misspelling of 'inactivation' - confirm before changing.
    'antibody':     (['OR', 'antibody', 'b-cell', 'innactivation', 'humoral'],
                     ['monoclonal antibody', 'MAB', 'cellular response', 't-cell']),
    'mab':          (['OR', 'monoclonal antibody', 'MAB'], ['cellular response', 't-cell']),
    'cellular':     (['OR', 'cellular response', 't-cell'], ['antibody', 'humoral']),
    'death':        (['OR', 'death', 'necrosis', 'apoptosis', 'mortal', 'fatal'], []),
    'sequel':       (['OR', 'sequel', 'disorder'], []),
    'severe':       (['OR', 'severe'], []),
    'moderate':     (['OR', 'moderate', 'mild'], []),
    'therapy':      (['OR', 'therapy', 'treatment'], ['ivermectin', 'chloroquine']),
    'bad_therapy':  (['OR', 'ivermectin', 'chloroquine'], []),
    # Fix: the original list was ['biomarker'] with no leading connective, unlike
    # every other entry, so 'biomarker' sat in the connective position.
    'biomarker':    (['OR', 'biomarker'], []),
    'omics':        (['OR', 'signature', 'transcriptom', 'proteom', 'epigenom'], []),
}

dic_choice = {}

for choice in choices:
    if choice not in _CHOICE_TERMS:
        print("Which choice???", choice)
        continue

    terms2, extra_not = _CHOICE_TERMS[choice]

    # Every entry gets its own list objects, so changing one topic's terms can
    # never leak into another topic or into the shared templates above.
    dic_choice[choice] = {
        'terms1': list(terms1),
        'terms2': list(terms2),
        'terms_not': terms_not + extra_not,
    }

In [None]:
# Topics that received a term set.
# Fix: np.array(dic_choice.keys()) wraps the dict_keys view in a 0-dimensional
# object array; materialise the keys first to get a 1-D array of names.
np.array(list(dic_choice))

In [None]:
# Display the publication-date window currently in use.
inidate, enddate

In [None]:
# Flags for the (currently disabled) run_choice_concept_comparisons call below.
# NOTE(review): these rebind `test`, `force` and `verbose` set by earlier cells.
test=True
save_file=True
force=True
verbose=False

# NOTE(review): the call is disabled by wrapping it in a string literal. As the
# last expression of the cell the string is echoed as output, and `df_all` is
# not assigned here, so the cells that follow need `df_all` from somewhere else.
'''
old ...

df_all = pub.run_choice_concept_comparisons(dic_choice, pathway_concept_name_list, pathway_concept_list,
                                            connective, inidate=inidate, enddate=enddate,
                                            test=test, save_file=save_file, force=force, verbose=verbose)
print("-------------- end --------------")
len(df_all)
'''

### Development & tests

In [None]:
# Column names of the combined search result.
# NOTE(review): no enabled cell above assigns `df_all` (the only assignment is
# inside a disabled string literal), so this fails on a fresh kernel.
df_all.columns

In [None]:
# Keep only these columns, in this order.
cols = [
    'concept', 'choice', 'pmid', 'pub_date', 'title',
    'keywords', 'abstract', 'abreviation', 'authors',
    'created_date', 'doc_type', 'docid',
    'journalTitle', 'language', 'cases', 'case_comparisons',
    'terms', 'dates',
]
df_all = df_all.loc[:, cols]

In [None]:
# Collect and display every PMID that the project helper `isint` rejects.
listax = list(filter(lambda value: not isint(value), df_all.pmid))
listax

In [None]:
# Save the combined table as 'df_all.tsv' with the project helper `pdwritecsv`.
pdwritecsv(df_all, 'df_all.tsv')

In [None]:
# Read the table back with the project helper `pdreadcsv` and report its row
# count (round-trip check).
df3 = pdreadcsv('df_all.tsv')
print(len(df3))

In [None]:
# Store the re-read PMIDs as integers.
df3['pmid'] = df3['pmid'].astype(int)

In [None]:
# Drop rows without a PMID, store PMIDs as integers and order by PMID.
# Fixes: the NaN filter was written twice (the second line was a no-op), and
# `df_all.pmid = ...` assigned into a filtered slice; `.assign` builds the new
# column on a fresh frame instead.
df_all = (
    df_all[df_all.pmid.notna()]
    .assign(pmid=lambda frame: frame.pmid.astype(int))
    .sort_values('pmid')
)

len(df_all)

In [None]:
# Sanity check: print every PMID (and its type) that is not a plain int.
for pmid in df_all.pmid:
    if isinstance(pmid, int):
        continue
    print(pmid, type(pmid))

In [None]:
def _as_term_list(value):
    """Return a `choice` / `concept` cell as a list of terms.

    A string that starts with '[' is parsed as a list literal, any other string
    becomes a one-element list, list-like values are copied, and anything else
    (None, NaN) becomes an empty list.
    """
    import ast  # local import keeps this cell self-contained

    if isinstance(value, str):
        if value.startswith('['):
            # Security/robustness: ast.literal_eval only accepts Python
            # literals, whereas the previous eval() would execute arbitrary
            # code found in the cell text.
            return list(ast.literal_eval(value))
        return [value]
    if isinstance(value, (list, tuple, set, np.ndarray)):
        return list(value)
    return []


# Collapse `df_all` to one row per PMID: the first row of each PMID is kept and
# its `choice` / `concept` become the ';'-joined sorted unique terms of all rows
# that share the PMID.
# Changes from the row-by-row loop this replaces:
#   * PMIDs with a single row are normalised too (they used to keep their raw cell);
#   * non-string cells no longer break the accumulation;
#   * grouping no longer relies on `df_all` having been sorted by an earlier cell.
# Rows with a missing PMID are skipped by groupby; an earlier cell drops them anyway.
merged_rows = []
for _, same_pmid in df_all.groupby('pmid', sort=True):
    merged = same_pmid.iloc[0].copy()

    for col in ('choice', 'concept'):
        terms = [term for cell in same_pmid[col] for term in _as_term_list(cell)]
        merged[col] = ";".join(np.unique(terms))

    # As before: when several rows were merged and a `concept_list` column
    # exists, it is blanked on the merged row.
    if len(same_pmid) > 1 and 'concept_list' in merged.index:
        merged['concept_list'] = None

    merged_rows.append(merged)

# Build the frame once from the collected rows.
dfa = pd.DataFrame(merged_rows, columns=df_all.columns)

dfa = dfa.sort_values(['concept', 'choice', 'pub_date', 'pmid'], ascending=[True, True, False, False])
print("Total found: %d articles"%(len(dfa)))
fname = pub.fname_nosymb0%(pub.prefix, pub.s_abstract)
fname = title_replace(fname)
ret = pdwritecsv(dfa, fname, pub.root_pubmed, verbose=True)


In [None]:
# Hand-made term groups used to exercise pub.build_query; each non-empty group
# starts with its connective. The value returned by the call is displayed.
term_genes = []

terms = [
    ['AND'],
    [],
    ['AND', 'AAAA', 'BBBBB'],
    ['OR', 'CCCC', 'CCCCS'],
    ['AND', 'VVVV', 'UUUU'],
    ['AND NOT ', 'CANCER', 'DIABETIS'],
]

pub.terms = terms
pub.term_genes = term_genes

pub.build_query(terms, term_genes, verbose=False)

In [None]:
# Older search flow: run one comparison per topic and collect the result frames.
verbose_query=False; force_query = False

# Fix: the accumulator was never initialised, so the first `append` below
# raised NameError.
df_all_list = []

for choice in choices:
    dic2 = dic_choice[choice]

    # NOTE(review): these rebind the notebook-level `terms1` / `terms_not`
    # that the dic_choice cell uses as templates.
    terms1 = dic2['terms1']
    terms2 = dic2['terms2']
    terms_not = dic2['terms_not']

    print(choice)
    print('\t', terms1, terms2, terms_not,'\n')

    #----------------------------------------------------------
    # NOTE(review): this call uses a different argument list from the
    # Pubmed(bpx, gem, email, ...) call earlier in the notebook, and
    # `root_biobert` is not defined in any visible cell - confirm this cell
    # still matches the current Pubmed class before running it.
    pub = Pubmed(email, prefix, root0, sleep_entrez=sleep_entrez,
                 force_query=force_query, verbose_query=verbose_query, retmax=retmax)
    pub.biobert_init(root_biobert = root_biobert)


    df_choi = pub.run_big_comparisons(bpx, choice, pathway_concept_name_list, pathway_concept_list,
                                      connective=connective, terms1=terms1, terms2=terms2,
                                      terms_not=terms_not, remove_synonyms=remove_synonyms,
                                      inidate=inidate, enddate=enddate,
                                      only_title_abstract=True, text_quote='', save_file=True,
                                      force=force, verbose=verbose)

    df_all_list.append(df_choi)


print("\n--------------- final end -------------------\n")


In [None]:
# Concatenate the per-topic results and merge consecutive rows that share a
# PMID into one row whose `choice` / `concept` hold the accumulated lists.
df_all = pd.concat(df_all_list)
df_all = df_all.sort_values('pmid')   # equal PMIDs must be adjacent for the loop below
print(len(df_all))

previous_pmid = ''
new_row = None
df_list = []

for i in range(len(df_all)):
    row = df_all.iloc[i].copy()
    pmid = row.pmid

    if previous_pmid != pmid:
        # A new PMID starts: flush the row accumulated so far.
        if new_row is not None:
            df_list.append(pd.DataFrame(new_row).T)

        previous_pmid = pmid
        new_row = row
        choice  = [] if row.choice  is None or row.choice  == [] else [row.choice]
        concept = [] if row.concept is None or row.concept == '' else [row.concept]
    else:
        # Same PMID as the previous row: fold its values into the accumulators.
        choice2  = [] if row.choice  is None or row.choice  == [] else [row.choice]
        concept2 = [] if row.concept is None or row.concept == '' else [row.concept]

        if choice == []:
            choice = choice2
        elif choice2 == []:
            pass
        else:
            choice += choice2

        if concept == concept2 or concept2 == []:
            pass
        elif concept == []:
            concept = concept2
        else:
            # Fix: this line used to read `concept == concept2`, a comparison
            # whose result was discarded, so concepts from later rows were
            # silently dropped. Accumulate them the same way as `choice`.
            concept += concept2

        new_row.choice = choice
        new_row.concept = concept
        new_row.concept_list = None


# Flush the last accumulated row.
df_list.append(pd.DataFrame(new_row).T)
print(len(df_list))

In [None]:
# Stack the merged single-row frames into one table, report its size and show
# the first rows.
dfa = pd.concat(df_list)
print(len(dfa))
dfa.head()

In [None]:
# Turn the accumulated term collections into ';'-joined strings of sorted
# unique terms.
dfa['concept'] = [";".join(np.unique(terms).tolist()) for terms in dfa['concept']]
dfa['choice']  = [";".join(np.unique(terms).tolist()) for terms in dfa['choice']]

# Order by concept and choice, newest publication first, then write the summary.
dfa = dfa.sort_values(
    by=['concept', 'choice', 'pub_date', 'pmid'],
    ascending=[True, True, False, False],
)
print("Total found: %d articles"%(len(dfa)))
fname = "pubmed_summ_%s_no_symbol_%s.tsv"%(pub.prefix, pub.s_abstract)
ret = pdwritecsv(dfa, fname, pub.root_pubmed, verbose=True)