In [1]:
from platform import python_version
# Print the version of the Python interpreter running this kernel.
print(python_version())

3.12.0


## Gemini API

https://ai.google.dev/gemini-api/docs

#### API key - Free of charge

https://aistudio.google.com/app/apikey

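`<YOUR_API_KEY>` (never paste a real key into the notebook; keep it in the local configuration file or an environment variable, and revoke any key that has been published)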

### Google Enable API

  - You are about to enable 'Generative Language API'.

https://ai.google.dev/gemini-api/docs/oauth

### Google Python projects

#### Gemini API Text Implementation

https://github.com/RepellentSpy/Gemini-API-Text-Implementation/tree/main

#### gemini-api 0.1.6

https://pypi.org/project/gemini-api/


#### Gemini-API

https://github.com/dsdanielpark/Gemini-API

## LLM - Large Language Model

### Gemini flash

gemini-1.5-flash-latest

In [2]:
import torch
# Report whether CUDA is available to PyTorch in this kernel.
torch.cuda.is_available()

True

In [3]:
# !nvidia-smi

In [4]:
# !lsmod | grep nvidia

In [5]:
# Standard library
import os, sys, pickle

# Numerical / tabular stack and pandas display options
import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
import yaml

# Plotting
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Make the modules under ../src/ importable from this notebook.
sys.path.insert(1, '../src/')

# NOTE(review): star imports hide where names come from. Config, Biopax, Gemini and
# Pubmed used in later cells presumably originate in these modules — confirm and
# consider importing them explicitly.
from Basic import *
from entrez_conversion import *
from pubmed_lib import *
from biopax_lib import *
from gemini_lib import *

import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
# Set the notebook's max-width CSS variable to 100%.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Contact e-mail handed to the Pubmed helper further down.
email = "flalix@gmail.com"

# !pip3 install pyyaml
# Load the project configuration; safe_load only builds plain Python objects.
with open('config_taubate.yml', 'r') as file:
    dic_yml = yaml.safe_load(file)

# Echo the configuration for reference, masking the API key so the credential
# never ends up in the saved notebook output. dic_yml itself is left untouched.
print({key: ('<REDACTED>' if key == 'API_KEY' else value) for key, value in dic_yml.items()})

{'root_chibe': '../../chibe/', 'root_colab': '../../colaboracoes/', 'root0': '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/', 'project': 'Taubate COVID-19', 's_project': 'taubate_covid19', 'gene_protein': 'protein', 's_omics': 'proteomics', 'has_age': True, 'has_gender': True, 'want_normalized': False, 'abs_lfc_cutoff_inf': 0.4, 's_pathw_enrichm_method': 'enricher', 'num_min_degs_for_ptw_enr': 3, 'tolerance_pathway_index': 0.15, 'case_list': ['g2a_male', 'g2a_female', 'g2b_male', 'g2b_female', 'g3_male_adult', 'g3_male_elder', 'g3_female_adult', 'g3_female_elder'], 'pval_pathway_cutoff': 0.05, 'fdr_pathway_cutoff': 0.05, 'num_of_genes_cutoff': 3, 'API_KEY': '<REDACTED>', 'disease': 'COVID-19', 'context_disease': 'COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system.', 'n_sentences': 5, 'run_list': ['run01', 'run02'], 'chosen_model_list': [1, 3], 'i_

In [6]:
# --- Directory roots ---------------------------------------------------------
root_chibe = dic_yml['root_chibe']
root_colab = dic_yml['root_colab']
root0 = dic_yml['root0']

# --- Project identification --------------------------------------------------
project = dic_yml['project']
s_project = dic_yml['s_project']

gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

# --- Enrichment parameters ---------------------------------------------------
abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']
tolerance_pathway_index = dic_yml['tolerance_pathway_index']

case_list = dic_yml['case_list']

# --- Pathway cutoffs ---------------------------------------------------------
pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

# --- Run / model selection ---------------------------------------------------
run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# exp_normalization is None when normalization is not requested;
# normalization is the matching label ('not_normalized' in that case).
exp_normalization='quantile_norm' if want_normalized else None
normalization='not_normalized' if exp_normalization is None else exp_normalization

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholder values; n_degs is overwritten by the lookup on the next line.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1
# Look up the cutoffs for the normalization actually selected above instead of a
# hard-coded 'not_normalized', so the lookup follows the want_normalized flag.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, normalization)

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

G/P LFC cutoffs: lfc=1.000; fdr=0.050
Pathway cutoffs: pval=0.050; fdr=0.050; num of genes=3


In [7]:
# Build the Biopax helper from the configuration values loaded above.
bpx = Biopax(
    gene_protein,
    s_omics,
    project,
    s_project,
    root0,
    case_list,
    has_age,
    has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
)

# Case analysed in this session: the sixth entry of case_list.
case = case_list[5]

# Register default cutoffs for the chosen normalization, then open the case.
bpx.cfg.set_default_best_lfc_cutoff(
    normalization,
    abs_lfc_cutoff=1,
    fdr_lfc_cutoff=0.05,
)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

print("\nEcho Parameters:")
bpx.echo_parameters()

Start opening tables ....
Building synonym dictionary ...


Echo Parameters:
For case 'g3_male_elder', there are 140/140 DAPs/DAPs with ensembl_id
DAP's cutoffs: abs(LFC)=0.600; FDR=0.400
	140/140 DAPs/ensembl.
		Up 40/40 DAPs/ensembl.
		Dw 100/100 DAPs/ensembl.

Found 60 (best=60) pathways for geneset num=0 'Reactome_2022'
Pathway cutoffs p-value=0.050 fdr=0.050 min genes=3
DAPs found in enriched pathways:
	There are 140 DAPs found in pathways
	105 (best=105) DAPs in pathways and 35/35 DAPs/ensembl not in pathways

	34 DAPs ensembl Up in pathways
	6 DAPs Up ensembl not in pathways

	71 DAPs ensembl Dw in pathways
	29 DAPs Dw ensembl not in pathways


In [8]:
# Unpack the two values returned by bpx.set_enrichment_name() and display them.
fname, fname_cutoff=bpx.set_enrichment_name()
fname, fname_cutoff

('enricher_Reactome_2022_taubate_covid19_proteomics_for_g3_male_elder_x_ctrl_not_normalized_cutoff_lfc_0.600_fdr_0.400.tsv',
 'enricher_Reactome_2022_taubate_covid19_proteomics_for_g3_male_elder_x_ctrl_not_normalized_cutoff_lfc_0.600_fdr_0.400_pathway_pval_0.050_fdr_0.050_num_genes_3.tsv')

### Run all, after finding a case
  - Please open the Reactome table (`dfr`) before running this section.

In [9]:
# Number of rows in bpx.df_enr.
Nenr=len(bpx.df_enr)
Nenr

60

In [10]:
# Preview the first three rows of bpx.df_enr.
bpx.df_enr.head(3)

Unnamed: 0,pathway,pathway_id,pval,fdr,odds_ratio,combined_score,genes,num_of_genes
0,Platelet Degranulation,R-HSA-114608,4.890639e-33,2.239913e-30,48.18259,3584.687599,"['LGALS3BP', 'SERPINA3', 'ECM1', 'ORM1', 'ITIH3', 'SERPINA1', 'F13A1', 'LOC1...",27
1,Response To Elevated Platelet Cytosolic Ca2+,R-HSA-76005,1.555951e-32,3.563128e-30,45.83203,3356.767042,"['LGALS3BP', 'SERPINA3', 'ECM1', 'ORM1', 'ITIH3', 'SERPINA1', 'F13A1', 'LOC1...",27
2,"Platelet Activation, Signaling And Aggregation",R-HSA-76002,1.140647e-25,1.741388e-23,21.719027,1247.389535,"['LGALS3BP', 'SERPINA3', 'ITIH3', 'ECM1', 'ORM1', 'SERPINA1', 'F13A1', 'LOC1...",28


In [11]:
# Keep a handle on bpx.df_enr0 and show its row count.
df_enr0=bpx.df_enr0
len(df_enr0)

458

### Instantiating Gemini

In [12]:
##################
# Manual settings for this session.
# NOTE: i_dfp_list assigned here replaces the value read from dic_yml['i_dfp_list'].
is_seldata=True
i_dfp=0
i_dfp_list=[0]
##################

In [13]:
# Pull the Gemini-related settings out of the configuration in one pass.
API_KEY, disease, context_disease, n_sentences, chosen_model_sampling = (
    dic_yml[key]
    for key in ('API_KEY', 'disease', 'context_disease', 'n_sentences', 'chosen_model_sampling')
)

# Create the Gemini helper bound to the Biopax object opened earlier.
gem = Gemini(
    bpx=bpx,
    is_seldata=is_seldata,
    disease=disease,
    context_disease=context_disease,
    API_KEY=API_KEY,
    n_sentences=n_sentences,
    root0=root0,
    chosen_model_list=chosen_model_list,
    i_dfp_list=i_dfp_list,
    chosen_model_sampling=chosen_model_sampling,
)

# Echo the settings the helper actually holds.
print("\n")
print(gem.disease, gem.is_seldata, gem.i_dfp_list, gem.chosen_model_list)
print("Context:", context_disease)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.




COVID-19 True [0] [1, 3]
Context: COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system.


In [14]:
# Hand the current case and the two enrichment tables held by bpx to gem.
gem.set_case(bpx.case, bpx.df_enr, bpx.df_enr0)

In [15]:
# Search-term settings
terms1_param = dic_yml['terms1_param']
terms2_param = dic_yml['terms2_param']
terms_not_param = dic_yml['terms_not_param']
connective_param = dic_yml['connective_param']
remove_synonym_list = dic_yml['remove_synonym_list']

# Date window and query behaviour
inidate = dic_yml['inidate']
enddate = dic_yml['enddate']
verbose_query = dic_yml['verbose_query']
force_query = dic_yml['force_query']
sleep_entrez = dic_yml['sleep_entrez']
retmax = dic_yml['retmax']

# Full-text retrieval settings
try_all_text = dic_yml['try_all_text']
text_quote = dic_yml['text_quote']
dec_ncpus = dic_yml['dec_ncpus']
sleep_TIKA = dic_yml['sleep_TIKA']
min_words_text = dic_yml['min_words_text']

# File-name prefix passed to Pubmed: the short project name.
prefix = s_project

pub = Pubmed(
    bpx,
    gem,
    email,
    prefix,
    root0=root0,
    inidate=inidate,
    enddate=enddate,
    terms1_param=terms1_param,
    terms2_param=terms2_param,
    terms_not_param=terms_not_param,
    connective_param=connective_param,
    remove_synonym_list=remove_synonym_list,
    sleep_entrez=sleep_entrez,
    retmax=retmax,
    try_all_text=try_all_text,
    text_quote=text_quote,
    root_colab=root_colab,
    dec_ncpus=dec_ncpus,
    sleep_TIKA=sleep_TIKA,
    min_words_text=min_words_text,
)

pub.is_seldata

Start opening tables ....
Building synonym dictionary ...

File ../src/down_pdf_pmid.sh exists True


True

### Settings: selected data

In [16]:
# Display the model list currently in use.
chosen_model_list

[1, 3]

In [17]:
# Run flags
test = False
force = False
verbose = False

# Model selection
chosen_model = 1
gemini_model = 'gemini-1.5-pro'

# Restrict the analysis to two selected cases.
# NOTE(review): case_list is rebound to this two-element list, so the earlier cell
# that reads case_list[5] cannot be re-run after this one — confirm this is intended.
case_sel_list = ['g3_male_adult', 'g3_female_elder']
case_list = case_sel_list
case_sel0, case_sel1 = case_sel_list

with_gender = True

query_type = 'strong'
N = 30

############## for selected ###############
pub.gem.chosen_model_list = chosen_model_list
#############################

# Work on the first selected case.
case = case_sel0

print("")
dfsel = pub.gem.open_yes_no_sampling(
    case=case,
    N=N,
    query_type=query_type,
    verbose=True,
)
print("")
dfsel.head(3)


Table opened ((30, 3)) at '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/gemini/sampling_30_regs_yes_no_case_g3_male_adult_model_gemini-1.5-flash_query_type_strong.tsv'



Unnamed: 0,pathway_id,pathway,fdr
0,R-HSA-8953854,Metabolism Of RNA,0.906
1,R-HSA-194068,Bile Acid And Bile Salt Metabolism,0.255
2,R-HSA-9029569,NR1H3 And NR1H2 Regulate Gene Expression Linked To Cholesterol Transport And...,0.000128


In [18]:
# Display the current session settings side by side as a sanity check.
pub.gem.is_seldata, pub.is_seldata,run_list, chosen_model_list, chosen_model, i_dfp_list, pub.gem.n_sentences, pub.gem.chosen_model_sampling

(True, True, ['run01', 'run02'], [1, 3], 3, [0], 5, 3)

In [19]:
# Load the Reactome table through gem.reactome, print its length and preview the last three rows.
dfr=gem.reactome.open_reactome(verbose=True)
print(len(dfr))
dfr.tail(3)

Table opened ((2673, 2)) at '../../colaboracoes/reactome/data/ReactomePathways_hsa.tsv'
2673


Unnamed: 0,pathway_id,pathway
2670,R-HSA-199992,trans-Golgi Network Vesicle Budding
2671,R-HSA-192814,vRNA Synthesis
2672,R-HSA-192905,vRNP Assembly


In [20]:
# Look up a single Reactome pathway by its identifier.
pathway_id = 'R-HSA-71406'
df2 = dfr.loc[dfr['pathway_id'] == pathway_id]
df2

Unnamed: 0,pathway_id,pathway
1780,R-HSA-71406,Pyruvate metabolism and Citric Acid (TCA) cycle


### Reactome abstracts

In [21]:
i=0

# Take row i of gem.dfr and pass its abstract through gem.prepare_abstract_n_sentences.
row=gem.dfr.iloc[i]
pathway=row.pathway
ptw_abst=row.abstract
ptw_abst2=gem.prepare_abstract_n_sentences(ptw_abst)
ptw_abst2

'cdc25A protein is degraded by the ubiquitin-proteasome machinery in both terminally differentiating and cycling cells (Bernardi et al. 2000).'

### Starting run all SELECTED PATHWAYS - save in pubmed directory

In [22]:
# Display the settings that drive the batch run below.
pub.is_seldata, run_list, case_sel_list, i_dfp_list, chosen_model_sampling

(True, ['run01', 'run02'], ['g3_male_adult', 'g3_female_elder'], [0], 3)

In [30]:
%%time

# Batch settings
force, verbose = False, False
num_tries, pause_secs = 3, 0

chosen_model = 1

# Run every selected case once per configured run.
for run in run_list:
    print(">>>", run)
    for case in case_sel_list:
        print(f'\t{case} model {chosen_model}', end=' ')
        gem.run_all_selected_gemini(
            run=run,
            case=case,
            i_dfp_list=i_dfp_list,
            chosen_model=chosen_model,
            N=N,
            query_type=query_type,
            num_tries=num_tries,
            pause_secs=pause_secs,
            force=force,
            verbose=verbose,
        )
        print("")
    print("\n====================================\n")

>>> run01
	g3_male_adult model 1 >> Gemini model: gemini-1.5-pro
	$$$ iq=0 simple 		dfp 0 # 30
	$$$ iq=1 simple+pubmed 		dfp 0 # 30
	$$$ iq=2 disease 		dfp 0 # 30
	$$$ iq=3 disease+pubmed 		dfp 0 # 30

-------------- final end --------------


	g3_female_elder model 1 >> Gemini model: gemini-1.5-pro
	$$$ iq=0 simple 		dfp 0 # 30
	$$$ iq=1 simple+pubmed 		dfp 0 # 30
	$$$ iq=2 disease 		dfp 0 # 30
	$$$ iq=3 disease+pubmed 		dfp 0 # 30

-------------- final end --------------




>>> run02
	g3_male_adult model 1 >> Gemini model: gemini-1.5-pro
	$$$ iq=0 simple 		dfp 0 # 30
	$$$ iq=1 simple+pubmed 		dfp 0 # 30
	$$$ iq=2 disease 		dfp 0 # 30
	$$$ iq=3 disease+pubmed 		dfp 0 # 30

-------------- final end --------------


	g3_female_elder model 1 >> Gemini model: gemini-1.5-pro
	$$$ iq=0 simple 		dfp 0 # 30
	$$$ iq=1 simple+pubmed 		dfp 0 # 30
	$$$ iq=2 disease 		dfp 0 # 30
	$$$ iq=3 disease+pubmed 		dfp 0 # 30

-------------- final end --------------




CPU times: user 1.19 s, sys: 84.2 ms

In [31]:
verbose = True

run = 'run01'
case = case_sel_list[0]
# NOTE(review): '_strong' here differs from the 'strong' value set in an earlier cell — confirm which form this call expects.
query_type = '_strong'
want_pubmed = True

dfgem = gem.get_gemini_results_by_case_model_semantics(
    run=run,
    case=case,
    chosen_model=chosen_model,
    i_dfp_list=i_dfp_list,
    want_pubmed=want_pubmed,
    query_type=query_type,
    verbose=verbose,
)
# Fall back to an empty frame so the length and preview below still work.
dfgem = pd.DataFrame() if dfgem is None else dfgem

print(len(dfgem))
dfgem.head(3)

Table opened ((30, 11)) at '../../colaboracoes/covid/sonia_andrade/taubate/proteomics_202205/pubgem/gemini/run01/gemini_search_for_COVID-19_g3_male_adult_question_with_PubMed_yes_no_possible_low_evidence_question_strong_relationship_in_studies_of_modulated_pathways_0_selected_model_gemini-1.5-pro_selected.tsv'
30


Unnamed: 0,run,case,pathway_id,pathway,i_dfp,fdr,curation,response_explain,score_explain
0,run01,g3_male_adult,R-HSA-114608,Platelet Degranulation,0,6.98e-10,Yes,Yes; Several studies in PubMed demonstrate a strong relationship between pla...,0.989471
1,run01,g3_male_adult,R-HSA-8964058,HDL Remodeling,0,8.93e-07,Low,Low evidence.\n\nWhile HDL is known to play a role in lipid metabolism and i...,-0.990575
2,run01,g3_male_adult,R-HSA-6798695,Neutrophil Degranulation,0,9.76e-06,Yes,"Yes; Neutrophil degranulation is strongly implicated in severe COVID-19, par...",0.903832
