## Gemini API

https://ai.google.dev/gemini-api/docs

#### API key - Free of charge

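https://aistudio.google.com/app/apikey

**Do not paste the key into this notebook.** Store it under `API_KEY` in `params.yml` and keep that file out of version control. Any key that was previously written here should be treated as leaked: revoke it in AI Studio and generate a new one.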

### Google Enable API

  - You are about to enable 'Generative Language API'.

https://ai.google.dev/gemini-api/docs/oauth

### Google Python projects

#### Gemini API Text Implementation

https://github.com/RepellentSpy/Gemini-API-Text-Implementation/tree/main

#### gemini-api 0.1.6

https://pypi.org/project/gemini-api/


#### Gemini-API

https://github.com/dsdanielpark/Gemini-API

## LLM - Large Language Model

### Gemini flash

gemini-1.5-flash-latest

In [3]:
# Record the interpreter version this notebook was executed with.
from platform import python_version
print(python_version())

3.11.11


In [4]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Put the relative '../src/' directory on the import path (at position 1) before
# importing the modules below -- presumably that is where they live; confirm.
sys.path.insert(1, '../src/')

# NOTE(review): star imports hide where names come from. Later cells use
# Config, Biopax and Gemini without any explicit import -- presumably they are
# provided by these three modules; confirm and prefer explicit imports.
from Basic import *
from biopax_lib import *
from gemini_lib import *

import warnings
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
# Inject a CSS override of the notebook's maximum width (set to 100%).
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Load the run configuration into `dic_yml`. Requires PyYAML (pip install pyyaml).
# The encoding is pinned to UTF-8: without it open() decodes with the platform's
# locale encoding, which can corrupt or reject non-ASCII text in the YAML file.
# safe_load() only builds plain Python objects, so the file cannot execute code.
with open('params.yml', 'r', encoding='utf-8') as file:
    dic_yml=yaml.safe_load(file)

In [5]:
# ---- Settings taken from params.yml (dic_yml) --------------------------------

# Project identification
root0, email = dic_yml['root0'], dic_yml['email']
project, s_project = dic_yml['project'], dic_yml['s_project']

# Omics description and available covariates
gene_protein, s_omics = dic_yml['gene_protein'], dic_yml['s_omics']
has_age, has_gender = dic_yml['has_age'], dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

# Enrichment settings
abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

# Pathway-index settings
tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

case_list = dic_yml['case_list']

# Pathway cutoffs
pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

# Gemini run settings
run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# ---- Derived values ----------------------------------------------------------

# exp_normalization is None when no normalization is requested;
# `normalization` is the always-a-string label for the same choice.
if want_normalized:
    exp_normalization = 'quantile_norm'
else:
    exp_normalization = None
normalization = exp_normalization if exp_normalization is not None else 'not_normalized'

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholder counters; n_degs is overwritten by the call just below.
n_genes_annot_ptw = n_degs = n_degs_in_ptw = n_degs_not_in_ptw = degs_in_all_ratio = -1

# NOTE(review): the literal 'not_normalized' is passed here instead of the
# `normalization` label computed above -- confirm this is intentional when
# want_normalized is true.
(abs_lfc_cutoff, fdr_lfc_cutoff,
 n_degs, n_degs_up, n_degs_dw) = cfg.get_best_lfc_cutoff(case, 'not_normalized')

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

G/P LFC cutoffs: lfc=1.000; fdr=0.050
Pathway cutoffs: pval=0.050; fdr=0.050; num of genes=3


In [6]:
# Keyword settings for the Biopax object, gathered in one place.
biopax_options = dict(
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

bpx = Biopax(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, **biopax_options)

# Work on the first configured case.
case = case_list[0]

# Register default cutoffs (abs LFC = 1, FDR = 0.05) for the current
# normalization label, then open the case.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

print("\nEcho Parameters:")
bpx.echo_parameters()

geneset_num = bpx.geneset_num

Start opening tables ....
Building synonym dictionary ...


Echo Parameters:
For case 'g2a_male', there are 42/42 DAPs/DAPs with ensembl_id
DAP's cutoffs: abs(LFC)=0.900; FDR=0.150
	42/42 DAPs/ensembl.
		Up 26/26 DAPs/ensembl.
		Dw 16/16 DAPs/ensembl.

Found 45 (best=45) pathways for geneset num=0 'Reactome_2022'
Pathway cutoffs p-value=0.050 fdr=0.050 min genes=3
DAPs found in enriched pathways:
	There are 42 DAPs found in pathways
	27 (best=27) DAPs in pathways and 15/15 DAPs/ensembl not in pathways

	20 DAPs ensembl Up in pathways
	6 DAPs Up ensembl not in pathways

	7 DAPs ensembl Dw in pathways
	9 DAPs Dw ensembl not in pathways


In [7]:
# Get the two enrichment file names from bpx and display them; in the output
# shown, the second one additionally carries the pathway cutoffs in its name.
fname, fname_cutoff=bpx.set_enrichment_name()
fname, fname_cutoff

('enricher_Reactome_2022_taubate_covid19_proteomics_for_g2a_male_x_ctrl_not_normalized_cutoff_lfc_0.900_fdr_0.150.tsv',
 'enricher_Reactome_2022_taubate_covid19_proteomics_for_g2a_male_x_ctrl_not_normalized_cutoff_lfc_0.900_fdr_0.150_pathway_pval_0.050_fdr_0.050_num_genes_3.tsv')

### Run all, after finding a case
  - Open the Reactome table (`dfr`) before running the cells below.

In [8]:
# Number of rows in the enrichment table held by bpx.
Nenr=len(bpx.df_enr)
Nenr

45

In [9]:
# Preview the first three rows of the enrichment table.
bpx.df_enr.head(3)

Unnamed: 0,pathway,pathway_id,pval,fdr,odds_ratio,combined_score,genes,num_of_genes
0,Platelet Degranulation,R-HSA-114608,1.952e-17,3.425e-15,70.248,2702.787,"['FGB', 'FGA', 'ORM1', 'CLEC3B', 'VWF', 'AHSG', 'FGG', 'SERPING1', 'FLNA', '...",12
1,Response To Elevated Platelet Cytosolic Ca2+,R-HSA-76005,3.172e-17,3.425e-15,67.254,2554.97,"['FGB', 'FGA', 'ORM1', 'CLEC3B', 'VWF', 'AHSG', 'FGG', 'SERPING1', 'FLNA', '...",12
2,"Platelet Activation, Signaling And Aggregation",R-HSA-76002,7.631e-17,5.494e-15,41.079,1524.518,"['FGB', 'FGA', 'ORM1', 'VWF', 'AHSG', 'FGG', 'PPBP', 'GP5', 'YWHAZ', 'CLEC3B...",14


In [10]:
# Second enrichment table kept by bpx; it has more rows than bpx.df_enr --
# presumably the table before the pathway cutoffs are applied (TODO confirm).
df_enr0=bpx.df_enr0
len(df_enr0)

216

### Ensemble: is_seldata=False

In [11]:
#####################
# Flag forwarded to the Gemini(...) constructor as `is_seldata`.
is_seldata=False
#####################

In [12]:
# Gemini-related settings from params.yml. The key is read from the config file
# and is deliberately never printed.
API_KEY = dic_yml['API_KEY']

disease, context_disease = dic_yml['disease'], dic_yml['context_disease']
n_sentences = dic_yml['n_sentences']
chosen_model_sampling = dic_yml['chosen_model_sampling']

# Constructor arguments, all passed by keyword.
gemini_options = dict(
    bpx=bpx,
    is_seldata=is_seldata,
    disease=disease,
    context_disease=context_disease,
    API_KEY=API_KEY,
    n_sentences=n_sentences,
    root0=root0,
    chosen_model_list=chosen_model_list,
    i_dfp_list=i_dfp_list,
    chosen_model_sampling=chosen_model_sampling,
)
gem = Gemini(**gemini_options)

# Echo the settings the object ended up with.
print("\n")
print(gem.disease, gem.is_seldata, gem.i_dfp_list, gem.chosen_model_list)
print("Context:", context_disease)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0




COVID-19 False [0, 1, 2, 3] [1, 3]
Context: COVID-19 is the disease caused by the virus SARS-CoV-2. When the virus enters your body, it infects cells, primarily in the respiratory system.


In [13]:
# Pass the current case and both enrichment tables from bpx to the Gemini helper.
gem.set_case(bpx.case, bpx.df_enr, bpx.df_enr0)

In [14]:
# Open the Reactome pathway table (columns pathway_id and pathway in the output
# shown), report its length and preview the last three rows.
dfr=gem.reactome.open_reactome(verbose=True)
print(len(dfr))
dfr.tail(3)

Table opened ((2673, 2)) at '../../../data_aux/reactome/data/ReactomePathways_hsa.tsv'
2673


Unnamed: 0,pathway_id,pathway
2670,R-HSA-199992,trans-Golgi Network Vesicle Budding
2671,R-HSA-192814,vRNA Synthesis
2672,R-HSA-192905,vRNP Assembly


In [15]:
# Look up a single Reactome pathway by its identifier.
pathway_id = 'R-HSA-71406'
df2 = dfr.loc[dfr['pathway_id'] == pathway_id]
df2

Unnamed: 0,pathway_id,pathway
1780,R-HSA-71406,Pyruvate metabolism and Citric Acid (TCA) cycle


In [17]:
# Take the first row of gem.dfr and show its abstract after it has been passed
# through gem.prepare_abstract_n_sentences().
# NOTE(review): `pathway` is set from row 0 here, independently of `pathway_id`.
i = 0

row = gem.dfr.iloc[i]
pathway = row['pathway']
ptw_abst = row['abstract']

ptw_abst2 = gem.prepare_abstract_n_sentences(ptw_abst)
ptw_abst2

'cdc25A protein is degraded by the ubiquitin-proteasome machinery in both terminally differentiating and cycling cells (Bernardi et al. 2000).'

### Question type

In [18]:
# Print every question template, filled in with the current `pathway` name.
# NOTE(review): 'disease+pumed' looks like a misspelling of 'disease+pubmed'; it
# is kept as-is because gem.define_question() receives this exact string.
quest_types = ['simple', 'simple+pubmed', 'disease', 'disease+pumed']

for quest_type in quest_types:
    question, with_without_PubMed, sufix = gem.define_question(quest_type)
    details = f"{with_without_PubMed} and sufix '{sufix}'"
    print(quest_type)
    print(details)
    print(question % pathway, '\n\n')

simple
without_PubMed and sufix 'yes_no_possible_low_evidence_studies_of_modulated_pathways'
is the pathway 'Ubiquitin Mediated Degradation of Phosphorylated Cdc25A' studied about the disease COVID-19 for male adult mild outpatient? 


simple+pubmed
with_PubMed and sufix 'yes_no_possible_low_evidence_studies_of_modulated_pathways'
is the pathway 'Ubiquitin Mediated Degradation of Phosphorylated Cdc25A' studied in PubMed, about the disease COVID-19 for male adult mild outpatient? 


disease
without_PubMed and sufix 'yes_no_possible_low_evidence_question_strong_relationship_in_studies_of_modulated_pathways'
has the pathway 'Ubiquitin Mediated Degradation of Phosphorylated Cdc25A' a strong relationship in studies related to the disease COVID-19 for male adult mild outpatient? 


disease+pumed
with_PubMed and sufix 'yes_no_possible_low_evidence_question_strong_relationship_in_studies_of_modulated_pathways'
has the pathway 'Ubiquitin Mediated Degradation of Phosphorylated Cdc25A' a strong r

In [19]:
# Model names known to the Gemini helper; models are later selected by position in this list.
gem.gemini_models

['gemini-1.0-pro',
 'gemini-1.5-pro',
 'gemini-1.5-flash-8b',
 'gemini-1.5-flash',
 'gemini-2.0-flash-exp',
 'gemma-2-2b-it',
 'gemma-2-9b-it',
 'gemma-2-27b-it']

### gemini model

In [20]:
# Select the model by number through gem.set_gemini_num_model().
# With the model list displayed in this session, 3 yields 'gemini-1.5-flash';
# presumably the number is an index into gem.gemini_models, so it depends on
# the order of that list (TODO confirm).

chosen_model=3
gem.set_gemini_num_model(chosen_model)
gem.gemini_model

'gemini-1.5-flash'

In [21]:
import time  # needed for the retry pause; no explicit import exists in the visible notebook

# Single-pathway check: ask the selected Gemini model one question about
# `pathway_id` and leave the returned candidates in `list_candidates`.
want=True
i_try=0

# Request / retry settings.
MAX_TRIES=2          # total attempts before giving up
RETRY_PAUSE_SECS=3   # pause between two attempts
TEMPERATURE=.2
TOP_P=.2

print(gem.gemini_model)

list_candidates=[]

quest_type='simple'
print_ok=True

if want:
    # Nothing in the retry loop changes the question template or the pathway
    # lookup, so both are resolved once, up front.
    question0, with_without_PubMed, sufix=gem.define_question(quest_type)
    question_name=f'{with_without_PubMed}_{sufix}_0_default'

    # NOTE: this rebinds the notebook-level `dfr` to the row(s) matching pathway_id.
    dfr=gem.dfr[gem.dfr.pathway_id == pathway_id]
    if dfr.empty:
        # Retrying cannot help here, so the error is reported once and no request is sent.
        print(f"\nError: pathway_id {pathway_id} not found in reactome dfr.")
    else:
        pathway =dfr.iloc[0].pathway
        ptw_abst=dfr.iloc[0].abstract
        ptw_abst=gem.prepare_abstract_n_sentences(ptw_abst)

        # Drop one trailing period because the prompt below appends its own.
        # endswith() is used instead of indexing so an empty abstract does not raise.
        if ptw_abst.endswith('.'):
            ptw_abst=ptw_abst[:-1]

        s_question0=question0%(pathway)
        question=gem.prefix_question + s_question0 + f" Context: {ptw_abst}. And {gem.context_disease}"

        for i in range(MAX_TRIES):
            print(".", end='')

            list_candidates=gem.run_curl_gemini(question, temperature=TEMPERATURE, topP=TOP_P, verbose=print_ok)
            print_ok=False   # only the first attempt is verbose

            if len(list_candidates) != 0:
                break

            # Pause only when another attempt follows.
            if i < MAX_TRIES - 1:
                time.sleep(RETRY_PAUSE_SECS)

len(list_candidates)

gemini-1.5-flash
.Please, configure API_KEY in params.yml
.Please, configure API_KEY in params.yml


0

In [22]:
# Render the first candidate as Markdown; fall back to an empty string when the
# request was skipped (`want` is false) or no candidate came back.
has_candidates = want and len(list_candidates) != 0
response = gem.response_candidate(list_candidates, 0) if has_candidates else ''

Markdown(response)



### Starting run all

#### Runs

  - 0 - default cutoff
  - 1 - BCA cutoff
  - 2 - middle of the table
  - 3 - end of the table
  - 4 - other random genes

In [23]:
# Confirm which model number / model name the full run below will use.
print(chosen_model, gem.gemini_model)

3 gemini-1.5-flash


In [None]:
%%time

# Settings forwarded unchanged to gem.run_all_gemini() for every entry of run_list.
force, verbose = False, False
num_tries, pause_secs = 3, 0

print(gem.gemini_model, '\n\n')

separator = "\n====================================\n"
for run in run_list:
    gem.run_all_gemini(run=run, case_list=case_list, chosen_model=chosen_model,
                       num_tries=num_tries, pause_secs=pause_secs,
                       force=force, verbose=verbose)
    print(separator)

In [None]:
# Fetch the Gemini results for one (run, case, model, query type) combination.
verbose = True

case = case_list[3]
run = 'run01'
chosen_model = 3
query_type = '_strong'
want_pubmed = True

selection = dict(run=run, case=case, chosen_model=chosen_model,
                 i_dfp_list=i_dfp_list, want_pubmed=want_pubmed,
                 query_type=query_type, verbose=verbose)
dfgem = gem.get_gemini_results_by_case_model_semantics(**selection)

# A None result is replaced by an empty frame so the preview below still works.
dfgem = pd.DataFrame() if dfgem is None else dfgem

print(len(dfgem))
dfgem.head(3)