In [None]:
from platform import python_version
print(python_version())

## Gemini API

https://ai.google.dev/gemini-api/docs

#### API key - Free of charge

https://aistudio.google.com/app/apikey

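**Do not paste the API key into this notebook.** Keep it in `params.yml` under `API_KEY` (it is read below via `dic_yml['API_KEY']`). Any key that was previously committed here should be revoked and regenerated.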

### Google Enable API

  - You are about to enable 'Generative Language API'.

https://ai.google.dev/gemini-api/docs/oauth

### Costs & Billing

https://console.cloud.google.com/billing/01C02C-666E6E-D731B9?project=gen-lang-client-0516343733


### Google Python projects

#### Gemini API Text Implementation

https://github.com/RepellentSpy/Gemini-API-Text-Implementation/tree/main

#### gemini-api 0.1.6

https://pypi.org/project/gemini-api/


#### Gemini-API

https://github.com/dsdanielpark/Gemini-API

## LLM - Large Language Model

### Gemini flash

gemini-1.5-flash-latest

In [None]:
import os, sys, pickle
import time

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
pd.set_option("display.precision", 3)

import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# `time` and `Markdown` are used by later cells. They are imported explicitly
# here, BEFORE the star imports, so that any binding those modules export
# under the same name still takes precedence exactly as before.
from IPython.display import Markdown

sys.path.insert(1, '../src/')

from Basic import *
from biopax_lib import *
from gemini_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Requires PyYAML (`pip3 install pyyaml`).
# Every tunable of this notebook is read from params.yml into `dic_yml`.
with open('params.yml', 'r') as params_file:
    dic_yml = yaml.safe_load(params_file)

In [None]:
# --- Unpack the settings loaded from params.yml into notebook-level names ---
root0, email = dic_yml['root0'], dic_yml['email']
project, s_project = dic_yml['project'], dic_yml['s_project']
gene_protein, s_omics = dic_yml['gene_protein'], dic_yml['s_omics']
has_age, has_gender = dic_yml['has_age'], dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

# Enrichment settings
abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

# Pathway-index settings
tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

case_list = dic_yml['case_list']

# Pathway cutoffs (only echoed in this cell)
pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

# Lists that drive the Gemini runs further down
run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# `exp_normalization` is what Biopax receives; `normalization` is the label
# used when talking to the Config object.
if want_normalized:
    exp_normalization = 'quantile_norm'
    normalization = exp_normalization
else:
    exp_normalization = None
    normalization = 'not_normalized'

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders; `n_degs` is overwritten by the call right below.
n_genes_annot_ptw = n_degs = n_degs_in_ptw = n_degs_not_in_ptw = degs_in_all_ratio = -1

# NOTE(review): the label is hard-coded to 'not_normalized' here, while the
# Biopax cell uses the `normalization` variable -- confirm this is intended
# when want_normalized is True.
(abs_lfc_cutoff, fdr_lfc_cutoff,
 n_degs, n_degs_up, n_degs_dw) = cfg.get_best_lfc_cutoff(case, 'not_normalized')


print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the Biopax object from the params.yml settings, then open the first case.
bpx = Biopax(
    gene_protein, s_omics, project, s_project, root0,
    case_list, has_age, has_gender,
    clone_objects=False,
    exp_normalization=exp_normalization,
    geneset_num=0,
    num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr,
    tolerance_pathway_index=tolerance_pathway_index,
    s_pathw_enrichm_method=s_pathw_enrichm_method,
    abs_lfc_cutoff_inf=abs_lfc_cutoff_inf,
    type_sat_ptw_index=type_sat_ptw_index,
    saturation_lfc_index=saturation_lfc_index,
)

case = case_list[0]

# Register default cutoffs for this normalization label before opening the case.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)

print("\nEcho Parameters:")
bpx.echo_parameters()

geneset_num = bpx.geneset_num

In [None]:
fname, fname_cutoff = bpx.set_enrichment_name()
fname, fname_cutoff

### Run all, after finding a case
  - Please open `dfr` (the Reactome table) beforehand.

In [None]:
Nenr = len(bpx.df_enr)
Nenr

In [None]:
bpx.df_enr.head(3)

In [None]:
df_enr0 = bpx.df_enr0
len(df_enr0)

In [None]:
chosen_model_list, i_dfp_list

### Ensemble: is_seldata=False

In [None]:
##################
is_seldata=False
##################

In [None]:
# The Gemini credentials and prompt settings all come from params.yml.
API_KEY = dic_yml['API_KEY']

disease, context_disease, n_sentences = (
    dic_yml['disease'],
    dic_yml['context_disease'],
    dic_yml['n_sentences'],
)

gem = Gemini(
    bpx=bpx,
    is_seldata=is_seldata,
    disease=disease,
    context_disease=context_disease,
    n_sentences=n_sentences,
    API_KEY=API_KEY,
    root0=root0,
    chosen_model_list=chosen_model_list,
    i_dfp_list=i_dfp_list,
    chosen_model_sampling=chosen_model_sampling,
)

# Echo what the object was configured with.
print("\n")
print(gem.disease)
print("Context:", context_disease)

In [None]:
gem.chosen_model_list, gem.i_dfp_list, gem.chosen_model_sampling

In [None]:
bpx.case_list, bpx.case, len(bpx.df_enr), len(bpx.df_enr0)

In [None]:
gem.set_case(bpx.case, bpx.df_enr, bpx.df_enr0)

In [None]:
dfr = gem.reactome.open_reactome_abstract(verbose=True)
print(len(dfr))
dfr.tail(3)

In [None]:
pathway_id = 'R-HSA-192905'
df2 = dfr[dfr.pathway_id == pathway_id]
df2

In [None]:
# Take one row of the Reactome table and pass its abstract through
# gem.prepare_abstract_n_sentences(); the result is displayed.
i = 0

row = dfr.iloc[i]
pathway, ptw_abst = row.pathway, row.abstract

ptw_abst2 = gem.prepare_abstract_n_sentences(ptw_abst)
ptw_abst2

### Question type

In [None]:
# Preview the question template that gem.define_question() produces for each
# question type, filled in with the pathway selected above.
# Fix: the last entry was misspelled 'disease+pumed'; the question-type list
# used elsewhere in this notebook spells it 'disease+pubmed'.
for quest_type in ['simple', 'simple+pubmed', 'disease', 'disease+pubmed']:
    question, with_without_PubMed, suffix = gem.define_question(quest_type)
    print(quest_type)
    print(f"{with_without_PubMed} and suffix '{suffix}'")
    # `question` carries a %s placeholder for the pathway name.
    print(question%(pathway), '\n\n')

### gemini model

In [None]:
gem.gemini_models

In [None]:
chosen_model = 0
gem.set_gemini_num_model(chosen_model)
gem.gemini_model

In [None]:
# Smoke test: ask the current Gemini model one 'simple' question about
# `pathway_id` (chosen in an earlier cell) and keep the returned candidates in
# `list_candidates`. The request is retried when no candidate comes back.
#
# Changes with respect to the previous version of this cell:
#   - the pathway lookup and the question text are built once, not per retry;
#   - the global `dfr` is no longer overwritten with the one-row selection;
#   - an empty abstract no longer raises IndexError;
#   - the pause happens only between failed attempts, not after a success;
#   - a missing pathway is reported once instead of being "retried".
want = True
max_tries = 2          # total number of attempts
retry_pause_secs = 3   # pause between a failed attempt and the next one

print(gem.gemini_model)

list_candidates = []

quest_type = 'simple'

if want:
    df_ptw = gem.dfr[gem.dfr.pathway_id == pathway_id]

    if df_ptw.empty:
        print(f"\nError: pathway_id {pathway_id} not found in reactome dfr.")
    else:
        question0, with_without_PubMed, suffix = gem.define_question(quest_type)

        pathway = df_ptw.iloc[0].pathway
        ptw_abst = gem.prepare_abstract_n_sentences(df_ptw.iloc[0].abstract)

        # The prompt appends its own full stop after the abstract.
        if ptw_abst.endswith('.'):
            ptw_abst = ptw_abst[:-1]

        # `question0` carries a %s placeholder for the pathway name.
        question = gem.prefix_question + question0%(pathway) + f" Context: {ptw_abst}. And {gem.context_disease}"

        for i_try in range(max_tries):
            print(".", end='')

            # Verbose output only on the first attempt.
            list_candidates = gem.run_curl_gemini(question, temperature=.2, topP=.2, verbose=(i_try == 0))
            if len(list_candidates) != 0:
                break

            if i_try < max_tries - 1:
                time.sleep(retry_pause_secs)

len(list_candidates)

In [None]:
# Render the first candidate answer as Markdown. An empty string is rendered
# when the test was skipped (`want` is False) or produced no candidates.
has_answer = want and len(list_candidates) != 0
response = gem.response_candidate(list_candidates, 0) if has_answer else ''

Markdown(response)

### Check all dfp

In [None]:
# Build `multiple_data`: a list of [i_dfp, question_name] pairs whose names are
# derived from what gem.define_question() returns for a question type.
#
# NOTE(review): `multiple_data` is reassigned on every iteration, so after the
# loop it only holds the names for the LAST entry of gem.question_list; the
# cells below therefore check a single question type. Confirm this is intended.
# NOTE(review): this cell reads gem.question_list while another cell of this
# notebook reads gem.question_type_list -- verify both attributes exist.
for quest_type in gem.question_list:
    #print("\t\t", quest_type)

    question0, with_without_PubMed, suffix = gem.define_question(quest_type)

    # question_name0 = f'{with_without_PubMed}_{suffix}_0_default'
    question_name1 = f'{with_without_PubMed}_{suffix}_0_first'
    question_name2 = f'{with_without_PubMed}_{suffix}_1_middle' 
    question_name3 = f'{with_without_PubMed}_{suffix}_2_final'
    question_name4 = f'{with_without_PubMed}_{suffix}_3_others'

    # The "default" entry is left out here, so i_dfp runs 0..3 in this cell.
    multiple_data  = [ [0, question_name1], [1, question_name2], 
                       [2, question_name3], [3, question_name4]]

# Keep the first pair in `i_dfp` / `question_name`; both names are reused by
# later cells.
i_dfp, question_name = multiple_data[0]
i_dfp, question_name

In [None]:
multiple_data

In [None]:
%%time

# For every model, run and case, open the stored dfp table of each
# (i_dfp, question_name) pair in `multiple_data` and print
#   <rows in dfp> / <rows in gem.bpx.df_enr> / <rows in gem.bpx.df_enr0>.
#
# NOTE: the loop variables (chosen_model, run, case, i_dfp, question_name) and
# `dfp` remain defined after this cell and are read by later cells, so this
# cell must have been executed before them.
verbose=False

for chosen_model in chosen_model_list:
    gem.set_gemini_num_model(chosen_model)
    print(">>>", gem.gemini_model)

    for run in run_list:
        print("\t###", run)
        
        for case in case_list:
            # Re-open the case so that df_enr / df_enr0 refer to `case`.
            ret, _, _, _ = gem.bpx.open_case(case)
            n0 = len(gem.bpx.df_enr)
            N = len(gem.bpx.df_enr0)
            for i_dfp, question_name in multiple_data:
                print("\t\t", run, case, i_dfp, end=' ')
                dfp = gem.open_dfp(run=run, i_dfp=i_dfp, case=case, gemini_model=gem.gemini_model, question_name=question_name, verbose=verbose)
                print(len(dfp), "/", n0, "/", N)
            print("")
    
# dfp.head(3)

In [None]:
%%time

verbose=False

for run in run_list:
    print(">>>", run, '\n')
    gem.run_again_dfp(run=run, chosen_model_list=chosen_model_list, i_dfp_list=i_dfp_list, case_list=case_list, verbose=verbose)

print("==================== end ==============")

In [None]:
# Re-open one stored dfp table and show its length.
# NOTE: `run`, `i_dfp`, `case`, `question_name` and `verbose` are whatever the
# previously executed cells left behind (loop variables), so the table opened
# here depends on which cells ran last.
dfp = gem.open_dfp(run=run, i_dfp=i_dfp, case=case, gemini_model=gem.gemini_model, question_name=question_name, verbose=verbose)
len(dfp)

### Starting run all

#### Runs

  - 0 - default cutoff
  - 1 - BCA cutoff
  - 2 - middle of the table
  - 3 - end of the table
  - 4 - other randomly picked pathways

In [None]:
%%time

# Launch gem.run_all_gemini() for a single run folder.
force = False
verbose = False
num_tries = 3
pause_secs = 0

# NOTE: `chosen_model` is not set in this cell; it keeps the value left by the
# last cell that assigned it (e.g. the final value of an earlier loop).
run = 'run01'
gem.run_all_gemini(run=run, chosen_model=chosen_model, num_tries=num_tries, pause_secs=pause_secs, force=force, verbose=verbose)

### Development & tests

In [None]:
def read_or_build_df_read(i_dfp:int, dfp:pd.DataFrame, question_name:str, verbose:bool=False) -> pd.DataFrame:
    """Return the stored Gemini table for `question_name`, or a blank template.

    Reads the notebook-level `gem` object. The file name is built from
    gem.fname_gemini_search with (gem.disease, gem.case, question_name,
    gem.gemini_model) and passed through title_replace(); the file is looked
    up under gem.root_gemini.

    Args:
        i_dfp: index of the pathway selection; 4 means the template rows come
            from gem.pick_other_pahtways() instead of `dfp`.
        dfp: pathway table providing the template rows when i_dfp != 4.
        question_name: name used to build the file name.
        verbose: forwarded to pdreadcsv().

    Returns:
        - the stored table, when the file exists and is non-empty and `dfp`
          is not None;
        - None, when the file exists and is non-empty but `dfp` is None;
        - otherwise a new table with columns pathway_id, pathway, fdr plus
          blank result columns, sorted by fdr and indexed 0..n-1. Nothing is
          written to disk.
    """
    fname = title_replace(
        gem.fname_gemini_search%(gem.disease, gem.case, question_name, gem.gemini_model)
    )
    fullname = os.path.join(gem.root_gemini, fname)

    if os.path.exists(fullname):
        df_read = pdreadcsv(fname, gem.root_gemini, verbose=verbose)
        has_rows = df_read is not None and not df_read.empty

        if has_rows:
            if dfp is None:
                return None
            print("###", len(df_read), len(dfp))
            return df_read

    # No usable stored table: start a blank template from the pathway list.
    source = gem.pick_other_pahtways() if i_dfp == 4 else dfp
    dfa = source[['pathway_id', 'pathway', 'fdr']].copy()

    blank_columns = {
        'curation': None,
        'response_explain': None,
        'score_explain': None,
        'question': None,
        'disease': gem.disease,
        'case': gem.case,
        's_case': gem.s_case,
        'pathway_found': False,
    }
    for column, value in blank_columns.items():
        dfa[column] = value

    dfa = dfa.sort_values('fdr')
    dfa.index = np.arange(len(dfa))

    return dfa
    
# Development dry run of the main Gemini loop. For every case it rebuilds the
# five pathway selections (default cutoff, BCA, middle of the table, end of
# the table, others) and reports, per question type and selection, how many
# rows still lack an answer. No Gemini request is sent from this cell.
#
# Changes with respect to the previous version of this cell:
#   - "Already calculated N regs." now reports the number of stored rows
#     (it used the empty `dfa`, so it always printed 0);
#   - cases are iterated directly instead of through an index;
#   - the repeated window-selection / empty-check code lives in two helpers.


def _take_unlisted(df_all, ini, end, listed_ids):
    """Return rows ini:end of `df_all` whose pathway_id is not in `listed_ids`."""
    df_window = df_all.iloc[ini:end].copy()
    return df_window[~df_window.pathway_id.isin(listed_ids)]


def _stop_if_empty(df, label):
    """Print a banner and raise when the selection `df` has no rows."""
    if df.empty:
        print("----------------")
        print(f"{label} is empty!!!")
        print("----------------")
        raise Exception(f'Stop {label}')


# Results live in a per-run sub-folder when `run` is a string.
gem.root_gemini = os.path.join(gem.root_gemini0, run) if isinstance(run, str) else gem.root_gemini0

gem.set_gemini_num_model(chosen_model)
print(">> Gemini model:", gem.gemini_model)

gem.dfr = gem.reactome.open_reactome_abstract(verbose=verbose)

if not gem.create_random_Reactome_list():
    # NOTE(review): the message says "Aborting" but execution continues, as
    # before; decide whether this should raise.
    print("Aborting, could not create random Reactome list.")

# ----- start main loop ------------------
for case in gem.bpx.case_list:
    print("\n\n>>> case", case)

    #------- default - normal cutoff --------------------------
    gem.bpx.open_case_params(case, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05, pathway_fdr_cutoff=0.05)
    dfp0 = gem.bpx.df_enr

    #------- BCA - best cutoff algorithm ----------------------
    gem.bpx.open_case(case)
    dfp1 = gem.bpx.df_enr
    print(">>>>>", len(dfp1))
    gem.cur_pathway_id_list = list(dfp1.pathway_id)
    df_enr0 = gem.bpx.df_enr0

    n = len(dfp1)
    N = len(df_enr0)

    gem.set_case(gem.bpx.case, gem.bpx.df_enr, gem.bpx.df_enr0)

    #-- middle window: n rows centred on the middle of df_enr0, pushed past
    #   the first n rows when it would overlap them
    ini = N // 2 - n // 2
    end = ini + n

    if ini <= n:
        ini = n + 1
        end = ini + n

    end_middle = end

    dfp2 = _take_unlisted(df_enr0, ini, end, gem.cur_pathway_id_list)
    _stop_if_empty(dfp2, 'dfp2')

    dfp2.index = np.arange(len(dfp2))
    gem.cur_pathway_id_list += list(dfp2.pathway_id)
    print(">>>>>", len(dfp2))

    #-- end window: the last n rows, starting after the middle window
    ini = N - n
    end = N

    if ini <= end_middle:
        ini = end_middle + 1

    dfp3 = _take_unlisted(df_enr0, ini, end, gem.cur_pathway_id_list)
    _stop_if_empty(dfp3, 'dfp3')

    dfp3.index = np.arange(len(dfp3))
    gem.cur_pathway_id_list += list(dfp3.pathway_id)
    print(">>>>>", len(dfp3))

    # Selection 4 is produced inside read_or_build_df_read() through
    # gem.pick_other_pahtways(), hence no frame is passed for it.
    dfp4 = None

    for quest_type in gem.question_type_list:
        print("\t$$$", quest_type)

        question0, with_without_PubMed, suffix = gem.define_question(quest_type)

        name_stem = f'{with_without_PubMed}_{suffix}'
        multiple_data = [
            [0, f'{name_stem}_0_default', dfp0],
            [1, f'{name_stem}_0_first', dfp1],
            [2, f'{name_stem}_1_middle', dfp2],
            [3, f'{name_stem}_2_final', dfp3],
            [4, f'{name_stem}_3_others', dfp4],
        ]

        for i_dfp, question_name, dfp in multiple_data:

            if i_dfp == 0 and dfp is None:
                print("No enrichment analysis for default params.")
                continue

            if i_dfp < 4 and dfp is None:
                print(f"\nError: dfp {i_dfp} - None")
                raise Exception('stop: run_question_gemini()')

            print(f"\t\tdfp {i_dfp}", end='')

            df_read = read_or_build_df_read(i_dfp=i_dfp, dfp=dfp, question_name=question_name, verbose=True)

            if df_read is None:
                print("df_read is None")
                print("")
                continue

            # Rows that still need an answer.
            pending = (
                (df_read.pathway_found == False)
                | pd.isnull(df_read.curation)
                | pd.isnull(df_read.response_explain)
            )
            dfa = df_read[pending].copy()

            if dfa.empty:
                print(f"Already calculated {len(df_read)} regs.")
                print("")
                continue

            print(len(dfa))
            print("\n\n")

In [None]:
len(gem.dfr_not)

In [None]:
# Older version of the run loop: for every case it builds the five pathway
# selections (dfp0..dfp4) and calls gem.run_question_gemini() for each
# question type.
#
# NOTE(review): this cell calls gem.set_case_covid(...),
# gem.define_question(quest_type, bpx.case) and gem.pick_other_pahtways(dfp1),
# while other cells of this notebook call gem.set_case(...),
# gem.define_question(quest_type) and gem.pick_other_pahtways() -- confirm
# which signatures the current gemini_lib provides before running it.
'''
    disease and case: already known
    question0 has a %s - to input the pathway description
'''
print(gem.gemini_model)

num_tries=5
force = False
verbose = False
n_sentences = 5

# "Explain and infere Yes, Possible, Low evidence, or No; "
prefix_list = ["Answer in the first line Yes, Possible, Low evidence, or No; and explain; ", ]
question_type_list = ['simple', 'simple+pubmed', 'disease', 'disease+pubmed']


for icase in range(len(case_list)):
    case = case_list[icase]
    print("\n\n>>> case", case)
    # default - normal cutoff
    ret, _, _, _ = bpx.open_case_params(case, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05, pathway_fdr_cutoff=0.05)
    dfp0 = bpx.df_enr
    
    # Re-open the case without explicit cutoffs; dfp1 is the enrichment table
    # of that opening and df_enr0 the table the windows below are cut from.
    ret, _, _, _ = bpx.open_case(case)
    dfp1    = bpx.df_enr
    df_enr0 = bpx.df_enr0
    
    n = len(dfp1)
    N = len(df_enr0)

    '''
    	gem.df_enr  = bpx.df_enr
		gem.df_enr0 = bpx.df_enr0
    '''
    gem.set_case_covid(bpx.case, bpx.df_enr, bpx.df_enr0)
    
    ''' set n_sentences for > call_gemini > ptw_abst = gem.prepare_abstract_n_sentences(ptw_abst) '''
    gem.n_sentences = n_sentences

    #-- calc the middle
    # Window of n rows centred on the middle of df_enr0 ...
    n2 = int(n/2)
    N2 = int(N/2)
    
    ini = N2-n2
    end = ini+n

    # ... pushed past the first n rows when it would start inside them.
    if ini <= n:
        ini = n+1
        end = ini + n

    end_middle = end
    
    dfp2 = df_enr0.iloc[ini:end].copy()
    dfp2.index = np.arange(0, len(dfp2))

    if dfp2.empty:
        print("----------------")
        print("dfp2 is empty!!!")
        print("----------------")
        raise Exception('Stop dfp2')

    # calc the end
    # Last n rows of df_enr0, starting after the middle window if they overlap.
    ini = N-n
    end = N

    if ini <= end_middle:
        ini = end_middle + 1
        
    dfp3 = df_enr0.iloc[ini:end].copy()
    dfp3.index = np.arange(0, len(dfp3))

    if dfp3.empty:
        print("----------------")
        print("dfp3 is empty!!!")
        print("----------------")
        raise Exception('Stop dfp3')
    
    dfp4 = gem.pick_other_pahtways(dfp1)

    if dfp4.empty:
        print("----------------")
        print("dfp4 is empty!!!")
        print("----------------")
        raise Exception('Stop dfp4')
    
    for quest_type in question_type_list:
        print("\t$$$", quest_type)

        question0, with_without_PubMed, suffix = gem.define_question(quest_type, bpx.case)

        question_name0 = f'{with_without_PubMed}_{suffix}_0_default'
        question_name1 = f'{with_without_PubMed}_{suffix}_0_first'
        question_name2 = f'{with_without_PubMed}_{suffix}_1_middle' 
        question_name3 = f'{with_without_PubMed}_{suffix}_2_final'
        question_name4 = f'{with_without_PubMed}_{suffix}_3_others'

        # One [i_dfp, question_name, dfp] triple per pathway selection.
        multiple_data  = [ [0, question_name0, dfp0], [1, question_name1, dfp1], [2, question_name2, dfp2], 
                           [3, question_name3, dfp3], [4, question_name4, dfp4]]

        # NOTE: the local `verbose` is not forwarded; verbose=False is passed
        # literally.
        dfall = gem.run_question_gemini(prefix_list=prefix_list, 
                                        question0=question0, multiple_data=multiple_data, 
                                        num_tries=num_tries, force=force, verbose=False)
    
        # print(f"\n------------- end quest_type {quest_type} --------------\n\n")
print("-------------- final end --------------")

In [None]:
dfr = gem.reactome.open_reactome_abstract(verbose=False)
len(dfr)

In [None]:
dfr.head(3)

In [None]:
dfr.columns

In [None]:
col0 = 'dbId'

In [None]:
i = 0
dfr.iloc[i][col0], isfloat(dfr.iloc[i][col0])

In [None]:
# Keep only the rows whose `col0` value is accepted by isfloat() or is missing.
# The column is iterated directly instead of materialising every row with
# dfr.iloc[i] twice, and the redundant "True if ... else False" is dropped.
# NOTE: `dfr` is rebound to the filtered frame, as before.
goods = [bool(isfloat(value) or pd.isnull(value)) for value in dfr[col0]]
dfr = dfr[goods]
len(dfr)

In [None]:
dfr = dfr.drop_duplicates("pathway_id")
len(dfr)

In [None]:
dfr[ pd.isnull(dfr.dbId) ]

In [None]:
dfr.head(5)

In [None]:
pdwritecsv(dfr, gem.reactome.fname_reactome_abstract, gem.reactome.root_reactome, verbose=True)

In [None]:
# Walk through `multiple_data`, printing each i_dfp; after the loop `dfp` is
# the LAST frame of the list, and its pathway ids become `pathw_list`.
# NOTE(review): this needs `multiple_data` to hold 3-element
# [i_dfp, question_name, dfp] entries; one earlier cell builds 2-element
# entries instead, so the result depends on which cell ran last.
for i_dfp, question_name, dfp in multiple_data:
    print(i_dfp)

pathw_list = dfp.pathway_id
len(pathw_list)

In [None]:
gem.reactome.refresh_reactome_table(pathw_list, force=False, verbose=True)

In [None]:
dfr = gem.reactome.open_reactome_abstract(verbose=False)
print(len(dfr))
dfr.tail(3)

In [None]:
# List the .json files in the Reactome pathway folder whose pathway id is not
# present in the Reactome table.
# Fix: the file name already ends in '.json', so the previous test
# (`x + '.json' not in pathway_id_list`) compared "<name>.json.json" with
# pathway ids and could never exclude anything. The extension is now stripped
# before the comparison.
# NOTE(review): assumes the files are named "<pathway_id>.json" -- confirm.
pathway_id_list = dfr.pathway_id.to_list()
known_ids = set(pathway_id_list)
files = [x for x in os.listdir(gem.reactome.root_reactome_pathway)
         if x.endswith('.json') and os.path.splitext(x)[0] not in known_ids]
len(pathway_id_list), len(files)

In [None]:
files[:5]

In [None]:
# NOTE(review): `text` is not assigned in any visible cell of this notebook;
# it must come from elsewhere (e.g. a star import or an earlier session) --
# define it explicitly before this call.
gem.mean_classifier(text, num_words = 100)

In [None]:
dfr = gem.reactome.open_reactome_abstract(verbose=True)

In [None]:
pathway_id = 'R-HSA-71406'
dfr[dfr.pathway_id == pathway_id]

In [None]:
pathway_id = 'R-HSA-71406'
gem.dfr[gem.dfr.pathway_id == pathway_id]

In [None]:
pathway_id = 'R-HSA-71406'
pathway_id = 'R-HSA-381070'
pathway_id