In [None]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

### Two cases of the randomly selected pathways model (2CRSP)

#### Goal
  - create a small dataset
  - these tables can be sent to Human Reviewers
#### 2CRSP
  - chose 2 cases / subtypes
  - randomly sorted 30 pathways
     - using the Gemini 1.5-flash model
     - included 15 'Yes' and 15 'No' responses
  - root data: ../project/pubgem

#### Comparing Gemini x Pubmed x Reviewers 
  - same methods as for the Ensemble dataset
  - using the 2CRSP

### 3 Sources ~ 2CRSP
  - Gemini MMC
  - PubMed answers
  - Human consensus

### Gemini x Pubmed: save both and only one

  - save_pubmed_x_gemini_both_and_only_one()
    - compare_pubmed_x_gemini()
      - pub.fname_pubmed_x_gemini%(case, i_dfp, run, pub.gem.gemini_model)
      - instead of
        - run_gemini_consensus_counts_all_models(sel_ptw_pubmed)
        - dfpiv2 = open_gemini_consensus_counts_run_filter_idfp_consensus(sel_ptw_pubmed=sel_ptw_pubmed)
        - dfpiva = open_gemini_dfpiva_all_models_one_run(run=run, sel_ptw_pubmed=sel_ptw_pubmed, verbose=verbose)
        - gemini_summary_consensus_statitics(sel_ptw_pubmed)
        - compare_2_models_venn_diagram(sel_ptw_pubmed)
        - get_2_models(sel_ptw_pubmed)
        - compare_2_runs_unanimous_mean(sel_ptw_pubmed)
        - compare_2_runs_total_answers(sel_ptw_pubmed)
        - report_gemini(sel_ptw_pubmed)
        - sel_ptw_pubmed:bool=False)


In [None]:
import os, sys, pickle

import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
import yaml

import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

sys.path.insert(1, '../src/')

from Basic import *
from entrez_conversion import *
from pubmed_lib import *
from biopax_lib import *
from gemini_lib import *

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

# Contact e-mail; passed as the third positional argument to Pubmed(...) further down.
email = "flalix@gmail.com"

# !pip3 install pyyaml
# Load all notebook parameters from params.yml into dic_yml.
# yaml.safe_load is used, so the file cannot construct arbitrary Python objects.
with open('params.yml', 'r') as file:
    dic_yml = yaml.safe_load(file)

# print(dic_yml)

In [None]:
# Unpack the entries of dic_yml (loaded from params.yml) into notebook-level names.
# A missing key raises KeyError here, before any object is built.
root_chibe = dic_yml['root_chibe']
root_colab = dic_yml['root_colab']
root0 = dic_yml['root0']

project = dic_yml['project']
s_project = dic_yml['s_project']

# The next four values are passed to Biopax(...) in the following cell.
gene_protein = dic_yml['gene_protein']
s_omics = dic_yml['s_omics']

has_age = dic_yml['has_age']
has_gender = dic_yml['has_gender']

want_normalized = dic_yml['want_normalized']

abs_lfc_cutoff_inf = dic_yml['abs_lfc_cutoff_inf']
s_pathw_enrichm_method = dic_yml['s_pathw_enrichm_method']
num_min_degs_for_ptw_enr = dic_yml['num_min_degs_for_ptw_enr']

tolerance_pathway_index = dic_yml['tolerance_pathway_index']
type_sat_ptw_index = dic_yml['type_sat_ptw_index']
saturation_lfc_index = dic_yml['saturation_lfc_index']
chosen_model_sampling = dic_yml['chosen_model_sampling']

# case_list is later replaced by case_sel_list (see the "is_seldata" cell).
case_list = dic_yml['case_list']
case_sel_list = dic_yml['case_sel_list']
s_len_case = dic_yml['s_len_case']

# Pathway cutoffs; only echoed by the print at the end of this cell.
pval_pathway_cutoff = dic_yml['pval_pathway_cutoff']
fdr_pathway_cutoff = dic_yml['fdr_pathway_cutoff']
num_of_genes_cutoff = dic_yml['num_of_genes_cutoff']

run_list = dic_yml['run_list']
chosen_model_list = dic_yml['chosen_model_list']
i_dfp_list = dic_yml['i_dfp_list']

# exp_normalization is 'quantile_norm' only when want_normalized is truthy, otherwise None;
# normalization is the matching text label, 'not_normalized' when exp_normalization is None.
exp_normalization='quantile_norm' if want_normalized else None
normalization='not_normalized' if exp_normalization is None else exp_normalization

cfg = Config(project, s_project, case_list, root0)

case = case_list[0]

# Placeholders set to -1; n_degs is overwritten by the call on the next line.
n_genes_annot_ptw, n_degs, n_degs_in_ptw, n_degs_not_in_ptw, degs_in_all_ratio = -1,-1,-1,-1,-1
# NOTE(review): the label is hard-coded to 'not_normalized' although `normalization`
# was computed just above - confirm this is intended when want_normalized is true.
abs_lfc_cutoff, fdr_lfc_cutoff, n_degs, n_degs_up, n_degs_dw = cfg.get_best_lfc_cutoff(case, 'not_normalized')

print(f"G/P LFC cutoffs: lfc={abs_lfc_cutoff:.3f}; fdr={fdr_lfc_cutoff:.3f}")
print(f"Pathway cutoffs: pval={pval_pathway_cutoff:.3f}; fdr={fdr_pathway_cutoff:.3f}; num of genes={num_of_genes_cutoff}")

In [None]:
# Build the Biopax object from the parameters unpacked from params.yml.
bpx = Biopax(gene_protein, s_omics, project, s_project, root0,
             case_list, has_age, has_gender, clone_objects=False,
             exp_normalization=exp_normalization, geneset_num=0, 
             num_min_degs_for_ptw_enr=num_min_degs_for_ptw_enr, 
             tolerance_pathway_index=tolerance_pathway_index, 
             s_pathw_enrichm_method = s_pathw_enrichm_method)

case = case_list[0]

# Register default cutoffs (abs LFC 1, FDR 0.05) for the current normalization label,
# then open the first case.
bpx.cfg.set_default_best_lfc_cutoff(normalization, abs_lfc_cutoff=1, fdr_lfc_cutoff=0.05)
# presumably open_case also populates bpx.case / bpx.df_enr / bpx.df_enr0, which a
# later cell passes to gem.set_case - TODO confirm in biopax_lib.
ret, degs, degs_ensembl, dfdegs = bpx.open_case(case, verbose=False)
print("\nEcho Parameters:")
bpx.echo_parameters()

### Instantiating Gemini

In [None]:
##########################
# Selected-data mode: restrict the pathway-table indices to [0] and start at i_dfp 0.
is_seldata = True
i_dfp = 0
if is_seldata:
    i_dfp_list = [0]

# From this cell on, case_list refers to the selected cases only.
case_list = case_sel_list
case_sel0 = case_sel_list[0]
case_sel1 = case_sel_list[1]

# The gender filter is only iterated over when the Biopax object reports has_gender.
with_gender = bpx.has_gender
if with_gender:
    with_gender_list = [False, True]
else:
    with_gender_list = [False]

print(f"with_gender = {with_gender} because has_gender = {bpx.has_gender}")
##########################

In [None]:
# Echo the selection settings that drive the rest of the notebook.
is_seldata, case_list, s_len_case, run_list, chosen_model_list, i_dfp_list, chosen_model_sampling

In [None]:
# The Gemini API key comes from params.yml, so it is not hard-coded in the notebook.
API_KEY = dic_yml['API_KEY']

disease = dic_yml['disease']
context_disease = dic_yml['context_disease']
n_sentences = dic_yml['n_sentences']
# chosen_model_sampling is read again here; the parameter cell above already loaded the same key.
chosen_model_sampling = dic_yml['chosen_model_sampling']

# Build the Gemini helper around the Biopax object, using the selected-data settings.
gem = Gemini( bpx=bpx, is_seldata=is_seldata, disease=disease, context_disease=context_disease, 
             API_KEY=API_KEY, n_sentences=n_sentences, root0=root0, 
             chosen_model_list=chosen_model_list, i_dfp_list=i_dfp_list, chosen_model_sampling=chosen_model_sampling)
print("\n")
print(gem.disease, gem.is_seldata, gem.i_dfp_list, gem.chosen_model_list)
print("Context:", context_disease)

In [None]:
# Hand the case currently held by bpx, with bpx.df_enr and bpx.df_enr0, to the Gemini helper.
# presumably df_enr / df_enr0 are the enrichment tables of that case - TODO confirm.
gem.set_case(bpx.case, bpx.df_enr, bpx.df_enr0)

In [None]:
# Echo the settings stored on the Gemini object, to compare with the notebook-level values.
gem.is_seldata, gem.bpx.case_list, gem.chosen_model_list, gem.i_dfp_list, gem.chosen_model_sampling

In [None]:
# PubMed search parameters, all read from params.yml.
terms1_param = dic_yml['terms1_param']
terms2_param = dic_yml['terms2_param']
terms_not_param = dic_yml['terms_not_param']
connective_param = dic_yml['connective_param']
remove_synonym_list = dic_yml['remove_synonym_list']
inidate = dic_yml['inidate']
enddate = dic_yml['enddate']
# verbose_query and force_query are loaded but not passed to Pubmed(...) in this cell.
verbose_query = dic_yml['verbose_query']
force_query = dic_yml['force_query']
sleep_entrez = dic_yml['sleep_entrez']
retmax = dic_yml['retmax']
try_all_text = dic_yml['try_all_text']
text_quote = dic_yml['text_quote']
dec_ncpus = dic_yml['dec_ncpus']
sleep_TIKA = dic_yml['sleep_TIKA']
min_words_text = dic_yml['min_words_text']

# The short project name is used as the Pubmed prefix.
prefix = s_project

# Build the Pubmed helper; it receives both the Biopax and the Gemini objects.
pub = Pubmed(bpx, gem, email, prefix, root0=root0, 
             inidate=inidate, enddate=enddate, 
             terms1_param=terms1_param, terms2_param=terms2_param,
             terms_not_param=terms_not_param, connective_param=connective_param,
             remove_synonym_list=remove_synonym_list, 
             sleep_entrez = sleep_entrez, retmax=retmax,  
             try_all_text=try_all_text, text_quote=text_quote,
             root_colab=root_colab, dec_ncpus=dec_ncpus, sleep_TIKA=sleep_TIKA, min_words_text=min_words_text)

### Settings

In [None]:
# Notebook-wide switches; later cells reuse test / force / verbose.
test = force = verbose = False

print(">>> terms1_param", terms1_param)
print(">>> terms_not_param", terms_not_param)
print(">>> connective_param", connective_param)
print(f">>> prefix-disease: {prefix}, inidate {inidate}, enddate {enddate}")

# Select the Gemini model by its index in gem.gemini_models and register it on pub.gem.
chosen_model = 3
gemini_model = gem.gemini_models[chosen_model]
pub.gem.set_gemini_num_model(chosen_model, verbose=verbose)

# for selected data
N = 30
query_type = 'strong'

# Open the Yes/No sampling table of the first selected case and preview it.
case = case_sel0
print("")
dfsel = gem.open_yes_no_sampling(case=case, N=N, query_type=query_type, verbose=True)
print("")
dfsel.head(3)

In [None]:
# Echo the Gemini settings as seen through pub.gem, plus the case and gender lists used below.
pub.gem.is_seldata, pub.gem.i_dfp_list, pub.gem.root_gemini0, case_list, with_gender_list

### Merge all PubMed with/without gender filter
  - COVID-19 is dependent on gender, MB is not.
  - with gender: True or False for COVID-19 and False for MB
  - PubMed searches with or without gender are different
  - i_dfps = [0] for selected data and [0,1,2,3] for all data

#### Method - pub.merge_all_pubmeds:  
  - for all cases
    - get the pathways
    - get the reactome term table
    - for each pathway - term:
      - search for pmids in PubMed
  - one run (dummy)
  - one chosen_model (3 dummy)
  - for gender True and False
  - for all cases, iqs', i_dfps'
    - get the pathways
    - get the reactome term table
    - for each pathway - term:
      - search for pmids in PubMed
  - PubMed search on 2025/01/02


In [None]:
# Echo the three lists passed to the summary call in the next cell.
case_list, i_dfp_list, with_gender_list

In [None]:
verbose=False
force=False

# Returns three tables, previewed in the cells that follow.
# NOTE(review): the name `dfn` is reassigned further down by
# pub.open_compare_pubmed_x_gemini(...), so this value is only valid until then.
dfm_pub, df_summ, dfn = pub.calc_all_pubmed_summaries(case_list=case_list, i_dfp_list=i_dfp_list,
                                                      with_gender_list=with_gender_list, show_warning=False,
                                                      force=force, verbose=verbose)

In [None]:
# Row counts of the merged PubMed table and of its summary table.
print(len(dfm_pub))
print(len(df_summ), '\n')

In [None]:
# Preview the first rows of the first table returned by calc_all_pubmed_summaries.
dfm_pub.head(3)

In [None]:
# Display the second table returned by calc_all_pubmed_summaries.
df_summ

In [None]:
# Display the third table returned by calc_all_pubmed_summaries.
dfn

In [None]:
verbose=False
force=False

# Unlike calc_all_pubmed_summaries above, this call takes a single with_gender
# value (not with_gender_list) and returns two tables.
dfm_pub2, df_summ2 = pub.merge_all_pubmeds(case_list=case_list, i_dfp_list=i_dfp_list, with_gender=with_gender, 
                                         show_warning=False, force=force, verbose=verbose)

print(len(dfm_pub2))
dfm_pub2.head(3)

### run_case_pathway_pubmed_search() run inside merge_all_pubmeds()

#### without gender (MB only without gender)

In [None]:
# Columns of the per-PMID table, in display order.
cols = ['pathway_id', 'pathway', 'pmid', 'pub_date', 'title', 'keywords',
       'abstract',  'terms', 'dates']

df_pmid = pub.run_case_pathway_pubmed_search(case=case, i_dfp=i_dfp, with_gender=with_gender,
                                             test=test, verbose=verbose)

if df_pmid is not None:
    # The search fills the two summary tables on the pub object.
    df_summ_pmid = pub.df_summ_pmid
    df_summ_pathway = pub.df_summ_pathway

    print(len(df_pmid), len(df_summ_pmid), len(df_summ_pathway), '\n')
else:
    # Nothing returned: use an empty table with the expected columns so the preview still works.
    dic = {col: [] for col in cols}
    df_pmid = pd.DataFrame(dic)
    df_summ_pmid = pd.DataFrame()
    df_summ_pathway = pd.DataFrame()

df_pmid[cols].head(2)

### PubMed semi-summarize

In [None]:
# verbose is switched on for this call only; the following cells reset it.
verbose=True
force=False

# run_dummy='run0' is a placeholder; the notebook text states that PubMed results do not depend on the run.
dfsumm3 = pub.calc_semi_summarize_pubmed_search(run_dummy='run0', case_list=case_list, 
                                                i_dfp_list=i_dfp_list, with_gender_list=with_gender_list,
                                                test=False, show_warning=True,
                                                force=force, verbose=verbose)

print(len(dfsumm3))
dfsumm3.head(3)

In [None]:
# List the with_gender values present in the semi-summary.
dfsumm3.with_gender.unique()

### Calc Gemini consensus counts

In [None]:
verbose=False

# Run the per-run calculation for every run; the return value is discarded.
# force=False is hard-coded here rather than taken from the `force` switch.
# `run` keeps the last value of run_list after the loop.
for run in run_list:
    print(">>>", run)
    _ = pub.gem.calc_gemini_dfpiva_all_models_one_run(run=run,  case_list=case_list, 
                                                      chosen_model_list=chosen_model_list, 
                                                      force=False, verbose=verbose)

print("---------------- end -----------------")

In [None]:
verbose=True
# Pin the run explicitly; the loop in the previous cell left `run` at the last entry of run_list.
run='run01'

dfpiva = gem.open_gemini_dfpiva_all_models_one_run(run=run, chosen_model_list=chosen_model_list, verbose=verbose)
# print(dfpiva.columns)
dfpiva.head(3)

In [None]:
# Check which cases, i_dfp values and runs are present in dfpiva.
dfpiva.case.unique(), dfpiva.i_dfp.unique(), dfpiva.run.unique()

#### filter Yes/No

In [None]:
verbose = False
consensus = 'Yes'

# Open the consensus counts of the current run / i_dfp, filtered by the requested consensus.
dfpiv2 = pub.gem.open_gemini_consensus_counts_run_filter_idfp_consensus_run_all_models(
    run=run, i_dfp=i_dfp,
    chosen_model_list=chosen_model_list,
    consensus=consensus, verbose=verbose)

# Use an empty frame when nothing was returned, so the length and preview below still work.
dfpiv2 = pd.DataFrame() if dfpiv2 is None else dfpiv2

print(len(dfpiv2))
dfpiv2.head(3)

In [None]:
i=0
case=case_list[i]
print(">>>", case, "\n\n")

# The previous cell substitutes an empty, column-less DataFrame when no consensus
# table was returned. Attribute access `.case` on such a frame raises
# AttributeError, so the case filter is applied only when the column exists.
if 'case' in dfpiv2.columns:
    dfpiv3 = dfpiv2[dfpiv2.case == case]
else:
    dfpiv3 = dfpiv2

print(len(dfpiv3))
dfpiv3.head(3)

## Agreements between PubMed and Gemini
  - with_gender: True or False for COVID and False for MB
  - run, case, i_dfp (comparing all-model consensus)
  - total COVID-19 = 2*4*8*n_pathways = 64*n_pathways
  - total MB = 2*4*2*n_pathways = 16*n_pathways

In [None]:
verbose=False
force=False

# Compute the PubMed x Gemini comparison for every run and every gender setting.
# The return value is not used here; the next section re-opens the results with
# open_compare_pubmed_x_gemini().
# `run` and `with_gender` keep their last loop values after this cell.
for run in run_list:
    print(">>>", run)
    for with_gender in with_gender_list:
        print("\n\twith_gender", with_gender)
        pub.calc_compare_pubmed_x_gemini(run=run, case_list=case_list, i_dfp_list=i_dfp_list,
                                         with_gender=with_gender, chosen_model_list=chosen_model_list,
                                         force=force, verbose=verbose)
    print("\n")

print("------------- end ------------")

### Agreement percentage With/Without gender
  - open_compare_pubmed_x_gemini()

In [None]:
verbose=False

# Print, for every run / gender setting / i_dfp / case, the agreement summary text
# returned by open_compare_pubmed_x_gemini().
# The loop variables (run, with_gender, i_dfp, case) keep their names on purpose:
# the following cell reads run, i_dfp and with_gender after this loop finishes.
for run in run_list:

    print("For run:", run)
    for with_gender in with_gender_list:
        print(f'PubMed with_gender {with_gender}')

        for i_dfp in i_dfp_list:
            print(f"\ti_dfp {i_dfp}")

            for case in case_list:
                dfn, df_both, df_only_pubmed, df_only_gemini, mu, std, n, text = \
                        pub.open_compare_pubmed_x_gemini(run=run, case=case,  i_dfp=i_dfp, with_gender=with_gender,
                                                         chosen_model_list=chosen_model_list,
                                                         verbose=verbose)
                # Keep the part of the returned text after the word 'agree' and
                # prefix it with the number of compared pathways.
                text = f'#{len(dfn)} pathways agree' + text.split('agree')[1]

                # The former `sem = std / np.sqrt(n)` was never used and divided by
                # zero for an empty comparison (n == 0), so it has been removed.
                print(f"\t\tfor {case:15} {text}")
            print("")

    print("")

In [None]:
verbose=False
case=case_list[1]

# NOTE(review): run, i_dfp and with_gender are not set in this cell; they hold the
# last values left by the loops of the previous cell.
dfn, df_both, df_only_pubmed, df_only_gemini, mu, std, n, text = \
        pub.open_compare_pubmed_x_gemini(run=run, case=case, i_dfp=i_dfp, with_gender=with_gender, 
                                         chosen_model_list=chosen_model_list,
                                         verbose=verbose)
print(len(dfn))
dfn.head(3)

In [None]:
# Number of rows where agree is True, and number where it is False.
len(dfn[dfn.agree==True]), len(dfn[dfn.agree==False])

In [None]:
# dfn.columns

In [None]:
# i_dfp values present in the comparison table.
dfn.i_dfp.unique()

In [None]:
# Mean and standard deviation of the agree column (pandas .std() defaults to the sample estimate, ddof=1).
dfn.agree.mean(), dfn.agree.std()

In [None]:
# Summary text returned by open_compare_pubmed_x_gemini for the case opened above.
text

### Gemini counts: previously run
  - Count Yes and No per model, run versus iq and i_dfp:
    - 2 iq have PubMed inside the search (pubmed=True) and 2 don't
    - i_dfp: 0 to 3, 0=enriched, 1=middle, 2=end of the table, and 3=out of enriched table

In [None]:
verbose=False
# NOTE(review): this assignment only matters when run_list is empty; otherwise the
# loop below rebinds `run` immediately.
run='run01'

# Compute the answer counts for every run and every model; only the last dfstat is kept.
for run in run_list:
    for chosen_model in chosen_model_list:
        dfstat = pub.gem.gemini_calc_answers_counts(run=run, case_list=case_list, chosen_model=chosen_model,
                                                    force=False, verbose=verbose)

# old open_gemini_statistical_analysis
# NOTE(review): run and chosen_model here are the LAST values of run_list and
# chosen_model_list left by the loop above, not necessarily 'run01' - confirm intent.
dfc = gem.open_gemini_answers_counts(run=run, chosen_model=chosen_model, verbose=verbose)
print(len(dfc))

case = case_list[1]
dfc[dfc.case==case].head(3)

### All Gemini x PubMed: save both and only one

In [None]:
verbose=False
force=False

# Compute the both / only-one tables for every run; the five return values are
# discarded here and the next cell calls the same method again for 'run01'.
for run in run_list:
    print(">>>", run)
       
    _, _, _, _, _ = \
    pub.calc_all_pubmed_x_gemini_both_and_only_one(run=run, case_list=case_list, i_dfp_list=i_dfp_list,
                                                   chosen_model_list=chosen_model_list,
                                                   with_gender_list=with_gender_list,
                                                   force=force, verbose=verbose)

print("--------------- end -----------------")

In [None]:
verbose=False
run='run01'

# No `force` argument is passed in this call, unlike the loop in the previous cell.
df_all, df_all_both, df_only_pubmed, df_only_gemini, df_stat = \
    pub.calc_all_pubmed_x_gemini_both_and_only_one(run=run, case_list=case_list, i_dfp_list=i_dfp_list,
                                                   chosen_model_list=chosen_model_list, 
                                                   with_gender_list=with_gender_list, verbose=verbose)
# NOTE(review): with_gender is not set in this cell; it holds the value left by an
# earlier loop over with_gender_list.
df2 = df_stat[df_stat.with_gender == with_gender]

# Mean and standard deviation of the per-row `mu` values for this gender setting.
print(f"with_gender {with_gender} {df2.mu.mean():.2f} ({df2.mu.std():.2f})")
df2

### Yes/No gemini & pubmed - choose your case

In [None]:
case = case_list[0]

# Pathways of this case present in both sources where Gemini answered 'Yes'.
df2 = df_all_both.loc[
    (df_all_both.case == case)
    & (df_all_both.with_gender == with_gender)
    & (df_all_both.gemini == 'Yes')
]
print(len(df2))
df2.head(3)

In [None]:
# Pathways of this case present in both sources where Gemini answered 'No'.
df2 = df_all_both.loc[
    (df_all_both.case == case)
    & (df_all_both.with_gender == with_gender)
    & (df_all_both.gemini == 'No')
]
print(len(df2))
df2.head(3)

In [None]:
# Rows of the only-PubMed table for this case where PubMed answered 'Yes'.
df2 = df_only_pubmed.loc[
    (df_only_pubmed.case == case)
    & (df_only_pubmed.with_gender == with_gender)
    & (df_only_pubmed.pubmed == 'Yes')
]
print(len(df2))
df2.head(3)

In [None]:
# Rows of the only-Gemini table for this case where Gemini answered 'Yes'.
df2 = df_only_gemini.loc[
    (df_only_gemini.case == case)
    & (df_only_gemini.with_gender == with_gender)
    & (df_only_gemini.gemini == 'Yes')
]
print(len(df2))
df2.head(3)

### Yes/No gemini & pubmed - choose your case - second case

In [None]:
# Switch to the second case; with_gender is unchanged and only echoed.
i=1
case=case_list[i]
case, with_gender

In [None]:
# Pathways of the second case present in both sources where Gemini answered 'Yes'.
df2 = df_all_both.loc[
    (df_all_both.with_gender == with_gender)
    & (df_all_both.case == case)
    & (df_all_both.gemini == 'Yes')
]
print(len(df2))
df2.head(3)

In [None]:
# One pathway name per line, from the df2 built in the previous cell.
print("\n".join(df2.pathway))

In [None]:
# Pathways of the second case present in both sources where Gemini answered 'No'.
df2 = df_all_both.loc[
    (df_all_both.with_gender == with_gender)
    & (df_all_both.case == case)
    & (df_all_both.gemini == 'No')
]
print(len(df2))
df2.head(3)

In [None]:
# One pathway name per line, from the df2 built in the previous cell.
print("\n".join(df2.pathway))

In [None]:
# Rows of the only-PubMed table for the second case where PubMed answered 'Yes'.
# Every condition is built from df_only_pubmed itself. The original took the
# with_gender condition from df_all_both, whose index does not match
# df_only_pubmed, so the mask selected the wrong rows or failed to align.
df2 = df_only_pubmed[ (df_only_pubmed.with_gender==with_gender) & (df_only_pubmed.case == case) & (df_only_pubmed.pubmed == 'Yes')]
print(len(df2))
df2.head(3)

In [None]:
# One pathway name per line, from the df2 built in the previous cell.
print("\n".join(df2.pathway))

In [None]:
# Rows of the only-PubMed table for the second case where PubMed answered 'No'.
# Every condition is built from df_only_pubmed itself. The original took the
# with_gender condition from df_all_both, whose index does not match
# df_only_pubmed, so the mask selected the wrong rows or failed to align.
df2 = df_only_pubmed[ (df_only_pubmed.with_gender==with_gender) & (df_only_pubmed.case == case) & (df_only_pubmed.pubmed == 'No')]
print(len(df2))
df2.head(3)

In [None]:
# One pathway name per line, from the df2 built in the previous cell.
print("\n".join(df2.pathway))

### Statistics Gemini x Pubmed per case

In [None]:
verbose=False
force=False

# NOTE: this rebinds df_stat, which previously held the fifth return value of
# calc_all_pubmed_x_gemini_both_and_only_one().
df_stat = pub.stat_compare_pubmed_x_gemini_all(run=run, case_list=case_list, i_dfp_list=i_dfp_list,
                                               chosen_model_list=chosen_model_list,
                                               with_gender_list=with_gender_list,
                                               force=force, verbose=verbose)

# cols=['run', 'case', 'i_dfp', 'with_gender', 'stat', 'pvalue', 'dof', 'expected', 'n', 'vals_gemini', 'vals_pubmed', 'fdr']
cols=['case', 'i_dfp', 'with_gender', 'n', 'pvalue', 'fdr', 'vals_gemini', 'vals_pubmed', 'stat',  ]

# `case` is reset for later cells; the table displayed below is not filtered by it.
case = case_list[0]
df_stat[cols]

### Report

In [None]:
verbose=False
force=False
run='run01'

# The report is requested without the gender filter only: with_gender_list is
# hard-coded to [False] instead of using the notebook-level with_gender_list.
msg = pub.report_all_pubmed_x_gemini_both_and_only_one(run=run, with_gender_list=[False],
                                                       case_list=case_list, i_dfp_list=i_dfp_list,
                                                       chosen_model_list=chosen_model_list, 
                                                       i_dfp=i_dfp,
                                                       force=force, verbose=verbose)

print(msg)

### Only Gemini

In [None]:
verbose=False
run='run01'

# Reload the comparison tables for run01 so this cell does not depend on which
# run the earlier cells left in these variables.
df_all, df_all_both, df_only_pubmed, df_only_gemini, df_stat = \
    pub.calc_all_pubmed_x_gemini_both_and_only_one(run=run, case_list=case_list, i_dfp_list=i_dfp_list,
                                                   chosen_model_list=chosen_model_list,
                                                   with_gender_list=with_gender_list, verbose=verbose)

# The banner ended in "\m", which is not a valid escape sequence and printed a
# literal backslash-m; a newline was intended.
print(f"------------ Only Gemini -  i_dfp {i_dfp}  with_gender {with_gender} -------------\n")

# For every case, list the distinct pathways that only Gemini answered 'Yes' for.
for case in case_list:
    df2 = df_only_gemini[ (df_only_gemini.case==case) &
                          (df_only_gemini.with_gender==with_gender) &
                          (df_only_gemini.gemini == 'Yes') &
                          (df_only_gemini.i_dfp==i_dfp)]

    if df2.empty:
        print(">>>", case, 'nothing found')
        continue

    # np.unique returns the distinct pathway names in sorted order.
    lista = np.unique(df2.pathway)
    print(">>>", case, len(lista))
    print("\t",end='')
    print("\n\t".join(lista))
    print("\n")



### Only Pubmed

In [None]:
# The banner ended in "\m", which is not a valid escape sequence and printed a
# literal backslash-m; a newline was intended.
print(f"------------ Only PubMed -  i_dfp {i_dfp}  with_gender {with_gender} -------------\n")

# For every case, list the distinct pathways that only PubMed answered 'Yes' for.
for case in case_list:
    df2 = df_only_pubmed[ (df_only_pubmed.case==case) &
                          (df_only_pubmed.with_gender==with_gender) &
                          (df_only_pubmed.pubmed == 'Yes') &
                          (df_only_pubmed.i_dfp==i_dfp)]

    if df2.empty:
        print(f">>> {case}: nothing found\n")
        continue

    # np.unique returns the distinct pathway names in sorted order.
    lista = np.unique(df2.pathway)
    print(">>>", case, len(lista))
    print("\t",end='')
    print("\n\t".join(lista))
    # A markdown header ("### Both Yes/No") had been pasted onto the end of this
    # line as a trailing comment; it belongs to the next markdown cell.
    print("\n")

### Both Yes/No

In [None]:
# For each answer, list per case the distinct pathways on which PubMed and Gemini both gave that answer.
for answer in ('Yes', 'No'):

    print(f"------------ Both {answer}, PubMed and Gemini - i_dfp {i_dfp}  with_gender {with_gender} -------------\n")

    for case in case_list:
        df2 = df_all_both.loc[
            (df_all_both.case == case)
            & (df_all_both.with_gender == with_gender)
            & (df_all_both.i_dfp == i_dfp)
            & (df_all_both.gemini == answer)
        ]

        if not df2.empty:
            print(">>>", case)
            # Distinct pathway names in sorted order, one per tab-indented line.
            lista = np.unique(df2.pathway)
            print("\t" + "\n\t".join(lista))
            print("")
        else:
            print(f">>> {case}: nothing found\n")
    print("")

### Agreements between PubMed x Gemini

  - Chi-square test - found in Gemini x found in Pubmed

In [None]:
verbose=False
force=False

# df holds one row per run / case / i_dfp / with_gender combination.
# dfg is the second returned table, previewed in later cells.
df, dfg = pub.agreements_between_pubmed_and_gemini(run_list=run_list, case_list=case_list, i_dfp_list=i_dfp_list,
                                                   chosen_model_list=chosen_model_list, with_gender_list=with_gender_list,
                                                   force=force, verbose=verbose)
pd.set_option('display.precision', 3)

print(len(df))

# Columns to display. An earlier variant of this list, identical except for an
# extra 'chosen_model_list' column, was assigned and immediately overwritten;
# only the effective list is kept.
cols = ['run', 'case', 'i_dfp', 'is_seldata', 'with_gender', 'n',
       'n_gemini_yes', 'n_gemini_no', 'n_pubmed_yes', 'n_pubmed_no', 'n_both_yes_no', 'agree',
       'agree_std', 'fdr', 'pvalue', 'stat', 'dof', 'n_only_gemini_yes', 'n_only_pubmed_yes']

# The original line was `run=='run01'`, a comparison whose result was discarded;
# an assignment was intended so the per-case cells below filter on run01.
run='run01'
print(">>>", run, "chosen_model_list", chosen_model_list)
df[cols]

In [None]:
# Agreement rows of the first case for the current run.
i = 0
case = case_list[i]
df2 = df.loc[(df.run == run) & (df.case == case)]
print(df2.i_dfp.unique())
df2[cols]

In [None]:
# Agreement rows of the second case for the current run.
i = 1
case = case_list[i]
df2 = df.loc[(df.run == run) & (df.case == case)]
print(df2.i_dfp.unique())
df2[cols]

In [None]:
# Preview the second table returned by agreements_between_pubmed_and_gemini().
dfg.head(6)

In [None]:
# Show floats with three decimals from here on.
pd.options.display.float_format = "{:,.3f}".format

run = 'run01'
cols = ['run', 'case', 'with_gender', 'agree', 'agree_std', 'n']

# Rows of run01 obtained without the gender filter.
dfg2 = dfg.loc[(dfg.run == run) & (dfg.with_gender == False)]
dfg2[cols]

In [None]:
run = 'run02'
cols = ['run', 'case', 'with_gender', 'agree', 'agree_std', 'n']

# Rows of run02 obtained without the gender filter.
dfg2 = dfg.loc[(dfg.run == run) & (dfg.with_gender == False)]
dfg2[cols]

In [None]:
run = 'run01'

print(f"Gemini x Pubmed agreement for {run} (min, max):")
print("\n============================ without gender ======")
for case in case_list:
    # Rows of this run and case from the PubMed search without the gender filter.
    df3 = df.loc[(df.run == run) & (df.case == case) & (df.with_gender == False)]
    mini2 = df3.agree.min()
    maxi2 = df3.agree.max()

    # Agreement range across the selected rows, as percentages.
    stri = f"for {case:15} [{100*mini2:2.1f}%, {100*maxi2:2.1f}%]"
    print('\t', stri)

In [None]:
run='run02'

print(f"Gemini x Pubmed agreement for {run} (min, max):")
print("\n============================ without gender======")
for case in case_sel_list:
    # Rows of this run and case from the PubMed search without the gender filter.
    df3 = df[ (df.run==run) & (df.case == case) & (df.with_gender == False)]

    # df3.iloc[0] raises IndexError when the run has no row for this case
    # (e.g. run02 was not computed), so report that and move on, as the
    # listing cells above do.
    if df3.empty:
        print('\t', f"for {case:15} nothing found")
        continue

    # Only the first matching row is reported.
    agree_nogend     = df3.iloc[0].agree
    agree_nogend_std = df3.iloc[0].agree_std

    stri = f"for {case:15} {100*agree_nogend:2.1f}% ({100*agree_nogend_std:2.1f}%) "
    print('\t',stri)

### Runs x Gemini

In [None]:
# Gemini results are independent of with_gender; here the two runs are compared
# row by row for the current with_gender value.

run1='run01'
df1 = df[ (df.run==run1) & (df.with_gender==with_gender)].reset_index(drop=True)

run2='run02'
df2 = df[ (df.run==run2) & (df.with_gender==with_gender)].reset_index(drop=True)

# The comparison is positional, so both runs must have the same number of rows.
# The original per-row .iloc loop raised an opaque IndexError when run02 was
# shorter and silently ignored the extra rows when it was longer.
if len(df1) != len(df2):
    raise ValueError(f"cannot compare runs: {run1} has {len(df1)} rows, {run2} has {len(df2)} rows")

# Vectorised equivalent of comparing n_gemini_yes row by row.
same_counts = df1.n_gemini_yes.to_numpy() == df2.n_gemini_yes.to_numpy()
equal = int(same_counts.sum())
diff  = int((~same_counts).sum())

if diff == 0:
    print('Gemini insensible to runs')
else:
    print('Gemini varies with runs')
# equal + diff is the number of compared rows
f"different runs with_gender {with_gender}", with_gender, equal, diff

### Pubmed has no runs - comparing with_gender

In [None]:
# PubMed results are dependent on with_gender and independent of the run; here
# the two runs are compared row by row for the current with_gender value.
run1='run01'
df1 = df[ (df.run==run1) & (df.with_gender==with_gender)].reset_index(drop=True)

run2='run02'
df2 = df[ (df.run==run2) & (df.with_gender==with_gender)].reset_index(drop=True)

# The comparison is positional, so both runs must have the same number of rows.
# The original per-row .iloc loop raised an opaque IndexError when run02 was
# shorter and silently ignored the extra rows when it was longer.
if len(df1) != len(df2):
    raise ValueError(f"cannot compare runs: {run1} has {len(df1)} rows, {run2} has {len(df2)} rows")

# Vectorised equivalent of comparing n_pubmed_yes row by row.
same_counts = df1.n_pubmed_yes.to_numpy() == df2.n_pubmed_yes.to_numpy()
equal = int(same_counts.sum())
diff  = int((~same_counts).sum())

if diff == 0:
    print('Pubmed insensible to runs')
else:
    print('Pubmed varies with runs')
# equal + diff is the number of compared rows
f"different runs with_gender {with_gender}", with_gender, equal, diff