In [1]:
import pandas as pd
from ast import literal_eval
import random
import numpy as np

In [2]:
# Codes iterated over by the loading and scoring cells below, one entry per
# `test_data/<code>_gpt.parquet` file the notebook tries to read.
langs = [
    "af",
    "fr",
    "ga",
    "gu",
    "kk",
    "ms",
    "pt",
    "simple",
    "sk",
    "en",
]

In [3]:
# Load the GPT evaluation frame of every language that has a parquet file on
# disk. Loading stays best-effort for languages without a file, but only the
# "file is absent" case is tolerated: any other failure (missing parquet
# engine, corrupt file, interrupt) now surfaces instead of being swallowed.
dfs = {}
missing_langs = []
for lang in langs:
    try:
        dfs[lang] = pd.read_parquet(f'test_data/{lang}_gpt.parquet')
    except FileNotFoundError:
        missing_langs.append(lang)
if missing_langs:
    # Make the skipped languages visible rather than dropping them silently.
    print(f'No GPT parquet file found for: {missing_langs}')
dfs.keys()

dict_keys(['af', 'fr', 'ga', 'gu', 'kk', 'ms', 'pt', 'simple', 'sk', 'en'])

In [4]:
# Show the column names of the 'pt' frame as a sample of the loaded schema.
dfs['pt'].columns

Index(['source_title', 'source_ID', 'target_title', 'context', 'section',
       'mention_present', 'source_lead', 'first_version', 'second_version',
       'direct_match', 'missing_category', 'negative_contexts',
       'current_links', 'target_lead', 'random_rank', 'bm25_mentions_rank',
       'fuzzy_match_rank', 'roberta_simple_pt', 'roberta_simple_multilingual',
       'roberta_full_pt', 'roberta_full_multilingual', 'roberta_dyn_mask_pt',
       'roberta_only_expansion_pt', 'roberta_expansion_pt',
       'roberta_full_multilingual-all', 'roberta_dyn_mask_no_neg_pt',
       'gpt_outputs_25_cands', 'gpt_4_outputs_25_cands'],
      dtype='object')

# Get GPT scores

## All candidates

In [5]:
def _simplify_choice(answer):
    """Collapse a raw answer to 'A' or 'B' when it names exactly one passage.

    An answer mentioning only 'Passage A' becomes 'A', one mentioning only
    'Passage B' becomes 'B'; anything else (both or neither) is returned
    unchanged so the caller can treat it as unrecognised.
    """
    mentions_a = 'Passage A' in answer
    mentions_b = 'Passage B' in answer
    if mentions_a and not mentions_b:
        return 'A'
    if mentions_b and not mentions_a:
        return 'B'
    return answer


def evaluate_results(outputs, verbose=True):
    """Convert raw pairwise GPT answers into one rank per example.

    Args:
        outputs: iterable of strings, each a Python literal (parsed with
            `literal_eval`) holding a sequence of two-answer results.
        verbose: when True (the default, matching the previous behaviour),
            print every result whose simplified answers are not a
            recognised 'A'/'B' pair. Pass False to keep the output quiet.

    Returns:
        A list with one integer rank per element of `outputs`. The rank
        starts at 1, grows by 1 for every ('B', 'A') result, and grows by
        half (floor) of the number of tied results.

    A result counts as tied when either answer is None, when both answers
    simplify to the same passage, or when an answer cannot be simplified.
    NOTE(review): presumably the two answers are the same comparison asked
    with the passage order swapped, so ('A', 'B') means the first-listed
    passage won both times - confirm against the prompt-generation code.
    """
    positions = []
    for output in outputs:
        position = 1
        tied = 0
        for result in literal_eval(output):
            if result[0] is None or result[1] is None:
                tied += 1
                continue

            verdict = (_simplify_choice(result[0]), _simplify_choice(result[1]))
            if verdict == ('A', 'B'):
                # Consistent answer that leaves the rank untouched.
                continue
            if verdict == ('B', 'A'):
                # Consistent answer in the other direction: one rank worse.
                position += 1
                continue

            tied += 1
            # Only answers that could not be reduced to 'A'/'B' are echoed;
            # contradictory ('A', 'A') / ('B', 'B') pairs are counted silently.
            if verbose and verdict not in (('A', 'A'), ('B', 'B')):
                print(result)
        # Ties contribute half a rank each, rounded down.
        positions.append(position + tied // 2)
    return positions

In [6]:
# Empty result table: scores[lang][model][metric][category], every leaf None
# until the scoring cell fills it in.
scores = {
    lang: {
        model: {
            # dict.fromkeys builds a fresh {'all': None, ...} dict per metric.
            metric: dict.fromkeys(['all', 'present', 'missing'])
            for metric in ['mrr', 'hits@1']
        }
        for model in ['gpt-3', 'gpt-4']
    }
    for lang in langs
}

In [7]:
for lang, frame in dfs.items():
    # Derive a rank column from the raw GPT outputs, for whichever model
    # columns this language's frame actually contains.
    if 'gpt_outputs_25_cands' in frame.columns:
        frame['gpt-3_rank'] = evaluate_results(frame['gpt_outputs_25_cands'].tolist())
    if 'gpt_4_outputs_25_cands' in frame.columns:
        frame['gpt-4_rank'] = evaluate_results(frame['gpt_4_outputs_25_cands'].tolist())

    for model in ['gpt-3', 'gpt-4']:
        rank_column = f'{model}_rank'
        if rank_column not in frame.columns:
            continue

        # 'present' = rows with no missing_category value, 'missing' = the rest.
        no_missing_category = frame['missing_category'].isna()
        subsets = {
            'all': frame,
            'present': frame[no_missing_category],
            'missing': frame[~no_missing_category],
        }
        for category, subset in subsets.items():
            ranks = subset[rank_column].to_numpy()
            # MRR: mean of reciprocal ranks; hits@1: share of rows ranked first.
            scores[lang][model]['mrr'][category] = np.mean(1 / ranks)
            scores[lang][model]['hits@1'][category] = np.mean(ranks == 1)

['Neither Passage A nor Passage B is relevant for inserting a mention to the query entity.', '{{Passage B}}']
['Neither passage is relevant for inserting a mention to the query entity, Lemoen.', '{{Passage A}}']
["Since Passage A and Passage B are identical and do not contain any direct relevance to the year 2023 or related events, it's impossible to determine a more suitable passage based solely on the information provided. Therefore, any choice would be arbitrary. However, since a choice is required, I will select:\n\n{{Passage A}}", '{{Passage A}}']
['Neither Passage A nor Passage B', 'Neither Passage A nor Passage B is relevant for inserting a mention to the query entity "China."']
['Neither Passage A nor Passage B', '{{Passage A}}']
['Passage A', 'Neither passage is relevant for the query entity.']
['Neither Passage A nor Passage B is relevant for inserting a mention to the query entity.', '{{Passage A}}']
['Neither Passage A nor Passage B', 'Passage A']
['Neither passage is relev

In [13]:
# Display the MRR and hits@1 scores computed for the 'sk' entry.
scores['sk']

{'gpt-3': {'mrr': {'all': 0.4426480536708024,
   'present': 0.5818916871242453,
   'missing': 0.33760461089013494},
  'hits@1': {'all': 0.31,
   'present': 0.46511627906976744,
   'missing': 0.19298245614035087}},
 'gpt-4': {'mrr': {'all': 0.6251084511680419,
   'present': 0.869258454723571,
   'missing': 0.44092511515246735},
  'hits@1': {'all': 0.54,
   'present': 0.8372093023255814,
   'missing': 0.3157894736842105}}}

# Counting contexts

In [104]:
for lang, frame in dfs.items():
    # Number of negative contexts stored for each row of this language.
    context_counts = [
        len(literal_eval(raw_contexts))
        for raw_contexts in frame['negative_contexts'].tolist()
    ]
    # Every context is counted twice; `limited` caps each row at 24 contexts.
    # NOTE(review): presumably 2 = both passage orders and 24 = the negatives
    # kept for the 25-candidate runs - confirm.
    full = sum(count * 2 for count in context_counts)
    limited = sum(min(count, 24) * 2 for count in context_counts)
    print(lang, full, limited)
    

af 14590 3282


fr 47364 4528
ga 5130 2432
gu 2122 952
kk 16454 3460
ms 12526 3346
pt 13686 3670
simple 18550 3234
sk 79660 3826
en 14854 3406
