In [29]:
import json
import sys

sys.path.append('..')
from mutability.domain import Queries
from utils.data_handling import *
from utils.f1_score import compute_score

from transformers import AutoTokenizer

In [2]:
def evaluate(data, predictions, target_mode, prediction_mode):
    """Score model predictions against the targets selected by ``target_mode``.

    Args:
        data: iterable of query objects exposing ``id`` and
            ``get_relevant_target(mode)``.
        predictions: prediction store handed to ``get_prediction``.
        target_mode: forwarded to ``query.get_relevant_target``; per the
            original author's note this selects the most recent, most
            frequent, or specific-year answer.
        prediction_mode: forwarded to ``get_prediction`` to choose which
            prediction is scored for each query.

    Returns:
        Whatever ``compute_score`` returns for the collected pairs. The
        notebook reads the ``'ave_f1'`` key; the original note says F1 is
        the max over every alias of every answer (implemented inside
        ``compute_score`` -- not visible here).

    Queries for which ``get_relevant_target`` returns ``None`` are left out
    of the evaluation entirely.
    """
    references = []
    candidates = []

    for query in data:
        gold = query.get_relevant_target(target_mode)
        if gold is not None:
            chosen = get_prediction(predictions, query.id, prediction_mode)

            # SQuAD-style reference record; the answer offsets are dummy zeros,
            # one per gold string.
            gold_answers = {"answer_start": [0] * len(gold), "text": gold}
            references.append({"answers": gold_answers, "id": query.id})
            candidates.append({"prediction_text": chosen["answer"], "id": query.id})

    print("Evaluating on {} datapoints".format(len(references)))
    return compute_score(predictions=candidates, references=references)

In [3]:
# Build the dataset from the validation split and display one query, looked up
# by id, as a sanity check.
# NOTE(review): build_dataset presumably comes from the
# `utils.data_handling` star import -- confirm.
data_path = '../data/templama/val.json'
data = build_dataset(data_path)
data['Q68060_P54']

Gianluigi Buffon plays for _X_.
(2010)	Juventus FC
(2010)	Italy national association football team
(2011)	Juventus FC
(2011)	Italy national association football team
(2012)	Juventus FC
(2012)	Italy national association football team
(2013)	Juventus FC
(2013)	Italy national association football team
(2014)	Juventus FC
(2014)	Italy national association football team
(2015)	Juventus FC
(2015)	Italy national association football team
(2016)	Juventus FC
(2016)	Italy national association football team
(2017)	Juventus FC
(2017)	Italy national association football team
(2018)	Juventus FC
(2018)	Paris Saint-Germain
(2018)	Italy national association football team
(2019)	Juventus FC
(2019)	Paris Saint-Germain
(2020)	Juventus FC

In [8]:
# Evaluation settings shared by every evaluate() call below.
# target_mode is forwarded to query.get_relevant_target() to pick the gold answers.
target_mode = 'most_recent'
# prediction_mode is forwarded to get_prediction() to pick which prediction is scored
# (presumably the one ranked best by perplexity -- TODO confirm in utils.data_handling).
prediction_mode = 'perplexity'

## Flan-T5

In [40]:
# Tokenizer for google/flan-t5-xxl; used further down to tokenize each target
# and keep the queries whose target is a single token.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")

In [9]:
# Load the Flan-T5 predictions and display the entry for one query id.
# The displayed entry holds the query text and a 'predictions' list whose items
# carry 'answer', 'per_token_probability', 'first_token_probability' and 'perplexity'.
predictions_path = '../fm_predictions/yesi-val--google-flan-t5-xxl/predictions.json'
predictions = load_predictions(predictions_path)
predictions['Q68060_P54']

{'query': 'Gianluigi Buffon plays for',
 'predictions': [{'answer': 'Juventus',
   'per_token_probability': [0.83056640625],
   'first_token_probability': 0.83056640625,
   'perplexity': 1.203997648442093}]}

In [10]:
# Score the full validation set. evaluate() skips queries without a target for
# `target_mode` and prints how many datapoints were actually scored.
scores = evaluate(data, predictions, target_mode, prediction_mode)
print('F1: ', scores['ave_f1'])

Evaluating on 539 datapoints
F1:  7.707912903206333


In [47]:
# Restrict the evaluation to queries whose first relevant target is a single
# token under the Flan-T5 tokenizer, then re-score on that subset.
single_token_queries = dict()
for key, query in data.queries.items():
    targets = query.get_relevant_target(target_mode)
    # evaluate() skips queries that have no target for this mode; do the same
    # here instead of failing on None[0] (or on an empty list).
    if targets is None or len(targets) == 0:
        continue
    target = targets[0]
    tok_target = tokenizer(target).input_ids
    # Drop special tokens so only the target's own tokens are counted.
    tok_target = tokenizer.convert_ids_to_tokens(tok_target, skip_special_tokens=True)
    if len(tok_target) == 1:
        single_token_queries[key] = query
mydata = Queries(single_token_queries)

# Keep only the predictions that belong to the retained queries.
mypreds = {k: v for k, v in predictions.items() if k in mydata.queries}

# A bare expression in the middle of a cell is never displayed, so print the
# sizes explicitly: (all queries, single-token queries, matching predictions).
print(len(data.queries), len(mydata.queries), len(mypreds))

scores = evaluate(mydata, mypreds, target_mode, prediction_mode)
print('F1: ', scores['ave_f1'])

Evaluating on 6 datapoints
F1:  16.666666666666668
