In [None]:
from tqdm.notebook import tqdm
import pandas
import json
import collections
import numpy as np

from scorers import score_all_rouge, score_semqa_f1, score_semqa_short_recall

In [None]:
TEST_SET_PATH = '../v1/test.jsonl'

# Group the test-set records by question id. A qid may occur on several
# lines of the file; every occurrence is kept, in file order.
examples_by_qid = {}
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(TEST_SET_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            continue
        example = json.loads(line)
        examples_by_qid.setdefault(example['qid'], []).append(example)

# Parallel lists, all in first-seen qid order (dicts preserve insertion order).
examples = list(examples_by_qid.values())
qids = list(examples_by_qid.keys())
# For every qid: the 'summary' and 'covered_short_answers' fields of each of
# its records.
all_targets = [[ex['summary'] for ex in group] for group in examples]
all_short_targets = [[ex['covered_short_answers'] for ex in group] for group in examples]

In [None]:
# Initialize the predictions dictionary: model name -> list of predictions.
# Later cells add one entry per evaluated model.
predictions_by_model = {}

def get_predictions(path):
    """Load model predictions from a JSON-lines file, ordered like `qids`.

    Args:
        path: Path to a JSONL file in which every non-blank line is an object
            with 'qid' and 'prediction' keys.

    Returns:
        A list holding, for each entry of the notebook-level `qids` list and
        in the same order, the prediction stored for that qid. If a qid
        occurs more than once in the file, the last occurrence wins.

    Raises:
        KeyError: If a qid from `qids` has no prediction in the file.
    """
    predictions_by_qid = {}
    # Open the file named by the argument rather than a notebook-level
    # constant, so the function can load predictions for several models.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            example = json.loads(line)
            predictions_by_qid[example['qid']] = example['prediction']

    return [predictions_by_qid[qid] for qid in qids]

In [None]:
# Which predictions file to evaluate, and the label under which the model's
# scores are stored in `predictions_by_model`.
# NOTE(review): "PREDICTIOS" is a misspelling of "PREDICTIONS"; renaming it
# requires updating every cell that references the constant by name.
PREDICTIOS_PATH = '../predictions.jsonl' # Assumes that the file has json lines with 'qid' and 'prediction'.
MODEL_NAME = 'Flan T5 base'

predictions_by_model[MODEL_NAME] = get_predictions(PREDICTIOS_PATH)

# Sanity check: the loaded predictions line up one-to-one with the targets.
assert len(predictions_by_model[MODEL_NAME]) == len(all_targets)

In [None]:
# Metric name -> list of scores, one score per model, appended in the
# iteration order of `predictions_by_model`.
metrics = collections.defaultdict(list)
for model_name, predictions in tqdm(predictions_by_model.items()):
    # Only the first element of score_all_rouge's return value is used.
    results = score_all_rouge(all_targets, predictions, bootstrap=True)[0]
    for metric_name, metric_stats in results.items():
        # Keep element 0 of each ROUGE entry.
        # NOTE(review): presumably the point estimate of the bootstrap --
        # confirm against scorers.score_all_rouge.
        metrics[metric_name].append(metric_stats[0])
    metrics['Sem-F1'].append(score_semqa_f1(all_targets, predictions, examples))
    metrics['Sem-REC'].append(score_semqa_short_recall(all_short_targets, predictions))

# Show the scores as a table with one row per model.
pandas.DataFrame(metrics, index=list(predictions_by_model))

In [None]:
# Drop the ROUGE-1 and ROUGE-2 entries from the collected metrics.
# pop(key, None) is used instead of `del` so that re-running this cell after
# the keys are already gone does not raise KeyError; the loop form keeps the
# cell from displaying the popped value as its output.
for dropped_metric in ('rouge1', 'rouge2'):
    metrics.pop(dropped_metric, None)

In [None]:
# Results table: one row per model, one column per metric left in `metrics`.
# NOTE(review): the name `pd` shadows the conventional alias for pandas; it is
# kept because a later cell refers to this DataFrame by that name.
pd = pandas.DataFrame(metrics, index=list(predictions_by_model))

# SEMQA is the geometric mean of the Sem-F1 and rougeLsum columns.
pd['SEMQA'] = np.sqrt(pd['Sem-F1'].mul(pd['rougeLsum']))
pd

In [None]:
# Render the results table as LaTeX source, formatting floats with two
# decimal places; the resulting string is this cell's output.
pd.to_latex(float_format="%.2f")