In [None]:
import json

# Load the model predictions; `preds` is later indexed by each sample's 'sample_id'.
# JSON is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open('t5_ft_closed_book_predictions.json', encoding='utf-8') as f:
    preds = json.load(f)

In [None]:
from datasets import load_dataset

# Load the 'dev' split of the 'din0s/asqa' dataset; each sample is later read
# through its 'sample_id' and 'qa_pairs' fields.
asqa = load_dataset('din0s/asqa', split='dev')

In [None]:
import itertools, nltk
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))


def _ratio(count, total):
    """Return ``count / total``, or ``None`` when ``total`` is 0 (ratio undefined)."""
    return count / total if total else None


def _sample_stats(model_output, qa_pairs, stopwords):
    """Compute uniqueness and gold-token statistics for one model output.

    Args:
        model_output: the predicted answer text for a sample.
        qa_pairs: iterable of mappings, each with a 'short_answers' list of strings.
        stopwords: set of lowercase words excluded from the output tokens.

    Returns:
        A dict with
        - 'pct_unique': distinct / total non-stopword output tokens,
        - 'pct_gold': tokens of the gold answers found in the output, divided by
          the number of non-stopword output tokens,
        - 'pct_gold_na': same as 'pct_gold', but ``None`` when it is 0 or undefined.
        Ratios are ``None`` when the output has no non-stopword tokens, instead
        of raising ``ZeroDivisionError``.
    """
    # lowercase model output
    model_output = model_output.lower()
    # remove stopwords; split() (rather than split(" ")) splits on any whitespace
    # and never yields empty-string tokens for repeated/leading/trailing spaces
    filtered_output = [w for w in model_output.split() if w not in stopwords]
    n_tokens = len(filtered_output)
    # fraction of unique words among the remaining tokens
    pct_unique = _ratio(len(set(filtered_output)), n_tokens)

    # flatten gold answers of all QA pairs into a single de-duplicated set
    gold_answers = set(itertools.chain.from_iterable(p['short_answers'] for p in qa_pairs))
    # tokens of every gold answer that occurs (case-insensitive substring match)
    # in the model output
    gold_in_output = [
        token
        for gold in gold_answers
        if gold.lower() in model_output
        for token in gold.split()
    ]
    # NOTE(review): gold tokens are counted with their stopwords while the
    # denominator excludes stopwords, so this ratio can exceed 1 -- confirm
    # that this is the intended definition.
    pct_gold = _ratio(len(gold_in_output), n_tokens)

    return {
        'pct_unique': pct_unique,
        'pct_gold': pct_gold,
        # None (-> NaN in pandas) so that zero/undefined samples can be skipped
        'pct_gold_na': pct_gold if pct_gold else None,
    }


# per-sample statistics keyed by sample_id
stats = {}
for sample in asqa:
    stats[sample['sample_id']] = _sample_stats(
        preds[sample['sample_id']], sample['qa_pairs'], stopwords
    )

In [None]:
import pandas as pd

# One row per sample_id, one column per statistic.
df = pd.DataFrame.from_dict(stats, orient='index')
# Column-wise means expressed as percentages (pandas skips NaN entries by default).
df.mean(axis=0).mul(100)