In [None]:
import csv
import itertools
import json
import os
import random
import subprocess

import datasets
import numpy as np

# Folder that holds one "<model>.json" predictions file per system.
PREDICTIONS_PATH = "../predictions"

# Head-to-head comparisons to prepare; each tuple is (model1, model2).
# 'reference' is not a predictions file: it is answered from the ASQA annotations.
MODEL_PAIRS = [
    ('bart_base_asqa_dpr@3', 'bart_large_asqa_dpr@3'),
    ('bart_large_asqa_dpr@3', 'bart_large_eli5_asqa_dpr@3'),
    ('t5_base_asqa_dpr@3', 't5_base_msmarco_asqa_dpr@3'),
    ('t5_base_asqa_dpr@3', 'bart_base_asqa_dpr@3'),
    ('bart_large_asqa_bm25@3', 'bart_large_asqa_dpr@3'),
    ('bart_large_asqa_bm25@1', 'bart_large_asqa_bm25@3'),
    ('bart_large_asqa_bm25@3', 'bart_large_asqa_bm25@5'),
    ('bart_large_asqa_dpr@1', 'bart_large_asqa_dpr@3'),
    ('bart_large_asqa_dpr@3', 'bart_large_asqa_dpr@5'),
    ('bart_large_eli5_asqa_dpr@3', 'reference'),
    ('dpr@1', 'bart_base_asqa_dpr@1'),
    ('dpr@1', 'bart_large_asqa_dpr@1'),
]

# Number of questions sampled for every model pair.
NUM_QUESTIONS = 4

# Read the ASQA file and keep only its 'dev' split, keyed by question id.
with open('../dataset/ASQA.json', 'r') as f:
    asqa_dataset = json.load(f)['dev']

In [None]:
def get_question_predictions(model, key):
    """Look up the answer that `model` gives for the question `key`.

    Args:
        model: Either the literal 'reference' or the stem of a JSON file
            inside PREDICTIONS_PATH.
        key: Question id, used to index the dataset / the predictions file.

    Returns:
        For 'reference', the 'long_answer' of the first annotation stored in
        `asqa_dataset` for this question. Otherwise the entry stored under
        `key` in "<PREDICTIONS_PATH>/<model>.json" (callers lower-case it, so
        it is presumably a string -- TODO confirm against the prediction files).
    """
    if model != 'reference':
        # The predictions file is re-read on every call.
        with open(f"{PREDICTIONS_PATH}/{model}.json") as predictions_file:
            return json.load(predictions_file)[key]

    # Human-written answer taken from the ASQA annotations.
    first_annotation = asqa_dataset[key]['annotations'][0]
    return first_annotation['long_answer']

def get_random_question(model1, model2):
    """Draw a random unused question for which at least one prediction hits a gold answer.

    A question qualifies when at least one of its gold short answers occurs,
    case-insensitively and as a substring, in the prediction of `model1` or in
    the prediction of `model2`. The selected question is removed from the
    module-level `asqa_dataset`, so no later call (for any model pair) can
    return it again.

    Args:
        model1: Name of the first system, or 'reference'.
        model2: Name of the second system, or 'reference'.

    Returns:
        Tuple `(question_key, prediction1, prediction2)`.

    Raises:
        ValueError: If none of the remaining questions qualifies for this pair.
    """
    # Rejection sampling without replacement. A loop (rather than recursing on
    # every miss) cannot run into the interpreter's recursion limit, and a
    # rejected question is never drawn a second time within the same call.
    candidates = list(asqa_dataset.keys())
    while candidates:
        random_question = np.random.choice(candidates, 1)[0]
        candidates.remove(random_question)

        gold_answers = {
            answer.lower()
            for qa_pair in asqa_dataset[random_question]['qa_pairs']
            for answer in qa_pair['short_answers']
        }
        prediction1 = get_question_predictions(model1, random_question)
        prediction2 = get_question_predictions(model2, random_question)
        lowered_predictions = (prediction1.lower(), prediction2.lower())

        if any(gold in text for gold in gold_answers for text in lowered_predictions):
            # Consume the question so it is not reused in another comparison.
            asqa_dataset.pop(random_question)
            return random_question, prediction1, prediction2

    raise ValueError(
        f"No remaining question has a gold answer in the predictions of {model1!r} or {model2!r}"
    )

# Write one "<model1>_vs_<model2>_setup.tsv" file per model pair.
# The output folder is created first so a fresh checkout does not fail on open().
os.makedirs("./comparisons", exist_ok=True)

for (model1, model2) in MODEL_PAIRS:
    # newline='' is what the csv module expects for files it writes: without it,
    # platforms that translate "\n" turn the writer's "\r\n" row terminator
    # into "\r\r\n".
    with open(f"./comparisons/{model1}_vs_{model2}_setup.tsv", "wt", encoding='cp1252', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['key', 'model1', 'model2', 'prediction1', 'prediction2'])
        # Sample NUM_QUESTIONS questions for this pair.
        for i in range(NUM_QUESTIONS):
            # Randomise which system is listed first; the second slot takes the other one.
            model1_random = random.choice((model1, model2))
            model2_random = model1 if model1_random == model2 else model2
            sample_id, prediction1, prediction2 = get_random_question(model1_random, model2_random)
            tsv_writer.writerow([sample_id, model1_random, model2_random, prediction1, prediction2])

In [None]:
import os

# Make files ready for drive: run preparation.py once per setup file.
for file in sorted(os.listdir("./comparisons")):
    if not file.endswith("_setup.tsv"):
        # Ignore anything that is not a setup file (e.g. notebook checkpoint folders).
        continue
    out_file = file.replace("_setup", "")
    # An argument list avoids building a shell string, so file names are
    # passed through verbatim and never interpreted by a shell.
    command = [
        "python", "preparation.py",
        "--asqa", "../dataset/ASQA.json",
        "--setup", f"./comparisons/{file}",
        "--dst", f"./comparisons_rfd/{out_file}",
    ]
    completed = subprocess.run(command)
    if completed.returncode != 0:
        # Report the failure instead of silently dropping the exit status;
        # the remaining files are still processed.
        print(f"preparation.py failed for {file} (exit code {completed.returncode})")

In [None]:
import os

os.makedirs("./results_of_comparisons", exist_ok=True)

# Run analysis.py once per setup file, pairing it with the annotation workbook under
# ./human_evaluation/<pair>/ and writing the outcome to ./results_of_comparisons/<pair>.tsv.
# NOTE(review): every entry of ./comparisons is processed; an entry without the
# "_setup.tsv" suffix keeps its full name as <pair> -- confirm the folder only holds setup files.
for file in os.listdir("./comparisons"):
    # "<model1>_vs_<model2>_setup.tsv" -> "<model1>_vs_<model2>"
    out_file = file.replace("_setup.tsv", "")
    !python analysis.py --setup ./comparisons/{file} --comparisons ./human_evaluation/{out_file}/AnnotationInterface.xlsx --dst ./results_of_comparisons/{out_file}.tsv

In [None]:
def flatten(l):
    """Collect the distinct items found in a collection of iterables.

    Args:
        l: Iterable of iterables, e.g. a list of tuples.

    Returns:
        A set holding every item that appears in any of the inner iterables.
    """
    unique_items = set()
    for group in l:
        unique_items.update(group)
    return unique_items

# Aggregate all per-pair result files into one averaged score table per model
# and store it in results.json.
MODELS = flatten(MODEL_PAIRS)
he_metrics = ["Accuracy", "Comprehensiveness", "Fluency", "Overall", "count"]
# sorted() only pins the key order of results.json; plain set order differs between runs.
results_dict = {m : {metric : 0 for metric in he_metrics} for m in sorted(MODELS)}

for file in os.listdir("./results_of_comparisons"):
    with open(f"./results_of_comparisons/{file}", "rt", encoding='cp1252') as f:
        tsv_reader = csv.reader(f, delimiter='\t')
        next(tsv_reader, None)  # skip the header row; an empty file simply yields no rows
        for row in tsv_reader:
            if not row:
                # Blank lines come back as empty rows and cannot be unpacked.
                continue
            _, _, acc1, acc2, comp, fluency, overall, model1, model2, sample_id = row

            # Accuracy is stored per side; every row counts once for each of its two models.
            for model, acc in zip([model1, model2], [acc1, acc2]):
                results_dict[model]["Accuracy"] += float(acc)
                results_dict[model]["count"] += 1

            # The three preference metrics share one scoring rule:
            # 'tie' gives both models 0.5, 'left' gives model1 a point, 'right' gives model2 a point.
            verdicts = {"Comprehensiveness": comp, "Fluency": fluency, "Overall": overall}
            for metric, verdict in verdicts.items():
                if verdict == 'tie':
                    results_dict[model1][metric] += 0.5
                    results_dict[model2][metric] += 0.5
                elif verdict == 'left':
                    results_dict[model1][metric] += 1
                elif verdict == 'right':
                    results_dict[model2][metric] += 1

# A model without any comparison row cannot be averaged (division by zero),
# so it is reported and left out of results.json.
unrated_models = [m for m in results_dict if results_dict[m]["count"] == 0]
if unrated_models:
    print(f"No comparison rows found for: {', '.join(unrated_models)} (omitted from results.json)")

# Turn the sums into per-comparison averages ("count" itself becomes 1).
results_dict = {
    m : {metric : results_dict[m][metric] / results_dict[m]["count"] for metric in he_metrics}
    for m in results_dict
    if results_dict[m]["count"] > 0
}

with open("results.json", "wt") as f:
    json.dump(results_dict, f, indent=4)

In [None]:
import pandas as pd

# results.json maps model -> metric -> average; after the transpose every row
# is a model, and the values are scaled to percentages.
df = pd.read_json("results.json").T * 100

# "count" is bookkeeping only, not a score.
df = df.drop(columns=["count"])

# Round every value to two decimals. DataFrame.round replaces the element-wise
# applymap call, which is deprecated in recent pandas releases; astype(float)
# keeps the values as floats, as the original float(...) conversion did.
df = df.astype(float).round(2)

print(df.sort_values(by=['Overall'], ascending=False))

In [None]:
# Automated-metric scores per system, hard-coded as literals (system -> metric -> value).
# There is no entry for 'reference', so it takes no part in the correlations below.
# NOTE(review): the provenance of these numbers is not visible in this notebook --
# presumably copied from the automatic evaluation runs; confirm before editing.
model_automated = {
    "dpr@1":{
        "rougeLsum": 31.375407192067286,
        "length": 103.46729957805907,
        "str_em": 29.347749648382564,
        "QA-EM": 14.418073136427568,
        "QA-F1": 17.534081701452774,
        "QA-Hit": 3.5864978902953584,
        "ovscore": 23.4550411025446
    },
    "bart_large_asqa_dpr@1":{
        "rougeLsum": 34.17367623514718,
        "length": 54.69409282700422,
        "str_em": 25.97573839662447,
        "QA-EM": 14.349507735583686,
        "QA-F1": 18.122323894945254,
        "QA-Hit": 3.059071729957806,
        "ovscore": 24.885868066320903
    },
    "bart_large_asqa_dpr@3": {
        "rougeLsum": 36.42172293222454,
        "length": 62.45675105485232,
        "str_em": 29.952531645569618,
        "QA-EM": 16.547116736990155,
        "QA-F1": 20.77815216309937,
        "QA-Hit": 3.9029535864978904,
        "ovscore": 27.509563812027398
    },
    "bart_large_asqa_dpr@5": {
        "rougeLsum": 36.61926015496989,
        "length": 63.333333333333336,
        "str_em": 29.755625879043603,
        "QA-EM": 16.264064697609,
        "QA-F1": 20.53591318397746,
        "QA-Hit": 3.481012658227848,
        "ovscore": 27.4227997732534
    },
    "bart_large_asqa_bm25@1":{  
        "rougeLsum": 31.81947228874431,
        "length": 53.792194092827,
        "str_em": 16.654360056258792,
        "QA-EM": 8.649789029535864,
        "QA-F1": 11.384725834565225,
        "QA-Hit": 1.7932489451476792,
        "ovscore": 19.03302309692025
    },
    "bart_large_asqa_bm25@3":{
        "rougeLsum": 33.43383550706495,
        "length": 59.677215189873415,
        "str_em": 20.625879043600566,
        "QA-EM": 10.114275668073136,
        "QA-F1": 13.518431641657038,
        "QA-Hit": 2.2151898734177213,
        "ovscore": 21.259657095556914
    },
    "bart_large_asqa_bm25@5": {
         "rougeLsum": 33.80983814622092,
        "length": 58.994725738396625,
        "str_em": 21.81082981715893,
        "QA-EM": 11.031997187060478,
        "QA-F1": 14.552247321904103,
        "QA-Hit": 2.7426160337552745,
        "ovscore": 22.181278741708166
    },
    "bart_large_eli5_asqa_dpr@3":{
        "rougeLsum": 36.64911989014963,
        "length": 59.71413502109704,
        "str_em": 30.253164556962027,
        "QA-EM": 16.51898734177215,
        "QA-F1": 20.98682683459458,
        "QA-Hit": 3.2700421940928273,
        "ovscore": 27.733530838587182
    },
    "t5_base_asqa_dpr@3": {
        "rougeLsum": 33.652080108780105,
        "length": 57.82805907172996,
        "str_em": 26.41350210970464,
        "QA-EM": 13.959212376933896,
        "QA-F1": 17.777224882873334,
        "QA-Hit": 2.7426160337552745,
        "ovscore": 24.45895737496291
    },
    "t5_base_msmarco_asqa_dpr@3": {
        "rougeLsum": 33.68414993414289,
        "length": 55.979957805907176,
        "str_em": 26.042545710267227,
        "QA-EM": 14.228199718706048,
        "QA-F1": 18.058298780795056,
        "QA-Hit": 2.848101265822785,
        "ovscore": 24.66330155692563
    },
    "bart_base_asqa_dpr@1":{
        "rougeLsum": 33.06625765989156,
        "length": 52.391350210970465,
        "str_em": 24.226441631504926,
        "QA-EM": 12.594936708860757,
        "QA-F1": 16.07544034144928,
        "QA-Hit": 2.320675105485232,
        "ovscore": 23.055469032890592
    },
    "bart_base_asqa_dpr@3": {
        "rougeLsum": 33.88441978057487,
        "length": 57.11708860759494,
        "str_em": 25.149437412095637,
        "QA-EM": 12.645921237693392,
        "QA-F1": 16.532482741404735,
        "QA-Hit": 2.848101265822785,
        "ovscore": 23.66840901338464
    }
}

# `df` is whatever frame the name currently refers to; in a top-to-bottom run that is
# the rounded human-evaluation table (one row per model).
# NOTE(review): a later cell reassigns `df`, so re-running this cell out of order
# silently picks up a different frame.
model_human = df
# Transpose so that rows are systems and columns are metrics.
automated_df = pd.DataFrame(model_automated).T
human_df = pd.DataFrame(model_human)


In [None]:
# Pearson correlation of every human metric with every automated metric,
# stored as correlations[human_metric][automated_metric]. Series.corr aligns
# the two columns on their index (the system names) before correlating.
correlations = {
    human_stat: {
        automated_stat: human_df[human_stat].corr(automated_df[automated_stat], method='pearson')
        for automated_stat in automated_df.columns
    }
    for human_stat in human_df.columns
}

# Rows: human metrics, columns: automated metrics (three of them hidden from the view).
correlation_table = pd.DataFrame(correlations).T
correlation_table.drop(columns=["length", "QA-EM", "QA-Hit"])

In [None]:
# Create pairwise comparisons table: one JSON file per result file, holding the
# per-comparison averages of the two systems named in the file name.
os.makedirs("pairwise_results", exist_ok=True)

for file in os.listdir("./results_of_comparisons"):
    # "<model1>_vs_<model2>.tsv" -> "<model1>_vs_<model2>"
    pair_name = file.replace(".tsv", "")
    pairwise_results_dict = {k : {metric : 0 for metric in he_metrics} for k in pair_name.split("_vs_")}

    with open(f"./results_of_comparisons/{file}", "rt", encoding='cp1252') as f:
        tsv_reader = csv.reader(f, delimiter='\t')
        next(tsv_reader, None)  # skip the header row; an empty file simply yields no rows
        for row in tsv_reader:
            if not row:
                # Blank lines come back as empty rows and cannot be unpacked.
                continue
            _, _, acc1, acc2, comp, fluency, overall, model1, model2, sample_id = row

            # Accuracy is stored per side; every row counts once for each of its two models.
            for model, acc in zip([model1, model2], [acc1, acc2]):
                pairwise_results_dict[model]["Accuracy"] += float(acc)
                pairwise_results_dict[model]["count"] += 1

            # 'tie' gives both models 0.5, 'left' gives model1 a point, 'right' gives model2 a point.
            verdicts = {"Comprehensiveness": comp, "Fluency": fluency, "Overall": overall}
            for metric, verdict in verdicts.items():
                if verdict == 'tie':
                    pairwise_results_dict[model1][metric] += 0.5
                    pairwise_results_dict[model2][metric] += 0.5
                elif verdict == 'left':
                    pairwise_results_dict[model1][metric] += 1
                elif verdict == 'right':
                    pairwise_results_dict[model2][metric] += 1

    if any(scores["count"] == 0 for scores in pairwise_results_dict.values()):
        # Nothing to average (division by zero otherwise); skip this pair.
        print(f"No comparison rows found in {file}; no pairwise result written")
        continue

    # Turn the sums into per-comparison averages ("count" itself becomes 1).
    pairwise_results_dict = {k: {k2: v2 / v["count"] for k2, v2 in v.items()} for k, v in pairwise_results_dict.items()}

    # The input file is already closed here; the output gets its own handle.
    with open(f"pairwise_results/{pair_name}.json", "wt") as out_f:
        json.dump(pairwise_results_dict, out_f, indent=4)
    

In [None]:
import json

# Show every pairwise result as a small table (rows: the two systems, columns: metrics).
# NOTE(review): this reuses the name `df`, which an earlier cell aliases as the
# human-evaluation table; re-running that earlier cell afterwards picks up this frame.
for file in os.listdir("./pairwise_results"):
    if not file.endswith(".json"):
        # Ignore anything that is not a result file (e.g. notebook checkpoint folders).
        continue
    df = pd.read_json(f"./pairwise_results/{file}").T
    # "count" is bookkeeping only, not a score.
    df = df.drop(columns=["count"])
    # Round to two decimals. DataFrame.round replaces the element-wise applymap
    # call, which is deprecated in recent pandas releases; astype(float) keeps
    # the values as floats, as the original float(...) conversion did.
    df = df.astype(float).round(2)
    display(df)
