In [None]:
import csv
import itertools
import json
import os
import random
import subprocess

import datasets
import numpy as np

# --- Experiment configuration ---

# Folder holding one `<model>.json` prediction file per evaluated system.
PREDICTIONS_PATH = "../predictions"

# Each tuple is one pair of systems that gets its own comparison sheet.
# The special name 'reference' refers to the dataset's own long answer
# rather than to a prediction file.
MODEL_PAIRS = [
    ('bart_base_asqa_dpr@3', 'bart_large_asqa_dpr@3'),
    ('bart_large_asqa_dpr@3', 'bart_large_eli5_asqa_dpr@3'),
    ('t5_base_asqa_dpr@3', 't5_base_msmarco_asqa_dpr@3'),
    ('t5_base_asqa_dpr@3', 'bart_base_asqa_dpr@3'),
    ('bart_large_asqa_bm25@3', 'bart_large_asqa_dpr@3'),
    ('bart_large_asqa_bm25@1', 'bart_large_asqa_bm25@3'),
    ('bart_large_asqa_bm25@3', 'bart_large_asqa_bm25@5'),
    ('bart_large_asqa_dpr@1', 'bart_large_asqa_dpr@3'),
    ('bart_large_asqa_dpr@3', 'bart_large_asqa_dpr@5'),
    ('bart_large_eli5_asqa_dpr@3', 'reference'),
    ('dpr@1', 'bart_base_asqa_dpr@1'),
    ('dpr@1', 'bart_large_asqa_dpr@1'),
]

# Number of questions sampled for every model pair.
NUM_QUESTIONS = 4

# Parse the ASQA file and keep only its 'dev' split (a dict keyed by question id).
with open('../dataset/ASQA.json', 'r') as dataset_file:
    asqa_dataset = json.load(dataset_file)['dev']

In [None]:
# Parsed prediction files keyed by model name, so that each file is read from
# disk at most once instead of on every single lookup.
_PREDICTIONS_CACHE = {}


def _load_predictions(model):
    """Return the parsed `<PREDICTIONS_PATH>/<model>.json`, reading it on first use only."""
    if model not in _PREDICTIONS_CACHE:
        with open(f"{PREDICTIONS_PATH}/{model}.json") as f:
            _PREDICTIONS_CACHE[model] = json.load(f)
    return _PREDICTIONS_CACHE[model]


def get_question_predictions(model, key):
    """Return the answer text that `model` produced for the question `key`.

    Args:
        model: Name of a prediction file (without the `.json` suffix) inside
            `PREDICTIONS_PATH`, or the special value 'reference'.
        key: Question id; must be a key of `asqa_dataset` (for 'reference')
            or of the model's prediction file.

    Returns:
        For 'reference', the `long_answer` of the first annotation stored in
        `asqa_dataset`; otherwise the entry stored under `key` in the
        model's prediction file.

    Raises:
        KeyError: If `key` is missing from the dataset / prediction file.
        FileNotFoundError: If the model has no prediction file.
    """
    if model == 'reference':
        # Get long answer from ASQA dataset
        return asqa_dataset[key]['annotations'][0]['long_answer']

    return _load_predictions(model)[key]

def get_random_question(model1, model2):
    """Pick a random, not yet used question that at least one model answers usefully.

    A question qualifies when at least one of its gold short answers occurs
    (case-insensitively, as a substring) in the prediction of `model1` or of
    `model2`. The chosen question is removed from `asqa_dataset`, so later
    calls can never return it again.

    Candidates are visited in a random order instead of being re-drawn
    recursively: the result is still uniform over the qualifying questions,
    but the search always terminates and cannot exceed the recursion limit.

    Args:
        model1: Name of the first system (see `get_question_predictions`).
        model2: Name of the second system.

    Returns:
        Tuple `(question_key, prediction1, prediction2)`.

    Raises:
        ValueError: If no remaining question qualifies for this pair.
    """
    candidate_keys = list(asqa_dataset.keys())
    np.random.shuffle(candidate_keys)

    for question_key in candidate_keys:
        # Collect every gold short answer of every disambiguated QA pair.
        gold_answers = {
            answer.lower()
            for qa_pair in asqa_dataset[question_key]['qa_pairs']
            for answer in qa_pair['short_answers']
        }
        prediction1 = get_question_predictions(model1, question_key)
        prediction2 = get_question_predictions(model2, question_key)
        lowered_predictions = (prediction1.lower(), prediction2.lower())

        # Find question which predictions have at least one gold answer in them
        if any(gold in text for gold in gold_answers for text in lowered_predictions):
            asqa_dataset.pop(question_key)
            return question_key, prediction1, prediction2

    raise ValueError(
        f"No remaining question has a gold answer in the predictions of "
        f"{model1!r} or {model2!r}"
    )

# Write one `<model1>_vs_<model2>_setup.tsv` sheet per model pair.
# open() does not create missing folders, so make sure the target exists first.
os.makedirs("./comparisons", exist_ok=True)

for (model1, model2) in MODEL_PAIRS:
    # newline='' is required by the csv module; without it every row ends in
    # '\r\r\n' on Windows and readers see spurious empty rows.
    # NOTE(review): cp1252 cannot encode every Unicode character, so a
    # prediction containing one raises UnicodeEncodeError. The encoding is kept
    # because the scripts consuming these files are not visible here.
    with open(f"./comparisons/{model1}_vs_{model2}_setup.tsv", "wt", encoding='cp1252', newline='') as out_file:
        # write to tsv file
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['key', 'model1', 'model2', 'prediction1', 'prediction2'])
        # select random questions from dataset
        for _ in range(NUM_QUESTIONS):
            # shuffle models so they are in random order: pick one at random
            # for the first column, the other one takes the second column
            model1_random = random.choice((model1, model2))
            model2_random = model1 if model1_random == model2 else model2
            # Each returned question is removed from asqa_dataset, so no
            # question is reused across rows or across model pairs.
            sample_id, prediction1, prediction2 = get_random_question(model1_random, model2_random)
            tsv_writer.writerow([sample_id, model1_random, model2_random, prediction1, prediction2])

In [None]:
import os

# Make files ready for drive: run preparation.py once per setup sheet.
# The destination folder is created up front in case the script does not do so.
os.makedirs("./comparisons_rfd", exist_ok=True)

for file in sorted(os.listdir("./comparisons")):
    # Only the setup sheets are valid inputs; skip anything else in the folder.
    if not file.endswith("_setup.tsv"):
        continue
    out_file = file.replace("_setup", "")
    # An argument list (no shell) passes the file names verbatim, whatever
    # characters they contain.
    completed = subprocess.run([
        "python", "preparation.py",
        "--asqa", "../dataset/ASQA.json",
        "--setup", f"./comparisons/{file}",
        "--dst", f"./comparisons_rfd/{out_file}",
    ])
    # Report failures instead of silently discarding the exit status.
    if completed.returncode != 0:
        print(f"preparation.py failed for {file} (exit code {completed.returncode})")

In [None]:
import os

# Run analysis.py once per setup sheet and store its output in the results
# folder. The --dst path must be the very folder created here (it was
# previously misspelled "resuts_of_comparisons").
results_dir = "./results_of_comparisons"
os.makedirs(results_dir, exist_ok=True)

for file in sorted(os.listdir("./comparisons")):
    # Only the setup sheets are valid inputs; skip anything else in the folder.
    if not file.endswith("_setup.tsv"):
        continue
    out_file = file.replace("_setup.tsv", "")
    completed = subprocess.run(
        [
            "python", "analysis.py",
            "--setup", f"./comparisons/{file}",
            "--comparisons", f"./human_evaluation/{out_file}/AnnotationInterface.xlsx",
            "--dst", f"{results_dir}/{out_file}.tsv",
        ],
        capture_output=True,
        text=True,
        errors="replace",
    )
    # Echo the script's output into the notebook, as the `!python` line did.
    print(completed.stdout, end="")
    if completed.returncode != 0:
        print(completed.stderr, end="")
        print(f"analysis.py failed for {file} (exit code {completed.returncode})")

In [None]:
def flatten(l):
    """Collapse an iterable of iterables (e.g. a list of tuples) into a set of its items."""
    return set(itertools.chain.from_iterable(l))

# Aggregate the per-pair human judgements into one averaged score per model.
MODELS = flatten(MODEL_PAIRS)
he_metrics = ["Accuracy", "Comprehensiveness", "Fluency", "Overall", "count"]
# sorted() makes the key order (and therefore results.json) deterministic;
# plain set iteration order can change from run to run.
results_dict = {m : {metric : 0 for metric in he_metrics} for m in sorted(MODELS)}

# List and read the SAME folder (the listing previously used the misspelled
# "resuts_of_comparisons" while the files were opened from this one).
results_dir = "./results_of_comparisons"
for file in sorted(os.listdir(results_dir)):
    with open(f"{results_dir}/{file}", "rt", encoding='cp1252', newline='') as f:
        tsv_reader = csv.reader(f, delimiter='\t')
        next(tsv_reader, None)  # skip the header row (tolerates an empty file)
        for row in tsv_reader:
            if not row:
                continue  # ignore blank lines
            # NOTE(review): columns 3 and 4 are assumed to be the accuracy of
            # model1 and model2 respectively (both were unpacked into `acc1`
            # before, so the first value was lost) - confirm against the
            # column order written by analysis.py.
            _, _, acc1, acc2, comp, fluency, overall, model1, model2, sample_id = row

            results_dict[model1]["Accuracy"] += float(acc1)
            results_dict[model2]["Accuracy"] += float(acc2)

            # Pairwise verdicts: the winner gets 1 point, a tie gives both
            # sides 0.5, any other value gives no points.
            for metric, verdict in (
                ("Comprehensiveness", comp),
                ("Fluency", fluency),
                ("Overall", overall),
            ):
                if verdict == 'left':
                    results_dict[model1][metric] += 1
                elif verdict == 'right':
                    results_dict[model2][metric] += 1
                elif verdict == 'tie':
                    results_dict[model1][metric] += 0.5
                    results_dict[model2][metric] += 0.5

            results_dict[model1]["count"] += 1
            results_dict[model2]["count"] += 1

# Turn the sums into per-comparison averages. A model without any judged row
# has nothing to average (and would divide by zero), so it is left out.
averaged_results = {}
for m in sorted(MODELS):
    n_rows = results_dict[m]["count"]
    if n_rows == 0:
        print(f"No comparison rows found for {m}; leaving it out of results.json")
        continue
    averaged_results[m] = {metric : results_dict[m][metric] / n_rows for metric in he_metrics}
results_dict = averaged_results

with open("results.json", "wt") as f:
    json.dump(results_dict, f, indent=4)

In [None]:
import pandas as pd

# Load the averaged human scores (one row per model), express them as
# percentages, discard the helper "count" column and keep two decimals.
df = (
    pd.read_json("results.json").T.mul(100)
    .drop(columns=["count"])
    .applymap(lambda value: float(f"{value:.2f}"))
)

# Shown as the cell output: best "Overall" score first (df itself stays unsorted).
df.sort_values(by=['Overall'], ascending=False)

In [None]:
# Hard-coded automated evaluation scores, keyed by model name and then by
# metric name.
# NOTE(review): where these numbers were produced is not visible in this
# notebook - presumably copied from a separate automatic evaluation run; confirm.
# NOTE(review): "bart_large_asqa_dpr@3" has no "length" entry, unlike the other
# two models, so that cell is NaN in `automated_df` - confirm this is intended.
model_automated = {
    "bart_large_asqa_dpr@3": {
        "rougeLsum": 36.42172293222454,
        "str_em": 29.952531645569618,
        "QA-EM": 16.547116736990155,
        "QA-F1": 20.77815216309937,
        "QA-Hit": 3.9029535864978904,
        "ovscore": 27.509563812027398
    },
    "t5_base_asqa_dpr@3": {
        "rougeLsum": 33.652080108780105,
        "length": 57.82805907172996,
        "str_em": 26.41350210970464,
        "QA-EM": 13.959212376933896,
        "QA-F1": 17.777224882873334,
        "QA-Hit": 2.7426160337552745,
        "ovscore": 24.45895737496291
    },
    "bart_base_asqa_dpr@3": {
        "rougeLsum": 33.88441978057487,
        "length": 57.11708860759494,
        "str_em": 25.149437412095637,
        "QA-EM": 12.645921237693392,
        "QA-F1": 16.532482741404735,
        "QA-Hit": 2.848101265822785,
        "ovscore": 23.66840901338464
    }
}

# `df` is the human-evaluation frame built in an earlier cell (models as rows).
model_human = df
# Transpose so that models are the rows and automated metrics the columns.
automated_df = pd.DataFrame(model_automated).T
human_df = pd.DataFrame(model_human)


In [None]:
# Pearson correlation of every human metric with every automated metric,
# stored as {human metric: {automated metric: correlation}}.
correlations = {
    human_stat: {
        automated_stat: human_df[human_stat].corr(automated_df[automated_stat], method='pearson')
        for automated_stat in automated_df.columns
    }
    for human_stat in human_df.columns
}

# Displayed as the cell output: human metrics as rows, automated metrics as columns.
pd.DataFrame(correlations).T