In [30]:
# Based on http://nlp.cs.washington.edu/zeroshot/evaluate.py
import pandas as pd
import os
import codecs
import re
import string
import sys
import numpy as np

PUNCTUATION = set(string.punctuation)

import re

def remove_latin(text):
    """Return `text` with every character outside the ASCII range removed.

    Despite the name, this drops any code point above 0x7F (accented Latin
    letters, CJK, emoji, ...) and keeps plain ASCII untouched.
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

def unk_zero_re_eval(test_file, answer_file):
    """Score `answer_file` against `test_file` and return a printable report.

    The (precision, recall, F1) triple produced by `unk_read_results` is
    passed straight to `pretify`, whose formatted string is returned.
    """
    return pretify(unk_read_results(test_file, answer_file))

def unk_read_results(test_set, answer_file):
    """Read a gold file and a prediction file and compute precision/recall/F1.

    Args:
        test_set: path to a UTF-8, tab-separated gold file. The first four
            columns of each row are kept as metadata; every remaining column
            is treated as a gold answer for that row.
        answer_file: path to a UTF-8 prediction file with one prediction per
            line. Its first line is skipped (it is treated as a header row),
            and the literal prediction "no_answer" is mapped to the empty
            string.

    Returns:
        The array returned by `aprf`: (precision, recall, F1).

    Note:
        Rows are paired with `zip`, so if the two files have different
        lengths the extra rows of the longer one are silently ignored.
    """
    with codecs.open(test_set, "r", "utf-8") as fin:
        data = [line.strip().split("\t") for line in fin]
    metadata = [x[:4] for x in data]
    gold = [set(x[4:]) for x in data]

    with codecs.open(answer_file, "r", "utf-8") as fin:
        answers = [line.strip() for line in fin]

    # answers[0] is the header line; "no_answer" means the model abstained.
    new_answers = ["" if answer == "no_answer" else answer for answer in answers[1:]]

    telemetry = []
    # Pair each gold row with its prediction. The original code zipped over an
    # undefined name (`new_gold`), which only worked when a stale global of
    # that name happened to exist in the kernel.
    for m, g, a in zip(metadata, gold, new_answers):
        stats = score(g, a)
        telemetry.append([m[0], m[1], str(len(g) > 0), stats])
    return aprf(telemetry)

def parse_no_answers(results):
    """Decide, per example, whether the answer score beats the no-answer score.

    Args:
        results: a sequence whose first element is a mapping with two entries,
            "scores" and "na". Each maps an integer-like key (e.g. "0", "1",
            ...) to a comparable score.

    Returns:
        A list of booleans, ordered by the integer value of the keys, where
        each entry is True when the "scores" value is strictly greater than
        the "na" value at the same position.
    """

    def _values_in_index_order(mapping):
        # Keys are compared as integers so that "10" sorts after "2".
        return [value for _, value in sorted((int(key), value) for key, value in mapping.items())]

    p_answer = _values_in_index_order(results[0]["scores"])
    p_no_answer = _values_in_index_order(results[0]["na"])

    return [answer > no_answer for answer, no_answer in zip(p_answer, p_no_answer)]


def gb(collection, keyfunc):
    """Group `collection` by `keyfunc` and return the groups as a list.

    Args:
        collection: any iterable of items.
        keyfunc: function mapping an item to a sortable grouping key.

    Returns:
        A list of `(key, items)` tuples in ascending key order, where `items`
        is the list of elements sharing that key.
    """
    # Imported locally because the file's import block never brings
    # `groupby` into scope; without this the function raises NameError.
    from itertools import groupby

    # groupby only merges adjacent items, so the input is sorted by key first.
    return [(k, list(g)) for k, g in groupby(sorted(collection, key=keyfunc), keyfunc)]


def aprf(g):
    """Aggregate per-example counts into precision, recall and F1.

    Args:
        g: a sequence of records whose last element is a length-4 numeric
            vector `(tp, tn, sys_pos, real_pos)`, as produced by `score`.

    Returns:
        `np.array((precision, recall, f1))`. All three are 0.0 when there are
        no true positives, which includes the case where `g` is empty.
    """
    # Seeding the sum with a zero vector keeps the 4-way unpacking valid for
    # an empty `g` (a bare sum() would return the int 0 and fail to unpack).
    tp, tn, sys_pos, real_pos = sum((record[-1] for record in g), np.zeros(4))
    # a = float(tp + tn) / total
    # nr = tn / float(total - real_pos)
    # npr = tn / float(total - sys_pos)
    if tp == 0:
        p = r = f = 0.0
    else:
        p = tp / float(sys_pos)
        r = tp / float(real_pos)
        f = 2 * p * r / (p + r)
    # return np.array((a, p, r, f, npr, nr))
    return np.array((p, r, f))


def score(gold, answer):
    """Compare one predicted answer with its gold answers.

    Both sides are reduced to token sets with `simplify`; the gold answers
    are merged into a single set when at least one is present.

    Returns:
        A length-4 numpy vector of 0/1 counts:
        [0] the sets match and gold is non-empty,
        [1] the sets match and gold is empty,
        [2] the prediction is non-empty,
        [3] gold is non-empty.
    """
    if len(gold) > 0:
        gold_tokens = set.union(*(simplify(item) for item in gold))
    else:
        gold_tokens = gold
    answer_tokens = simplify(answer)

    has_gold = len(gold_tokens) > 0
    has_answer = len(answer_tokens) > 0

    counts = np.zeros(4)
    if answer_tokens == gold_tokens:
        counts[0 if has_gold else 1] += 1
    if has_answer:
        counts[2] += 1
    if has_gold:
        counts[3] += 1
    return counts


def simplify(answer):
    """Normalise an answer string into a set of comparable tokens.

    The text is stripped, lower-cased and split on whitespace; every
    character found in PUNCTUATION is removed from each token; finally the
    tokens "the", "a", "an", "and" and the empty string are discarded.
    """
    ignored = {"the", "a", "an", "and", ""}
    tokens = set()
    for raw_token in answer.strip().lower().split():
        tokens.add("".join(ch for ch in raw_token if ch not in PUNCTUATION))
    return tokens - ignored


def pretify(results):
    """Format metric values as a single labelled percentage string.

    The values in `results` are paired, in order, with the labels
    "Precision", "Recall" and "F1"; each value is multiplied by 100 and
    rendered with two decimals and a trailing "%". Entries are joined with
    " \\t " (space, tab, space).
    """
    labels = ["Precision", "Recall", "F1"]
    entries = []
    for label, value in zip(labels, results):
        percentage = "{0:.2f}%".format(value * 100)
        entries.append(": ".join((label, percentage)))
    return " \t ".join(entries)

In [38]:
def preprocess_the_prediction_files(main_path, list_of_files):
    """Extract the prediction column of each CSV into a file under /tmp/.

    For every name in `list_of_files`, the comma-separated file at
    `main_path/<name>` is read and its "predictions_str" column is written to
    `/tmp/<name>` with a header row and no index column.
    """
    for file_name in list_of_files:
        source_path = os.path.join(main_path, file_name)
        target_path = os.path.join("/tmp/", file_name)
        predictions = pd.read_csv(source_path, sep=',')["predictions_str"]
        predictions.to_csv(target_path, sep='\t', header=True, index=False)

def unk_eval_the_prediction_files(list_of_files, gold_file):
    """Score every preprocessed prediction file and pick the best by F1.

    Each name in `list_of_files` is looked up under /tmp/ and evaluated
    against `gold_file` with `unk_zero_re_eval`; the F1 value is parsed from
    the last whitespace-separated token of the report (dropping its final
    character, the "%" sign).

    Returns:
        A 4-tuple `(best_file, best_f1, f1s, file_by_f1)` where `f1s` is a
        numpy array of all F1 values in input order and `file_by_f1` maps an
        F1 value to a file name. Because the mapping is keyed by F1, files
        with an identical F1 overwrite one another and the last one wins.
    """
    file_by_f1 = {}
    f1_values = []
    for file_name in list_of_files:
        report = unk_zero_re_eval(gold_file, os.path.join("/tmp/", file_name))
        f1_token = report.split()[-1]
        f1_value = float(f1_token[0:-1])
        file_by_f1[f1_value] = file_name
        f1_values.append(f1_value)

    f1s = np.array(f1_values)
    best_f1 = max(file_by_f1.keys())
    return file_by_f1[best_f1], best_f1, f1s, file_by_f1

In [41]:
# Evaluating the dev predictions on the RE-QA dataset using the model having access to gold templates!
# For each of the 10 folds: preprocess every dev prediction file, score it
# against the fold's gold dev split, and print the file with the highest F1.
for fold_i in range(1, 11, 1):
    # Gold dev splits are numbered from 0 (dev.0 ... dev.9) while the fold
    # directories and prediction file names are numbered from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/dev.{}".format(fold_i-1)
    fold_path = "/home/saeednjf/scratch/feb-15-2022-arr/fold_{}/gold/dev_predictions/".format(fold_i)
    # One prediction file per step value 100, 200, ..., 50500.
    fold_files = ["gold_fold.{}.dev.predictions.step.{}.csv".format(fold_i, 100 * i) for i in range(1, 506, 1)]
    # Writes the prediction column of each file under /tmp/, where the
    # evaluation step below reads it back.
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, max_file, max_f1)
    print("#")

1 gold_fold.1.dev.predictions.step.800.csv 56.83
#
2 gold_fold.2.dev.predictions.step.2000.csv 65.76
#
3 gold_fold.3.dev.predictions.step.4200.csv 63.59
#
4 gold_fold.4.dev.predictions.step.1400.csv 64.44
#
5 gold_fold.5.dev.predictions.step.900.csv 67.0
#
6 gold_fold.6.dev.predictions.step.400.csv 69.45
#
7 gold_fold.7.dev.predictions.step.6100.csv 59.3
#
8 gold_fold.8.dev.predictions.step.7300.csv 64.1
#
9 gold_fold.9.dev.predictions.step.1800.csv 67.48
#
10 gold_fold.10.dev.predictions.step.4100.csv 61.41
#


In [42]:
# Evaluating the dev predictions on the RE-QA dataset using the model having access to pseudo questions.
# For each of the 10 folds: preprocess every dev prediction file, score it
# against the fold's gold dev split, and print the file with the highest F1.
for fold_i in range(1, 11, 1):
    # Gold dev splits are numbered from 0 (dev.0 ... dev.9) while the fold
    # directories and prediction file names are numbered from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/dev.{}".format(fold_i-1)
    fold_path = "/home/saeednjf/scratch/feb-15-2022-arr/fold_{}/concat/dev_predictions/".format(fold_i)
    # One prediction file per step value 100, 200, ..., 50500.
    fold_files = ["concat_fold.{}.dev.predictions.step.{}.csv".format(fold_i, 100 * i) for i in range(1, 506, 1)]
    # Writes the prediction column of each file under /tmp/, where the
    # evaluation step below reads it back.
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, max_file, max_f1)
    print("#")

1 concat_fold.1.dev.predictions.step.1700.csv 53.96
#
2 concat_fold.2.dev.predictions.step.4600.csv 64.46
#
3 concat_fold.3.dev.predictions.step.18600.csv 61.4
#
4 concat_fold.4.dev.predictions.step.1400.csv 67.12
#
5 concat_fold.5.dev.predictions.step.7900.csv 69.91
#
6 concat_fold.6.dev.predictions.step.2600.csv 64.16
#
7 concat_fold.7.dev.predictions.step.24700.csv 63.0
#
8 concat_fold.8.dev.predictions.step.13800.csv 61.08
#
9 concat_fold.9.dev.predictions.step.17700.csv 61.6
#
10 concat_fold.10.dev.predictions.step.35500.csv 57.72
#
