In [4]:
# Based on http://nlp.cs.washington.edu/zeroshot/evaluate.py
import pandas as pd
import os
import codecs
import re
import string
import sys
import numpy as np

PUNCTUATION = set(string.punctuation)

import re

def remove_latin(text):
    """Return ``text`` with every character outside the ASCII range removed.

    Despite the name, nothing Latin-specific is targeted: any code point
    above ``\\x7f`` is dropped and everything else is kept unchanged.
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

def unk_zero_re_eval(test_file, answer_file):
    """Score ``answer_file`` against ``test_file`` and format the result.

    Args:
        test_file: path of the tab-separated gold file.
        answer_file: path of the predictions file (one answer per line,
            preceded by a header line).

    Returns:
        The string produced by ``pretify`` for the (precision, recall, F1)
        array returned by ``unk_read_results``.
    """
    return pretify(unk_read_results(test_file, answer_file))

def unk_read_results(test_set, answer_file):
    """Read gold rows and predicted answers, score them, and aggregate.

    Args:
        test_set: path of a UTF-8, tab-separated file. The first four
            fields of each line are kept as metadata; every remaining
            field is treated as a gold answer.
        answer_file: path of a UTF-8 file whose first line is skipped and
            whose remaining lines are the predicted answers, in the same
            order as the rows of ``test_set``.

    Returns:
        The array returned by ``aprf`` over all scored rows.
    """
    metadata, gold = [], []
    with codecs.open(test_set, "r", "utf-8") as gold_in:
        for raw_line in gold_in:
            fields = raw_line.strip().split("\t")
            metadata.append(fields[:4])
            gold.append(set(fields[4:]))

    with codecs.open(answer_file, "r", "utf-8") as pred_in:
        answers = [raw_line.strip() for raw_line in pred_in]

    # Skip the first line of the answer file; the "no_answer" sentinel is
    # scored as an empty prediction.
    new_answers = ["" if answer == "no_answer" else answer for answer in answers[1:]]

    # zip() stops at the shortest input, so surplus gold rows or surplus
    # answers are silently ignored.
    telemetry = [
        [meta[0], meta[1], str(len(golds) > 0), score(golds, answer)]
        for meta, golds, answer in zip(metadata, gold, new_answers)
    ]
    return aprf(telemetry)

def parse_no_answers(results):
    """Decide, per example, whether the answer score beats the no-answer score.

    Args:
        results: sequence whose first element is a mapping with the keys
            ``"scores"`` and ``"na"``. Each of those is a mapping from an
            integer-like key (anything ``int()`` accepts) to a comparable
            score.

    Returns:
        A list of booleans, ordered by ascending integer key, that is True
        where the ``"scores"`` value is strictly greater than the ``"na"``
        value at the same position. The list is as long as the shorter of
        the two mappings.
    """

    def _values_by_numeric_key(mapping):
        # Keys are converted with int() so that "10" sorts after "2".
        return [value for _, value in sorted((int(key), value) for key, value in mapping.items())]

    p_answer = _values_by_numeric_key(results[0]["scores"])
    p_no_answer = _values_by_numeric_key(results[0]["na"])

    return [answer > no_answer for answer, no_answer in zip(p_answer, p_no_answer)]


def gb(collection, keyfunc):
    """Group the items of ``collection`` by ``keyfunc``.

    Args:
        collection: iterable of items to group.
        keyfunc: function mapping an item to its (sortable) group key.

    Returns:
        A list of ``(key, items)`` tuples ordered by key, where ``items``
        is the list of elements sharing that key.
    """
    # The visible part of the file never imports groupby, so it is imported
    # here to keep the function from failing with a NameError.
    from itertools import groupby

    # groupby only merges adjacent items, hence the sort on the same key.
    ordered = sorted(collection, key=keyfunc)
    return [(key, list(members)) for key, members in groupby(ordered, keyfunc)]


def aprf(g):
    """Aggregate per-example counts into precision, recall and F1.

    Args:
        g: sized collection of rows whose last element is a length-4 numeric
            array ``[tp, tn, sys_pos, real_pos]`` (the layout produced by
            ``score``).

    Returns:
        ``np.array((precision, recall, f1))`` as fractions in ``[0, 1]``.
        All three are 0.0 when there are no true positives or when ``g`` is
        empty. Accuracy and the negative-class metrics are not computed.
    """
    # With no rows, sum() would return the int 0, which cannot be unpacked
    # into four counts; report zero scores instead.
    if len(g) == 0:
        return np.array((0.0, 0.0, 0.0))

    tp, _tn, sys_pos, real_pos = sum(row[-1] for row in g)
    if tp == 0:
        p = r = f = 0.0
    else:
        p = tp / float(sys_pos)
        r = tp / float(real_pos)
        f = 2 * p * r / (p + r)
    return np.array((p, r, f))


def score(gold, answer):
    """Compare one predicted answer with its gold answers.

    Both sides are reduced to token sets with ``simplify``; the gold side is
    the union of the token sets of all gold answers. The prediction counts
    as correct only when the two token sets are equal.

    Args:
        gold: collection of gold answer strings (may be empty).
        answer: predicted answer string.

    Returns:
        A float array of four 0/1 flags:
        ``[true positive, true negative, system answered, gold has answer]``.
    """
    # An empty gold collection is compared as-is, without simplification.
    gold_tokens = set.union(*[simplify(g) for g in gold]) if len(gold) > 0 else gold
    answer_tokens = simplify(answer)

    matches = answer_tokens == gold_tokens
    # Measured after simplification, so gold answers made only of dropped
    # tokens count as "no gold answer".
    has_gold = len(gold_tokens) > 0
    has_answer = len(answer_tokens) > 0

    return np.array([
        1.0 if (matches and has_gold) else 0.0,
        1.0 if (matches and not has_gold) else 0.0,
        1.0 if has_answer else 0.0,
        1.0 if has_gold else 0.0,
    ])


def simplify(answer):
    """Normalise an answer string into a set of comparison tokens.

    The string is stripped, lower-cased and split on whitespace; every
    character found in ``PUNCTUATION`` is removed from each token; the
    tokens "the", "a", "an", "and" and the empty string are discarded.

    Args:
        answer: the answer text.

    Returns:
        The set of remaining tokens (possibly empty).
    """
    dropped = {"the", "a", "an", "and", ""}
    tokens = set()
    for word in answer.strip().lower().split():
        cleaned = "".join(ch for ch in word if ch not in PUNCTUATION)
        if cleaned not in dropped:
            tokens.add(cleaned)
    return tokens


def pretify(results):
    """Format (precision, recall, F1) fractions as one labelled line.

    Args:
        results: iterable of up to three fractions, in the order
            precision, recall, F1. Extra values are ignored.

    Returns:
        A string such as ``"Precision: 50.00% \\t Recall: 25.00% \\t F1: 33.33%"``
        where each value is multiplied by 100 and shown with two decimals.
    """
    labels = ["Precision", "Recall", "F1"]
    parts = []
    for label, value in zip(labels, results):
        parts.append(label + ": " + "{0:.2f}%".format(value * 100))
    return " \t ".join(parts)

In [2]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Device the scoring model is moved to; hard-coded, so this cell needs a CUDA device.
device = "cuda"
# Name of the pretrained checkpoint used to score generated questions.
model_id = "gpt2-large"
# `model`, `tokenizer` and `device` are read as globals by the perplexity functions.
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [5]:
import torch
def _mean_perplexity(texts):
    """Return the mean of the per-text perplexities of ``texts``.

    Each text is tokenized with the global ``tokenizer`` and scored by the
    global ``model`` on ``device`` with the input ids reused as labels; the
    first model output is exponentiated to get that text's perplexity.
    The result is a 0-dim tensor on ``device``.
    """
    ppls = []
    for text in texts:
        encodings = tokenizer(text, return_tensors="pt")
        input_ids = encodings.input_ids.to(device)
        target_ids = input_ids.clone()
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # The first output is used as the negative log-likelihood (loss).
            neg_log_likelihood = outputs[0]
            ppl = torch.exp(neg_log_likelihood)
        ppls.append(ppl)
    # Average of per-text perplexities, not the perplexity of the whole corpus.
    return torch.stack(ppls).mean()


def compute_perplexity_for_questions(main_path, file):
    """Mean perplexity of the generated questions stored in a predictions CSV.

    Args:
        main_path: directory containing the CSV file.
        file: CSV file name; its ``question_predictions`` column is scored.

    Returns:
        A 0-dim tensor holding the mean per-question perplexity.
    """
    df = pd.read_csv(os.path.join(main_path, file), sep=',')
    return _mean_perplexity(df["question_predictions"].tolist())


def gold_compute_perplexity_for_questions(main_path, file):
    """Mean perplexity of the gold questions embedded in a predictions CSV.

    The question is taken from the ``input_str`` column: everything before
    the first ``"context:"`` marker, with ``"question:"`` removed and the
    surrounding whitespace stripped.

    Args:
        main_path: directory containing the CSV file.
        file: CSV file name.

    Returns:
        A 0-dim tensor holding the mean per-question perplexity.
    """
    df = pd.read_csv(os.path.join(main_path, file), sep=',')
    questions = [
        inp.split("context:")[0].replace("question:", "").strip()
        for inp in df["input_str"].tolist()
    ]
    return _mean_perplexity(questions)

In [6]:
def preprocess_the_prediction_files(main_path, list_of_files):
    """Extract the ``predictions_str`` column of each CSV into ``/tmp/``.

    For every name in ``list_of_files`` the CSV at ``main_path/<name>`` is
    read and its ``predictions_str`` column is written to ``/tmp/<name>``
    with a header line and no index column.

    Args:
        main_path: directory containing the prediction CSV files.
        list_of_files: file names to convert.
    """
    for file in list_of_files:
        source_csv = os.path.join(main_path, file)
        target_file = os.path.join("/tmp/", file)
        predictions = pd.read_csv(source_csv, sep=',')["predictions_str"]
        predictions.to_csv(target_file, sep='\t', header=True, index=False)

def unk_eval_the_prediction_files(list_of_files, gold_file):
    """Score every preprocessed prediction file and pick the best by F1.

    Each name in ``list_of_files`` is looked up under ``/tmp/`` and scored
    against ``gold_file`` with ``unk_zero_re_eval``; the percentages are
    parsed back out of the formatted result string.

    Args:
        list_of_files: prediction file names (must not be empty).
        gold_file: path of the gold file.

    Returns:
        A tuple ``(best_file, best_f1, f1s, scores, precisions, recalls)``
        where ``scores`` maps an F1 value to a file name and the three
        arrays hold the per-file percentages in input order.
    """

    def _percent(token):
        # Drop the trailing "%" and parse the number.
        return float(token[0:-1])

    scores = {}
    f1_values, precision_values, recall_values = [], [], []
    for file in list_of_files:
        # Token layout: "Precision:", "<p>%", "Recall:", "<r>%", "F1:", "<f>%".
        tokens = unk_zero_re_eval(gold_file, os.path.join("/tmp/", file)).split()
        f1_score = _percent(tokens[-1])
        precision = _percent(tokens[1])
        recall = _percent(tokens[3])
        # Keyed by F1: files with equal F1 overwrite each other, last one wins.
        scores[f1_score] = file
        f1_values.append(f1_score)
        precision_values.append(precision)
        recall_values.append(recall)

    max_f1 = max(scores)
    return (
        scores[max_f1],
        max_f1,
        np.array(f1_values),
        scores,
        np.array(precision_values),
        np.array(recall_values),
    )

In [5]:
# Per-fold registry: results[fold][method] starts as an empty placeholder.
results = {}
for fold_i in range(1, 11):
    results[fold_i] = {
        method: {}
        for method in ('mml-pgg-off-sim', 'mml-pgg-on-sim', 'mml-mml-off-sim', 'mml-mml-on-sim')
    }

In [6]:
# Evaluating the dev predictions on the RE-QA dataset on all folds for the tail entity generation task.
# For each fold and method, the dev file with the highest F1 is stored in results[fold_i][folder].
folders = ["mml-pgg-off-sim", "mml-pgg-on-sim", "mml-mml-off-sim", "mml-mml-on-sim"]

for fold_i in range(1, 11, 1):
    for folder in folders:
        # Gold split files are numbered from 0, folds from 1.
        fold_gold_file = "./zero-shot-extraction/relation_splits/dev.{}".format(fold_i-1)
        fold_path = "~/reqa-predictions/fold_{}/{}/".format(fold_i, folder)

        # Folds 1-4 are scanned over 100 checkpoints, the later folds over 200.
        checkpoint_count = 100 if fold_i <= 4 else 200
        # Two file-name layouts are in use: fold 1 and "mml-pgg-off-sim" put the
        # fold before "dev.predictions"; every other combination puts it after.
        if fold_i == 1 or folder == "mml-pgg-off-sim":
            name_template = "{}.fold.{}.dev.predictions.step.{}.csv"
        else:
            name_template = "{}.dev.predictions.fold.{}.step.{}.csv"
        fold_files = [
            name_template.format(folder, fold_i, 100 * i)
            for i in range(1, checkpoint_count + 1, 1)
        ]

        preprocess_the_prediction_files(fold_path, fold_files)
        max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
        print(folder, fold_i, max_file, max_f1)
        print("\n")
        # Replaces the placeholder with the name of the best dev file.
        results[fold_i][folder] = max_file
    print("NEXT")

mml-pgg-off-sim 1 mml-pgg-off-sim.fold.1.dev.predictions.step.500.csv 49.45


mml-pgg-on-sim 1 mml-pgg-on-sim.fold.1.dev.predictions.step.300.csv 53.57


mml-mml-off-sim 1 mml-mml-off-sim.fold.1.dev.predictions.step.500.csv 48.79


mml-mml-on-sim 1 mml-mml-on-sim.fold.1.dev.predictions.step.8900.csv 50.56


NEXT
mml-pgg-off-sim 2 mml-pgg-off-sim.fold.2.dev.predictions.step.9300.csv 64.03


mml-pgg-on-sim 2 mml-pgg-on-sim.dev.predictions.fold.2.step.9200.csv 62.0


mml-mml-off-sim 2 mml-mml-off-sim.dev.predictions.fold.2.step.9300.csv 64.78


mml-mml-on-sim 2 mml-mml-on-sim.dev.predictions.fold.2.step.9400.csv 62.34


NEXT
mml-pgg-off-sim 3 mml-pgg-off-sim.fold.3.dev.predictions.step.5300.csv 58.42


mml-pgg-on-sim 3 mml-pgg-on-sim.dev.predictions.fold.3.step.300.csv 57.72


mml-mml-off-sim 3 mml-mml-off-sim.dev.predictions.fold.3.step.3800.csv 59.03


mml-mml-on-sim 3 mml-mml-on-sim.dev.predictions.fold.3.step.2000.csv 61.22


NEXT
mml-pgg-off-sim 4 mml-pgg-off-sim.fold.4.dev.predictio

In [7]:
# Base predictions on the test data: score the single "base-base" file of
# every fold and report the 10-fold averages of F1, precision and recall.
avg_f1 = 0.0
avg_p = 0.0
avg_re = 0.0
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/test.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/".format(fold_i)
    fold_files = ["base-base.test.predictions.fold.{}.csv".format(fold_i)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, f1s, precisions, recalls)
    # Only one file is scored per fold, so index 0 is that fold's metric.
    avg_f1 += f1s[0]
    avg_p += precisions[0]
    avg_re += recalls[0]
    print("\n")
    print("NEXT")

# The divisor 10.0 is the number of folds iterated above.
print("avg f1 is:", avg_f1/10.0)
print("avg p is:", avg_p/10.0)
print("avg re is:", avg_re/10.0)

1 [26.77] [23.52] [31.05]


NEXT
2 [25.95] [23.88] [28.4]


NEXT
3 [28.03] [24.05] [33.58]


NEXT
4 [27.49] [24.6] [31.15]


NEXT
5 [28.93] [25.78] [32.97]


NEXT
6 [26.12] [23.78] [28.97]


NEXT
7 [24.31] [20.6] [29.65]


NEXT
8 [29.7] [26.79] [33.32]


NEXT
9 [30.84] [27.29] [35.43]


NEXT
10 [26.46] [23.8] [29.78]


NEXT
avg f1 is: 27.459999999999997
avg p is: 24.409
avg re is: 31.429999999999996


In [8]:
# Evaluating the test predictions on the RE-QA dataset on all folds for the tail entity generation task.
# Requires `results` to hold the best dev file name per fold and method
# (filled in by the dev-evaluation cell).
folders = ["mml-pgg-on-sim", "mml-mml-off-sim", "mml-mml-on-sim", "mml-pgg-off-sim"]
for folder in folders:
    # The accumulators are rebuilt for every method; only the entry of the
    # current `folder` is updated and printed.
    avg_f1 = {"mml-mml-off-sim": 0, "mml-mml-on-sim": 0, "mml-pgg-on-sim": 0, "mml-pgg-off-sim": 0}
    avg_p = {"mml-mml-off-sim": 0, "mml-mml-on-sim": 0, "mml-pgg-on-sim": 0, "mml-pgg-off-sim": 0}
    avg_r = {"mml-mml-off-sim": 0, "mml-mml-on-sim": 0, "mml-pgg-on-sim": 0, "mml-pgg-off-sim": 0}
    for fold_i in range(1, 11, 1):
        # Gold split files are numbered from 0, folds from 1.
        fold_gold_file = "./zero-shot-extraction/relation_splits/test.{}".format(fold_i-1)
        fold_path = "~/reqa-predictions/fold_{}/{}".format(fold_i, folder)
        old_dev_file = results[fold_i][folder]
        # Dev file names come in two layouts; both are rewritten to the single
        # test layout. Whichever pattern does not occur leaves the name unchanged.
        new_test_file = old_dev_file.replace(".fold.{}.dev.predictions.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        new_test_file = new_test_file.replace(".dev.predictions.fold.{}.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        fold_files = [new_test_file]
        preprocess_the_prediction_files(fold_path, fold_files)
        max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
        print(folder, fold_i, max_file, max_f1)
        # A single file is scored, so max_f1 is simply that file's F1.
        avg_f1[folder] += max_f1
        avg_p[folder] += precisions[0]
        avg_r[folder] += recalls[0]
        print("\n")

    # The divisor 10.0 is the number of folds iterated above.
    print(folder, "f1", avg_f1[folder] / 10.0)
    print(folder, "p", avg_p[folder] / 10.0)
    print(folder, "r", avg_r[folder] / 10.0)
    print("NEXT")

mml-pgg-on-sim 1 mml-pgg-on-sim.test.predictions.fold.1.step.300.csv 59.57


mml-pgg-on-sim 2 mml-pgg-on-sim.test.predictions.fold.2.step.9200.csv 45.83


mml-pgg-on-sim 3 mml-pgg-on-sim.test.predictions.fold.3.step.300.csv 55.44


mml-pgg-on-sim 4 mml-pgg-on-sim.test.predictions.fold.4.step.500.csv 61.92


mml-pgg-on-sim 5 mml-pgg-on-sim.test.predictions.fold.5.step.12600.csv 53.77


mml-pgg-on-sim 6 mml-pgg-on-sim.test.predictions.fold.6.step.300.csv 54.33


mml-pgg-on-sim 7 mml-pgg-on-sim.test.predictions.fold.7.step.9800.csv 60.39


mml-pgg-on-sim 8 mml-pgg-on-sim.test.predictions.fold.8.step.17000.csv 56.03


mml-pgg-on-sim 9 mml-pgg-on-sim.test.predictions.fold.9.step.2300.csv 52.0


mml-pgg-on-sim 10 mml-pgg-on-sim.test.predictions.fold.10.step.4300.csv 47.2


mml-pgg-on-sim f1 54.648
mml-pgg-on-sim p 57.0
mml-pgg-on-sim r 52.849000000000004
NEXT
mml-mml-off-sim 1 mml-mml-off-sim.test.predictions.fold.1.step.500.csv 63.75


mml-mml-off-sim 2 mml-mml-off-sim.test.predictions.fold

In [9]:
# Compute perplexity over the test generated questions for the following method on the RE-QA dataset.
# Requires `results` to hold the best dev file name per fold and method
# (filled in by the dev-evaluation cell).
folders = ["mml-mml-off-sim"]
# Only folds 9 and 10 are scored by this cell.
fold_ids = range(9, 11, 1)

for folder in folders:
    avg_pp = {"mml-mml-off-sim": 0}
    for fold_i in fold_ids:
        fold_path = "~/reqa-predictions/fold_{}/{}".format(fold_i, folder)
        old_dev_file = results[fold_i][folder]
        # Dev file names come in two layouts; both are rewritten to the single
        # test layout. Whichever pattern does not occur leaves the name unchanged.
        new_test_file = old_dev_file.replace(".fold.{}.dev.predictions.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        new_test_file = new_test_file.replace(".dev.predictions.fold.{}.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        fold_file = new_test_file
        pp = compute_perplexity_for_questions(fold_path, fold_file)
        avg_pp[folder] += pp
        print(fold_file, pp)
    print("\n")
    # Average over the folds actually scored; a fixed divisor of 10 would
    # under-report the mean whenever fewer than ten folds are iterated.
    print(folder, "pp", avg_pp[folder] / float(len(fold_ids)))

mml-mml-off-sim.test.predictions.fold.1.step.500.csv tensor(250.1070, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.2.step.9300.csv tensor(111.5883, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.3.step.3800.csv tensor(123.9757, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.4.step.2300.csv tensor(154.7580, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.5.step.3000.csv tensor(222.3316, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.6.step.300.csv tensor(152.8412, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.7.step.13400.csv tensor(171.4223, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.8.step.7900.csv tensor(154.2868, device='cuda:0')


In [1]:
# Per-fold perplexity output for "mml-mml-off-sim", kept as a bare string for reference.
'''
mml-mml-off-sim.test.predictions.fold.1.step.500.csv tensor(250.1070, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.2.step.9300.csv tensor(111.5883, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.3.step.3800.csv tensor(123.9757, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.4.step.2300.csv tensor(154.7580, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.5.step.3000.csv tensor(222.3316, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.6.step.300.csv tensor(152.8412, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.7.step.13400.csv tensor(171.4223, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.8.step.7900.csv tensor(154.2868, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.9.step.4700.csv tensor(112.6208, device='cuda:0')
mml-mml-off-sim.test.predictions.fold.10.step.1500.csv tensor(29.8551, device='cuda:0')
'''

# The ten values listed in the string above, as float literals (folds 1-10 in order).
arr = [250.1070, 111.5883, 123.9757, 
       154.7580, 222.3316, 152.8412, 
       171.4223, 154.2868, 112.6208, 
       29.8551]

# Mean perplexity over the ten folds.
print(sum(arr)/10.0)

148.37867999999997


In [None]:
# Compute perplexity over the test generated questions for the following method on the RE-QA dataset.
# Requires `results` to hold the best dev file name per fold and method
# (filled in by the dev-evaluation cell).
folders = ["mml-pgg-off-sim"]

for folder in folders:
    avg_pp = {"mml-pgg-off-sim": 0}
    for fold_i in range(1, 11, 1):
        fold_path = "~/reqa-predictions/fold_{}/{}".format(fold_i, folder)
        old_dev_file = results[fold_i][folder]
        # Dev file names come in two layouts; both are rewritten to the single
        # test layout. Whichever pattern does not occur leaves the name unchanged.
        new_test_file = old_dev_file.replace(".fold.{}.dev.predictions.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        new_test_file = new_test_file.replace(".dev.predictions.fold.{}.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        fold_file = new_test_file
        pp = compute_perplexity_for_questions(fold_path, fold_file)
        avg_pp[folder] += pp
        print(fold_file, pp)
    print("\n")
    # The divisor 10.0 is the number of folds iterated above.
    print(folder, "pp", avg_pp[folder] / 10.0)

In [2]:
# Per-fold perplexity output for "mml-pgg-off-sim", kept as a bare string for reference.
'''
mml-pgg-off-sim.test.predictions.fold.1.step.500.csv tensor(217.1630, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.2.step.9300.csv tensor(122.6252, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.3.step.5300.csv tensor(121.2163, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.4.step.1100.csv tensor(137.1326, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.5.step.6400.csv tensor(160.4001, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.6.step.700.csv tensor(125.5082, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.7.step.14900.csv tensor(142.9996, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.8.step.3800.csv tensor(161.9223, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.9.step.2900.csv tensor(100.0727, device='cuda:0')
mml-pgg-off-sim.test.predictions.fold.10.step.8600.csv tensor(152.0948, device='cuda:0')
'''

# The ten values listed in the string above, as float literals (folds 1-10 in order).
arr = [217.1630, 122.6252, 121.2163, 
       137.1326, 160.4001, 125.5082, 
       142.9996, 161.9223, 100.0727, 
       152.0948]

# Mean perplexity over the ten folds.
print(sum(arr)/10.0)

144.11348


In [None]:
# Compute perplexity over the test generated questions for the following method on the RE-QA dataset.
# Requires `results` to hold the best dev file name per fold and method
# (filled in by the dev-evaluation cell).
folders = ["mml-pgg-on-sim"]

for folder in folders:
    avg_pp = {"mml-pgg-on-sim": 0}
    for fold_i in range(1, 11, 1):
        fold_path = "~/reqa-predictions/fold_{}/{}".format(fold_i, folder)
        old_dev_file = results[fold_i][folder]
        # Dev file names come in two layouts; both are rewritten to the single
        # test layout. Whichever pattern does not occur leaves the name unchanged.
        new_test_file = old_dev_file.replace(".fold.{}.dev.predictions.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        new_test_file = new_test_file.replace(".dev.predictions.fold.{}.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        fold_file = new_test_file
        pp = compute_perplexity_for_questions(fold_path, fold_file)
        avg_pp[folder] += pp
        print(fold_file, pp)
    print("\n")
    # The divisor 10.0 is the number of folds iterated above.
    print(folder, "pp", avg_pp[folder] / 10.0)

In [None]:
# Per-fold perplexity output for "mml-pgg-on-sim" and its 10-fold mean, kept as a bare string for reference.
# NOTE(review): folds 5 and 8 list the same value (132.0151) — confirm this is not a copy/paste slip.
'''
mml-pgg-on-sim.test.predictions.fold.1.step.300.csv tensor(1344.3567, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.2.step.9200.csv tensor(238.5175, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.3.step.300.csv tensor(1374.6135, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.4.step.500.csv tensor(353.1273, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.5.step.12600.csv tensor(132.0151, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.6.step.300.csv tensor(899.9470, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.7.step.9800.csv tensor(2476.5774, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.8.step.17000.csv tensor(132.0151, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.9.step.2300.csv tensor(5960.5225, device='cuda:0')
mml-pgg-on-sim.test.predictions.fold.10.step.4300.csv tensor(185.5247, device='cuda:0')


mml-pgg-on-sim pp tensor(1309.7216, device='cuda:0')
'''

In [None]:
# Compute perplexity over the test generated questions for the following method on the RE-QA dataset.
# Requires `results` to hold the best dev file name per fold and method
# (filled in by the dev-evaluation cell).
folders = ["mml-mml-on-sim"]

for folder in folders:
    avg_pp = {"mml-mml-on-sim": 0}
    for fold_i in range(1, 11, 1):
        fold_path = "~/reqa-predictions/fold_{}/{}".format(fold_i, folder)
        old_dev_file = results[fold_i][folder]
        # Dev file names come in two layouts; both are rewritten to the single
        # test layout. Whichever pattern does not occur leaves the name unchanged.
        new_test_file = old_dev_file.replace(".fold.{}.dev.predictions.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        new_test_file = new_test_file.replace(".dev.predictions.fold.{}.".format(fold_i), ".test.predictions.fold.{}.".format(fold_i))
        fold_file = new_test_file
        pp = compute_perplexity_for_questions(fold_path, fold_file)
        avg_pp[folder] += pp
        print(fold_file, pp)
    print("\n")
    # The divisor 10.0 is the number of folds iterated above.
    print(folder, "pp", avg_pp[folder] / 10.0)

In [7]:
# Perplexity of the generated questions in the "base-base" test predictions,
# per fold and averaged over the ten folds.
avg_pp = 0.0
for fold_i in range(1, 11, 1):
    fold_path = "~/reqa-predictions/fold_{}/".format(fold_i)
    fold_file = "base-base.test.predictions.fold.{}.csv".format(fold_i)
    pp = compute_perplexity_for_questions(fold_path, fold_file)
    # `pp` is a tensor, so `avg_pp` becomes a tensor after the first addition.
    avg_pp += pp
    print(fold_file, pp)

print("\n")
# The divisor 10.0 is the number of folds iterated above.
print("pp", avg_pp / 10.0)

base_base_fold.1.test.predictions.step.csv tensor(215.8548, device='cuda:0')
base_base_fold.2.test.predictions.step.csv tensor(181.5120, device='cuda:0')
base_base_fold.3.test.predictions.step.csv tensor(128.0450, device='cuda:0')
base_base_fold.4.test.predictions.step.csv tensor(185.9727, device='cuda:0')
base_base_fold.5.test.predictions.step.csv tensor(196.0677, device='cuda:0')
base_base_fold.6.test.predictions.step.csv tensor(138.5674, device='cuda:0')
base_base_fold.7.test.predictions.step.csv tensor(219.6459, device='cuda:0')
base_base_fold.8.test.predictions.step.csv tensor(206.5881, device='cuda:0')
base_base_fold.9.test.predictions.step.csv tensor(163.6257, device='cuda:0')
base_base_fold.10.test.predictions.step.csv tensor(179.0245, device='cuda:0')


pp tensor(181.4904, device='cuda:0')


In [7]:
# Tail Entity Generation.
# Evaluating the dev predictions on the RE-QA dataset using the model having access to gold templates!
# For every fold, the dev file with the highest F1 is recorded in dev_max_files.
dev_max_files = {}
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/dev.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/gold/dev_predictions/".format(fold_i)
    # Checkpoint files for steps 100, 200, ..., 50500.
    fold_files = ["gold_fold.{}.dev.predictions.step.{}.csv".format(fold_i, 100 * i) for i in range(1, 506, 1)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    dev_max_files[fold_i] = max_file
    print(fold_i, max_file, max_f1)
    print("#")

1 gold_fold.1.dev.predictions.step.800.csv 56.83
#
2 gold_fold.2.dev.predictions.step.2000.csv 65.76
#
3 gold_fold.3.dev.predictions.step.4200.csv 63.59
#
4 gold_fold.4.dev.predictions.step.1400.csv 64.44
#
5 gold_fold.5.dev.predictions.step.900.csv 67.0
#
6 gold_fold.6.dev.predictions.step.400.csv 69.45
#
7 gold_fold.7.dev.predictions.step.6100.csv 59.3
#
8 gold_fold.8.dev.predictions.step.7300.csv 64.1
#
9 gold_fold.9.dev.predictions.step.1800.csv 67.48
#
10 gold_fold.10.dev.predictions.step.4100.csv 61.41
#


In [8]:
# Tail Entity Generation.
# Evaluating the test predictions on the RE-QA dataset using the model having access to gold templates!
# One fixed-name file is scored per fold; the 10-fold averages are printed at the end.

avg_f1 = 0.0
avg_p = 0.0
avg_re = 0.0
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/test.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/gold/".format(fold_i)
    # The file name has no step number between "step." and ".csv".
    fold_files = ["gold_fold.{}.test.predictions.step..csv".format(fold_i)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, f1s, precisions, recalls)
    # Only one file is scored per fold, so index 0 is that fold's metric.
    avg_f1 += f1s[0]
    avg_p += precisions[0]
    avg_re += recalls[0]
    print("\n")
    print("NEXT")

# The divisor 10.0 is the number of folds iterated above.
print("avg f1 is:", avg_f1/10.0)
print("avg p is:", avg_p/10.0)
print("avg re is:", avg_re/10.0)

1 [64.21] [61.01] [67.77]


NEXT
2 [48.65] [52.8] [45.1]


NEXT
3 [60.73] [60.75] [60.7]


NEXT
4 [67.16] [66.82] [67.5]


NEXT
5 [55.61] [59.76] [52.]


NEXT
6 [61.03] [60.72] [61.35]


NEXT
7 [58.4] [60.15] [56.75]


NEXT
8 [59.74] [63.77] [56.18]


NEXT
9 [56.49] [56.74] [56.25]


NEXT
10 [56.89] [58.55] [55.32]


NEXT
avg f1 is: 58.891
avg p is: 60.10699999999999
avg re is: 57.89200000000001


In [12]:
# Tail Entity Generation.
# Evaluating the test predictions on the RE-QA dataset using the model having access to gold templates without fine-tuning. Gold Base.
# One fixed-name file is scored per fold; the 10-fold averages are printed at the end.

avg_f1 = 0.0
avg_p = 0.0
avg_re = 0.0
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/test.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/gold/".format(fold_i)
    # The file name has no step number between "step." and ".csv".
    fold_files = ["base_gold_fold.{}.test.predictions.step..csv".format(fold_i)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, f1s, precisions, recalls)
    # Only one file is scored per fold, so index 0 is that fold's metric.
    avg_f1 += f1s[0]
    avg_p += precisions[0]
    avg_re += recalls[0]
    print("\n")
    print("NEXT")

# The divisor 10.0 is the number of folds iterated above.
print("avg f1 is:", avg_f1/10.0)
print("avg p is:", avg_p/10.0)
print("avg re is:", avg_re/10.0)

1 [44.76] [36.19] [58.65]


NEXT
2 [38.57] [31.78] [49.07]


NEXT
3 [42.92] [35.45] [54.37]


NEXT
4 [47.94] [40.4] [58.93]


NEXT
5 [43.86] [36.67] [54.55]


NEXT
6 [44.04] [36.65] [55.18]


NEXT
7 [41.53] [33.62] [54.33]


NEXT
8 [46.63] [39.3] [57.3]


NEXT
9 [43.52] [35.54] [56.1]


NEXT
10 [42.35] [35.43] [52.65]


NEXT
avg f1 is: 43.612
avg p is: 36.103
avg re is: 55.113


In [9]:
# Tail Entity Generation.
# Evaluating the dev predictions on the RE-QA dataset using the concat model.
# For every fold, the dev file with the highest F1 is recorded in dev_max_files.
dev_max_files = {}
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/dev.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/concat/dev_predictions/".format(fold_i)
    # Checkpoint files for steps 100, 200, ..., 50500.
    fold_files = ["concat_fold.{}.dev.predictions.step.{}.csv".format(fold_i, 100 * i) for i in range(1, 506, 1)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    dev_max_files[fold_i] = max_file
    print(fold_i, max_file, max_f1)
    print("#")

1 concat_fold.1.dev.predictions.step.1700.csv 53.96
#
2 concat_fold.2.dev.predictions.step.4600.csv 64.46
#
3 concat_fold.3.dev.predictions.step.18600.csv 61.4
#
4 concat_fold.4.dev.predictions.step.1400.csv 67.12
#
5 concat_fold.5.dev.predictions.step.7900.csv 69.91
#
6 concat_fold.6.dev.predictions.step.2600.csv 64.16
#
7 concat_fold.7.dev.predictions.step.24700.csv 63.0
#
8 concat_fold.8.dev.predictions.step.13800.csv 61.08
#
9 concat_fold.9.dev.predictions.step.17700.csv 61.6
#
10 concat_fold.10.dev.predictions.step.35500.csv 57.72
#


In [10]:
# Tail Entity Generation.
# Evaluating the test predictions on the RE-QA dataset using the concat model!
# One fixed-name file is scored per fold; the 10-fold averages are printed at the end.

avg_f1 = 0.0
avg_p = 0.0
avg_re = 0.0
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/test.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/concat/".format(fold_i)
    # The file name has no step number between "step." and ".csv".
    fold_files = ["concat_fold.{}.test.predictions.step..csv".format(fold_i)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, f1s, precisions, recalls)
    # Only one file is scored per fold, so index 0 is that fold's metric.
    avg_f1 += f1s[0]
    avg_p += precisions[0]
    avg_re += recalls[0]
    print("\n")
    print("NEXT")

# The divisor 10.0 is the number of folds iterated above.
print("avg f1 is:", avg_f1/10.0)
print("avg p is:", avg_p/10.0)
print("avg re is:", avg_re/10.0)

1 [58.35] [61.38] [55.62]


NEXT
2 [42.61] [50.08] [37.08]


NEXT
3 [57.43] [59.32] [55.65]


NEXT
4 [56.75] [61.26] [52.87]


NEXT
5 [56.59] [60.4] [53.23]


NEXT
6 [56.48] [57.44] [55.55]


NEXT
7 [54.52] [59.09] [50.62]


NEXT
8 [55.51] [58.14] [53.12]


NEXT
9 [44.98] [56.21] [37.48]


NEXT
10 [53.] [55.78] [50.48]


NEXT
avg f1 is: 53.622
avg p is: 57.910000000000004
avg re is: 50.17


In [11]:
# Tail Entity Generation.
# Evaluating the test predictions on the RE-QA dataset using the concat model without fine-tuning. Concat-Base
# One fixed-name file is scored per fold; the 10-fold averages are printed at the end.

avg_f1 = 0.0
avg_p = 0.0
avg_re = 0.0
for fold_i in range(1, 11, 1):
    # Gold split files are numbered from 0, folds from 1.
    fold_gold_file = "./zero-shot-extraction/relation_splits/test.{}".format(fold_i-1)
    fold_path = "~/reqa-predictions/fold_{}/concat/".format(fold_i)
    # The file name has no step number between "step." and ".csv".
    fold_files = ["base_concat_fold.{}.test.predictions.step..csv".format(fold_i)]
    preprocess_the_prediction_files(fold_path, fold_files)
    max_file,  max_f1, f1s, scores, precisions, recalls = unk_eval_the_prediction_files(fold_files, fold_gold_file)
    print(fold_i, f1s, precisions, recalls)
    # Only one file is scored per fold, so index 0 is that fold's metric.
    avg_f1 += f1s[0]
    avg_p += precisions[0]
    avg_re += recalls[0]
    print("\n")
    print("NEXT")

# The divisor 10.0 is the number of folds iterated above.
print("avg f1 is:", avg_f1/10.0)
print("avg p is:", avg_p/10.0)
print("avg re is:", avg_re/10.0)

1 [3.06] [2.36] [4.37]


NEXT
2 [2.56] [1.97] [3.63]


NEXT
3 [1.94] [1.49] [2.78]


NEXT
4 [2.01] [1.54] [2.87]


NEXT
5 [1.66] [1.28] [2.38]


NEXT
6 [2.78] [2.16] [3.9]


NEXT
7 [3.3] [2.52] [4.78]


NEXT
8 [2.82] [2.21] [3.9]


NEXT
9 [1.58] [1.2] [2.3]


NEXT
10 [0.79] [0.61] [1.15]


NEXT
avg f1 is: 2.25
avg p is: 1.734
avg re is: 3.2059999999999995
