# In [None]:
import pandas as pd
from sklearn.metrics import f1_score

# Input locations: the tab-separated test set and the file of predictions
# (read line by line further down in this script).
TEST_FILE = "data/test-train_non_para_test_para.tsv"
PREDICTION_FILE = "data/predictions/train_non_para_test_para_predictions"

# Label strings for every task name. The entry for a task is handed to
# f1_score through its `labels` argument when that task is scored.
LABELS = {
    "dbpedia_14": [
        "company",
        "educational institution",
        "artist",
        "athlete",
        "office holder",
        "mean of transportation",
        "building",
        "natural place",
        "village",
        "animal",
        "plant",
        "album",
        "film",
        "written work",
    ],
    "emo": ["others", "happy", "sad", "angry"],
    "ethos-race": ["false", "true"],
    "ethos-religion": ["false", "true"],
    "financial_phrasebank": ["negative", "neutral", "positive"],
    "wiki_qa": ["yes", "no"],
    "anli": ["entailment", "neutral", "contradiction"],
    "glue-mnli": ["entailment", "neutral", "contradiction"],
    "glue-qnli": ["yes", "no"],
    "glue-rte": ["yes", "no"],
    "glue-wnli": ["yes", "no"],
    "scitail": ["entailment", "neutral"],
    "sick": ["entailment", "neutral", "contradiction"],
    "superglue-cb": ["entailment", "neutral", "contradiction"],
    "superglue-rte": ["yes", "no"],
    "glue-mrpc": ["yes", "no"],
    "glue-qqp": ["yes", "no"],
    "medical_questions_pairs": ["similar", "dissimilar"],
    "paws": ["yes", "no"],
}

# Read the header-less, tab-separated test file and keep only its first
# four columns, giving them descriptive names.
df = pd.read_csv(TEST_FILE, sep="\t", header=None)
df = df[[0, 1, 2, 3]]
df.columns = ["task_name", "task_prefix", "input", "target"]

# Attach the predictions, one per line of the prediction file with the
# newline characters removed, as a new column aligned by row position.
with open(PREDICTION_FILE) as pred_file:
    predictions = [line.replace("\n", "") for line in pred_file]
df["prediction"] = predictions

# Collapse the frame to one row per (task_name, task_prefix) pair whose
# `target` and `prediction` cells hold the full lists of values for that pair.
df = (
    df.groupby(["task_name", "task_prefix"])
    .agg(
        {
            "target": lambda values: list(values),
            "prediction": lambda values: list(values),
        }
    )
    .reset_index()
)

def compute_f1(row):
    """Return the macro-averaged F1 score for a single row.

    Args:
        row: Object exposing ``target``, ``prediction`` and ``task_name``
            attributes. ``target`` and ``prediction`` are passed to
            ``f1_score`` as the true and predicted labels.

    Returns:
        The value returned by ``f1_score`` with ``average='macro'``,
        restricted to the labels registered in ``LABELS`` for the row's task.

    Raises:
        KeyError: If ``row.task_name`` has no entry in ``LABELS``.
    """
    y_true = row.target
    y_pred = row.prediction
    task_labels = LABELS[row.task_name]
    return f1_score(
        y_true,
        y_pred,
        labels=task_labels,
        average="macro",
    )

# Score each row of the grouped frame, then average those scores per task
# name. The final expression is left bare so a notebook displays the result.
df["f1_score"] = df.apply(compute_f1, axis=1)
df.groupby(["task_name"])[["f1_score"]].mean()