In [1]:
import os
import json
import numpy as np
import pandas as pd

from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score, f1_score,  precision_score, recall_score

In [None]:
def load_gold_labels(dimension, task):
    """Return the gold labels of a task's test split, mapped through labels.json.

    Reads ``../../data/processed/{dimension}/{task}_test.csv`` and takes its
    ``labels`` column. Every value in that column that appears as a key in
    the ``task`` entry of ``../../data/interim/labels.json`` is replaced by
    the corresponding mapped value; other values are left as they are.

    Args:
        dimension: Sub-directory of ``data/processed`` holding the split.
        task: Task name; used both as the CSV file prefix and as the key
            into the label mapping.

    Returns:
        The mapped ``labels`` column as a plain Python list.
    """
    split_frame = pd.read_csv(f"../../data/processed/{dimension}/{task}_test.csv")

    with open("../../data/interim/labels.json", "r") as mapping_file:
        label_table = json.load(mapping_file)

    raw_labels = split_frame["labels"]
    return raw_labels.replace(label_table[task]).to_list()

In [3]:
def calculate_spearmanr(gold, preds):
    """Compute the Spearman rank correlation between two sequences.

    Both inputs are converted to NumPy arrays and handed to
    ``scipy.stats.spearmanr``.

    Args:
        gold: First sequence of values.
        preds: Second sequence of values, same length as ``gold``.

    Returns:
        A ``(correlation, p_value)`` tuple.
    """
    gold_values = np.array(gold)
    pred_values = np.array(preds)
    outcome = spearmanr(gold_values, pred_values)
    # The SciPy result unpacks as (statistic, p-value); index it explicitly.
    return (outcome[0], outcome[1])

In [None]:
# Evaluation sweep.
#
# For every (model, task, dimension) combination this cell loads the stored
# predictions for the unperturbed and the perturbed test set, scores both
# against the gold labels (accuracy, macro-F1, precision, recall) and prints
# the Spearman correlation between the two four-element metric vectors.
models_name = [
    "bert-base-cased",
    "roberta-base",
    "distilbert-base-cased",
    "microsoft/deberta-base",
    "facebook/FairBERTa",
]

dimensions = [
    "merged"
]

tasks = [
    # Currently disabled tasks:
    # "buzzfeed",
    # "politifact",
    # "twittercovidq2",
    # "clef22",

    "propaganda",
    "webis",
    "pheme",
    "basil",
    "shadesoftruth",
    "fingerprints",
    "clickbait",
]


for model_name in models_name:
    for task in tasks:
        for dimension in dimensions:
            # Model names such as "microsoft/deberta-base" contain a slash and
            # therefore resolve to a nested directory, as before.
            predictions_dir = os.path.join(
                "../../data/interim/tot_predictions", model_name, dimension, task
            )

            try:
                vanilla_test_preds = list(np.load(os.path.join(predictions_dir, "unperturbed.npy")))
                perturbed_test_preds = list(np.load(os.path.join(predictions_dir, "perturbed.npy")))

                test_gold = load_gold_labels(dimension, task)

            except (OSError, KeyError):
                # Missing prediction/gold files (OSError) or a task/column that
                # is absent from the label data (KeyError). Skip this
                # combination: falling through would either raise NameError
                # or silently score the previous combination's stale data.
                print(f"NO DATA FOR: {model_name} - {dimension} - {task}")
                continue

            van_acc = accuracy_score(test_gold, vanilla_test_preds)
            pert_acc = accuracy_score(test_gold, perturbed_test_preds)

            van_f1 = f1_score(test_gold, vanilla_test_preds, average="macro")
            pert_f1 = f1_score(test_gold, perturbed_test_preds, average="macro")

            # More than two gold classes -> weighted averaging; otherwise rely
            # on scikit-learn's default (binary) averaging.
            if len(set(test_gold)) > 2:
                pr_kwargs = {"average": "weighted"}
            else:
                pr_kwargs = {}

            van_precision = precision_score(test_gold, vanilla_test_preds, **pr_kwargs)
            pert_precision = precision_score(test_gold, perturbed_test_preds, **pr_kwargs)
            van_recall = recall_score(test_gold, vanilla_test_preds, **pr_kwargs)
            pert_recall = recall_score(test_gold, perturbed_test_preds, **pr_kwargs)

            vec_1 = [van_acc, van_f1, van_precision, van_recall]
            vec_2 = [pert_acc, pert_f1, pert_precision, pert_recall]

            # NOTE(review): the correlation is computed over only four values
            # per vector, so both the coefficient and the p-value are very
            # coarse -- confirm this is the intended comparison.
            sv, pval = calculate_spearmanr(vec_1, vec_2)
            print(f"model: {model_name} | dimension: {dimension} | task: {task} | spearman: {sv} | pval: {pval}")
    print("\n")
        

model: bert-base-cased | dimension: merged | task: propaganda | spearman: 1.0 | pval: 0.0
model: bert-base-cased | dimension: merged | task: webis | spearman: 1.0 | pval: 0.0
model: bert-base-cased | dimension: merged | task: pheme | spearman: 1.0 | pval: 0.0
model: bert-base-cased | dimension: merged | task: basil | spearman: 1.0 | pval: 0.0
model: bert-base-cased | dimension: merged | task: shadesoftruth | spearman: 0.3333333333333334 | pval: 0.6666666666666666
model: bert-base-cased | dimension: merged | task: fingerprints | spearman: 1.0 | pval: 0.0
model: bert-base-cased | dimension: merged | task: clickbait | spearman: 1.0 | pval: 0.0


model: roberta-base | dimension: merged | task: propaganda | spearman: 1.0 | pval: 0.0
model: roberta-base | dimension: merged | task: webis | spearman: 1.0 | pval: 0.0
model: roberta-base | dimension: merged | task: pheme | spearman: 1.0 | pval: 0.0
model: roberta-base | dimension: merged | task: basil | spearman: 1.0 | pval: 0.0
model: roberta-b