In [50]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoModel, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import torch

In [51]:
# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
DEVICE = -1 if not torch.cuda.is_available() else 0
# Dataframe columns holding the input text and the ground-truth label.
INPUT_COL_NAME, TARGET_COL_NAME = 'text', 'fake'
# Number of leading rows kept for a quick smoke run; a falsy value keeps every row.
SMOKE = 25

In [52]:
# Read the evaluation set; in smoke mode only the first SMOKE rows are kept.
test_df = pd.read_csv("../data/fake_papers_translated.csv")
test_df = test_df[:SMOKE] if SMOKE else test_df

In [53]:
# Checkpoints under evaluation, listed as (short name, model identifier) pairs.
# The short name is what the evaluation loop prints and stores next to the metrics.
models = [
    {"name": short_name, "model": checkpoint}
    for short_name, checkpoint in (
        ("multiclass", "anon/deberta-v3-large-finetuned-synthetic-multi-class"),
        ("dagpap22only", "anon/deberta-v3-large-finetuned-dagpap22-only"),
        ("translated-only", "anon/deberta-v3-large-finetuned-synthetic-translated-only"),
        ("all", "anon/deberta-v3-large-finetuned-DAGPap22-synthetic-all"),
    )
]

In [54]:
def get_predictions(model, texts, batch_size=1):
    """Run a pipeline over a column of texts and collect everything it yields.

    Args:
        model: Callable pipeline. It is invoked on a ``KeyDataset`` together
            with a ``batch_size`` keyword and must return an iterable of
            predictions.
        texts: Object exposing ``fillna`` (e.g. a pandas Series) that holds
            the input texts. Missing values are replaced by the empty string
            before inference.
        batch_size: Batch size forwarded to the pipeline. Defaults to 1,
            the value that used to be hard-coded.

    Returns:
        list: The predictions, in the order the pipeline yields them.
    """
    ds = Dataset.from_dict({
        "texts": list(texts.fillna(''))
    })
    prediction_stream = model(KeyDataset(ds, "texts"), batch_size=batch_size)
    # tqdm cannot infer a length from the pipeline iterator, so pass it explicitly.
    return list(tqdm(prediction_stream, total=len(ds)))

In [55]:
import numpy as np


def coverage_risk(confidences: np.ndarray, accuracies: np.ndarray) -> "tuple[list, list]":
    """Compute the points of the risk-coverage curve.

    Samples are ranked from most to least confident (ties keep their input
    order). For every prefix of that ranking, the coverage is the fraction of
    samples included and the risk is the fraction of included samples whose
    accuracy flag equals 0.

    From https://github.com/sleep3r/garrus/blob/a6fd1d44b06285918cefe54f421a004dc6f315cb/garrus/metrics/aurc.py

    Args:
        confidences: Per-sample confidence scores.
        accuracies: Per-sample correctness flags; a value equal to 0 counts
            as an error, anything else as correct.

    Returns:
        ``(risk_list, coverage_list)``: two lists with one entry per sample.
        Both are empty when the input is empty.
    """
    ranked = sorted(zip(confidences, accuracies), key=lambda pair: pair[0], reverse=True)
    total = len(ranked)

    risk_list = []
    coverage_list = []
    errors = 0
    # Iterating over the ranked pairs directly (instead of unpacking them with
    # zip(*...)) keeps an empty input from raising ValueError.
    for rank, (_, correct) in enumerate(ranked, start=1):
        coverage_list.append(rank / total)
        if correct == 0:
            errors += 1
        risk_list.append(errors / rank)
    return risk_list, coverage_list


def compute_area_under_risk_coverage_age(
    confidences: np.ndarray, accuracies: np.ndarray
) -> "tuple[float, list, list]":
    r"""
    Area Under Risk-Coverage.
    $$ AURC (\kappa, f \mid V_{n}) = \frac{1}{n} \sum_{\theta \in \Theta} \hat{r} (f, g_{\theta} \mid V_{n}) $$
    From https://github.com/sleep3r/garrus/blob/a6fd1d44b06285918cefe54f421a004dc6f315cb/garrus/metrics/aurc.py

    The docstring is a raw string so the LaTeX backslashes are kept verbatim
    (in a normal string ``\f`` would be read as a form-feed character).

    Args:
        confidences: Per-sample confidence scores, forwarded to ``coverage_risk``.
        accuracies: Per-sample correctness flags, forwarded to ``coverage_risk``.

    Returns:
        ``(aurc, risk_list, coverage_list)``: the mean of the risk values as a
        float, followed by the two lists returned by ``coverage_risk``.
    """
    risk_list, coverage_list = coverage_risk(confidences, accuracies)

    aurc = 0.0
    if risk_list:
        # Every point of the curve contributes with the same weight 1/n.
        weight = 1 / len(risk_list)
        for risk_value in risk_list:
            aurc += risk_value * weight

    return float(aurc), risk_list, coverage_list


In [56]:
# Evaluate every checkpoint on the test set and collect one record per model.
results = []
for model in models:
    print(f"Getting results for {model['name']}")
    mdl = pipeline("text-classification", model['model'], device=DEVICE)
    test_predictions = get_predictions(mdl, test_df[INPUT_COL_NAME])

    # The "real" class is LABEL_2 for the multi-class checkpoint and LABEL_0 otherwise.
    real_label = 'LABEL_2' if model['name'] == "multiclass" else 'LABEL_0'

    # Score of the predicted label, used as the confidence for the risk-coverage curve.
    confidences = [output['score'] for output in test_predictions]
    # Collapse the predicted label to binary: 0 = real, 1 = anything else.
    predictions = [
        1 if output['label'] != real_label else 0
        for output in test_predictions
    ]
    # 1 where the binary prediction matches the target column, 0 otherwise.
    accuracies = [
        1 if guess == truth else 0
        for guess, truth in zip(predictions, test_df[TARGET_COL_NAME])
    ]

    aurc, risk_list, coverage_list = compute_area_under_risk_coverage_age(confidences, accuracies)
    f1 = f1_score(test_df[TARGET_COL_NAME], predictions)
    precision = precision_score(test_df[TARGET_COL_NAME], predictions)
    recall = recall_score(test_df[TARGET_COL_NAME], predictions)
    acc = accuracy_score(test_df[TARGET_COL_NAME], predictions)

    data = dict(
        model=model['model'],
        name=model['name'],
        aurc=aurc,
        f1=f1,
        precision=precision,
        recall=recall,
        acc=acc,
        risk_list=risk_list,
        coverage_list=coverage_list,
        # Raw predicted label paired with its correctness flag, one per sample.
        label_distribution=[
            (output['label'], hit)
            for output, hit in zip(test_predictions, accuracies)
        ],
    )
    results.append(data)

Getting results for multiclass


  0%|          | 0/25 [00:00<?, ?it/s]

Getting results for dagpap22only


  0%|          | 0/25 [00:00<?, ?it/s]

Getting results for translated-only


  0%|          | 0/25 [00:00<?, ?it/s]

Getting results for all


  0%|          | 0/25 [00:00<?, ?it/s]

In [57]:
# Display the raw per-model result records collected by the evaluation loop.
results

[{'model': 'anon/deberta-v3-large-finetuned-synthetic-multi-class',
  'name': 'multiclass',
  'aurc': 0.0,
  'f1': 1.0,
  'precision': 1.0,
  'recall': 1.0,
  'acc': 1.0,
  'risk_list': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0],
  'coverage_list': [0.04,
   0.08,
   0.12,
   0.16,
   0.2,
   0.24,
   0.28,
   0.32,
   0.36,
   0.4,
   0.44,
   0.48,
   0.52,
   0.56,
   0.6,
   0.64,
   0.68,
   0.72,
   0.76,
   0.8,
   0.84,
   0.88,
   0.92,
   0.96,
   1.0],
  'label_distribution': [('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_3', 1),
   ('LABEL_0', 1),
   ('LABEL_3'

In [58]:
# Persist one row per evaluated model. The "_smoke" suffix is only added when
# the run was truncated through SMOKE, so a full run is not written to (and
# does not overwrite) the smoke-test file.
results_suffix = "_smoke" if SMOKE else ""
pd.DataFrame(
    results
).to_csv(f'./translation_evaluation_results{results_suffix}.csv')