In [11]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoModel, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import torch

In [43]:
DEVICE = 0 if torch.cuda.is_available() else -1
INPUT_COL_NAME = 'text'
TARGET_COL_NAME = 'fake'
SMOKE = 100

In [44]:
tool_labels = {
    "generate": 0,
    "paraphrase": 1,
    "real": 2,
    "translate": 3 
}

In [45]:
known_df = pd.read_csv("../data/real_and_fake_passages_dataset_test.csv")

In [46]:
known_df = known_df.rename(columns={'passages': 'text'})
known_df.head(1)

Unnamed: 0,text,fake,type,model,tool
0,a nearby recurrent laryngeal nerve was the oth...,0,real,real,real


In [47]:
test_df = known_df
test_df['pred'] = test_df['tool'].apply(lambda x: tool_labels[x])
if SMOKE:
    test_df = test_df[:SMOKE]

In [48]:
models = [
    { 
        "name": "multiclass",
        "model": "anon/deberta-v3-large-finetuned-synthetic-multi-class"
    },
]
datasets = ['all', 'generate', 'paraphrase', 'translate']

In [49]:
def get_predictions(model, texts):
    preds = []
    ds = Dataset.from_dict({
        "texts": list(texts.fillna(''))
    })
    for pred in tqdm(model(KeyDataset(ds, "texts"), batch_size=1), total=len(ds)):
        preds.append(
            pred
        )
    return preds

In [50]:
import numpy as np


def coverage_risk(confidences: np.ndarray, accuracies: np.ndarray):
    # From https://github.com/sleep3r/garrus/blob/a6fd1d44b06285918cefe54f421a004dc6f315cb/garrus/metrics/aurc.py
    sort_values = sorted(zip(confidences, accuracies), key=lambda x: x[0], reverse=True)
    sort_conf, sort_acc = zip(*sort_values)
    risk_list = []
    coverage_list = []
    risk = 0
    for i in range(len(sort_conf)):
        coverage = (i + 1) / len(sort_conf)
        coverage_list.append(coverage)

        if sort_acc[i] == 0:
            risk += 1

        risk_list.append(risk / (i + 1))
    return risk_list, coverage_list


def compute_area_under_risk_coverage_age(confidences: np.ndarray, accuracies: np.ndarray) -> float:
    """
    Area Under Risk-Coverage.
    $$ AURC (\kappa, f \mid V_{n}) = \frac{1}{n} \sum_{\theta \in \Theta} \hat{r} (f, g_{\theta} \mid V_{n}) $$
    From https://github.com/sleep3r/garrus/blob/a6fd1d44b06285918cefe54f421a004dc6f315cb/garrus/metrics/aurc.py
    """
    risk_list, coverage_list = coverage_risk(confidences, accuracies)
    risk_coverage_curve_area = 0
    for risk_value in risk_list:
        risk_coverage_curve_area += risk_value * (1 / len(risk_list))

    aurc = risk_coverage_curve_area
    return float(aurc), risk_list, coverage_list


In [51]:
label_map = {
    'LABEL_0': 0,
    'LABEL_1': 1,
    'LABEL_2': 2,
    'LABEL_3': 3
}

In [53]:
results = []
for model in models:
    print(f"Getting results for {model['name']}")
    mdl = pipeline("text-classification", model['model'], device=DEVICE)
    for dataset in datasets:
        if dataset == 'all':
            df = test_df
        else:
            df = test_df.loc[
                (test_df['tool'] == dataset) | (test_df['tool'] == 'real')
            ]
        test_predictions = get_predictions(mdl, test_df[INPUT_COL_NAME])

        confidences = [
            pred['score']
            for pred in test_predictions
        ]
        predictions =  [
            label_map[pred['label']]
            for pred in test_predictions
        ]
        accuracies = [
            0 if prediction != y_true else 1 for (prediction, y_true)
            in zip(predictions, test_df["pred"])
        ]
        aurc, risk_list, coverage_list = compute_area_under_risk_coverage_age(confidences, accuracies)
        f1 = f1_score(test_df["pred"], predictions, average='micro')
        f1_none = f1_score(test_df["pred"], predictions, average=None)
        precision = precision_score(test_df["pred"], predictions,  average='micro')
        precision_none = precision_score(test_df["pred"], predictions,  average=None)
        recall = recall_score(test_df["pred"], predictions,  average='micro')
        recall_none = recall_score(test_df["pred"], predictions,  average=None)
        acc = accuracy_score(test_df["pred"], predictions)
        data = {
            "model": model['model'],
            "name": model['name'],
            "aurc": aurc,
            "f1_micro": f1,
            "f1_none": f1_none,
            "precision": precision,
            "precision": precision_none,
            "recall": recall,
            "recall_none": recall_none,
            "acc": acc,
            "dataset": dataset,
            "risk_list": risk_list,
            "coverage_list": coverage_list,
            "label_distribution": list(zip([
                pred['label'] for pred in test_predictions
            ], accuracies))
        }
        results.append(data)

Getting results for multiclass


  0%|          | 0/100 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/100 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/100 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/100 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
results


[{'model': 'anon/deberta-v3-large-finetuned-synthetic-multi-class',
  'name': 'multiclass',
  'aurc': 0.0001,
  'f1_micro': 0.99,
  'f1_none': array([0.85714286, 1.        , 1.        , 0.        ]),
  'precision': array([1., 1., 1., 0.]),
  'recall': 0.99,
  'recall_none': array([0.75, 1.  , 1.  , 0.  ]),
  'acc': 0.99,
  'dataset': 'all',
  'risk_list': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0

In [57]:
pd.DataFrame(
    results
).to_csv('./multi_class_evaluation_results_smoke.csv')