In [None]:
pip install transformers==4.18.0

In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoModel, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import torch

In [None]:
# Device index handed to the transformers pipeline: 0 when CUDA is available, otherwise -1.
DEVICE = -1 if not torch.cuda.is_available() else 0
# Column holding the text passed to the classifier, and column holding the 0/1 target.
INPUT_COL_NAME, TARGET_COL_NAME = 'text', 'fake'

In [None]:
# Passages dataset whose rows also feed the per-tool evaluation slices further down.
known_df = pd.read_csv(
    filepath_or_buffer="../input/detectinggeneratedscientificaugmented/real_and_fake_passages_dataset_test.csv"
)
# Public competition file; it gets its provenance columns assigned in a later cell.
unknown_df = pd.read_csv(
    filepath_or_buffer="../input/detecting-generated-scientific-papers/fake_papers_train_part_public.csv"
)

In [None]:
# Rename the passage column to 'text', the column name the classifier input is read from.
known_df = known_df.rename({'passages': 'text'}, axis='columns')
# Preview a single row to check the resulting schema.
known_df.head(n=1)

In [None]:
# Provenance columns for the public file: rows flagged fake (fake == 1) come from an
# unknown generator, every other row is tagged 'real'. The `tool` column is what the
# evaluation loop later filters on to build per-dataset slices.
unknown_df['type'] = np.where(unknown_df['fake'] == 1, 'unknown', 'real')
unknown_df['model'] = np.where(unknown_df['fake'] == 1, 'unknown', 'real')
unknown_df['tool'] = np.where(unknown_df['fake'] == 1, 'unknown', 'real')

In [None]:
# Stack both sources row-wise into a single evaluation frame; original row labels are kept as-is.
test_df = pd.concat(objs=[unknown_df, known_df], axis=0)

In [None]:
# Checkpoints to evaluate. "name" is the short tag reported in the results,
# "model" is the identifier passed to the transformers pipeline.
models = [
    dict(name="unknown", model="anon/deberta-v3-large-finetuned-dagpap22-only"),
    dict(name="translate", model="anon/deberta-v3-large-finetuned-synthetic-translated-only"),
    dict(name="generate", model="anon/deberta-v3-large-finetuned-synthetic-generated-only"),
    dict(name="paraphrase", model="anon/deberta-v3-large-finetuned-synthetic-paraphrase-only"),
    dict(name="all", model="anon/deberta-v3-large-finetuned-DAGPap22-synthetic-all"),
]
# Evaluation slices: 'all' uses the whole test frame, every other entry is
# matched against the `tool` column.
datasets = [
    'unknown',
    'all',
    'generate',
    'paraphrase',
    'translate',
]

In [None]:
def get_predictions(model, texts, batch_size=1, max_length=256):
    """Run a classification pipeline over `texts` and collect its outputs.

    Args:
        model: Callable pipeline. It is invoked with a `KeyDataset` plus the
            `batch_size`, `truncation=True` and `max_length` keyword arguments,
            and its return value is iterated to obtain one output per text.
        texts: Texts to classify, as a pandas Series or any other iterable.
            Missing values are replaced with an empty string.
        batch_size: Batch size forwarded to the pipeline (default 1).
        max_length: Truncation length forwarded to the pipeline (default 256).

    Returns:
        list: The pipeline outputs, in the order the pipeline yields them.
    """
    if not isinstance(texts, pd.Series):
        # dtype=object keeps an empty input from being inferred as float64.
        texts = pd.Series(list(texts), dtype=object)
    ds = Dataset.from_dict({
        "texts": list(texts.fillna(''))
    })
    outputs = model(
        KeyDataset(ds, "texts"),
        batch_size=batch_size,
        truncation=True,
        max_length=max_length,
    )
    # tqdm only wraps the iteration for progress reporting; list() drains it.
    return list(tqdm(outputs, total=len(ds)))

In [None]:
import numpy as np


def coverage_risk(confidences: np.ndarray, accuracies: np.ndarray):
    """Build the risk-coverage curve for a set of scored predictions.

    Samples are ranked by confidence, highest first (ties keep their input
    order). After each of the top-k samples the function records the coverage
    k / n and the risk, i.e. the fraction of those k samples whose accuracy
    entry equals 0.

    Args:
        confidences: Per-sample confidence scores.
        accuracies: Per-sample correctness flags; an entry equal to 0 counts
            as an error, anything else as correct.

    Returns:
        tuple: `(risk_list, coverage_list)`, two lists with one entry per
        sample. Both are empty when the input is empty.
    """
    # From https://github.com/sleep3r/garrus/blob/a6fd1d44b06285918cefe54f421a004dc6f315cb/garrus/metrics/aurc.py
    ranked = sorted(zip(confidences, accuracies), key=lambda pair: pair[0], reverse=True)
    total = len(ranked)
    risk_list = []
    coverage_list = []
    errors = 0
    # An empty ranking simply skips the loop and yields two empty lists.
    for kept, (_, correct) in enumerate(ranked, start=1):
        coverage_list.append(kept / total)
        if correct == 0:
            errors += 1
        risk_list.append(errors / kept)
    return risk_list, coverage_list


def compute_area_under_risk_coverage_age(confidences: np.ndarray, accuracies: np.ndarray) -> tuple:
    r"""
    Area Under Risk-Coverage.
    $$ AURC (\kappa, f \mid V_{n}) = \frac{1}{n} \sum_{\theta \in \Theta} \hat{r} (f, g_{\theta} \mid V_{n}) $$
    From https://github.com/sleep3r/garrus/blob/a6fd1d44b06285918cefe54f421a004dc6f315cb/garrus/metrics/aurc.py

    Args:
        confidences: Per-sample confidence scores.
        accuracies: Per-sample correctness flags, forwarded unchanged to
            `coverage_risk` together with `confidences`.

    Returns:
        tuple: `(aurc, risk_list, coverage_list)` where `aurc` is the mean of
        `risk_list` as a float, and the two lists are the risk-coverage curve
        returned by `coverage_risk`. `aurc` is 0.0 when the curve is empty.
    """
    risk_list, coverage_list = coverage_risk(confidences, accuracies)
    if not risk_list:
        # No points on the curve: report zero area instead of dividing by zero.
        return 0.0, risk_list, coverage_list

    # Every curve point has weight 1/n, so the area is the mean risk.
    aurc = sum(risk_list) / len(risk_list)
    return float(aurc), risk_list, coverage_list


In [None]:
# Evaluate every checkpoint on every dataset slice; one metrics record is
# appended to `results` per (model, dataset) pair.
results = []
# Pipeline label treated as the "real" class (target 0); any other label maps to 1 (fake).
real_label = 'LABEL_0'
for model in models:
    print(f"Getting results for {model['name']}")
    mdl = pipeline("text-classification", model['model'], device=DEVICE)

    # Every dataset slice below is a subset of test_df, so the whole frame is
    # scored once per model and the cached outputs are sliced afterwards
    # instead of re-running inference for each slice.
    all_predictions = get_predictions(mdl, test_df[INPUT_COL_NAME])
    assert len(all_predictions) == len(test_df), "expected one prediction per test row"

    for dataset in datasets:
        print(f"Getting results for dataset type {dataset}")
        if dataset == 'all':
            df = test_df
            test_predictions = all_predictions
        else:
            # Rows produced by the requested tool plus every row tagged 'real'.
            row_mask = (test_df['tool'] == dataset) | (test_df['tool'] == 'real')
            df = test_df.loc[row_mask]
            # Predictions are positional, so they are filtered with the same mask in row order.
            test_predictions = [
                pred
                for pred, keep in zip(all_predictions, row_mask.tolist())
                if keep
            ]

        # Score the pipeline attached to the label it predicted.
        confidences = [
            pred['score']
            for pred in test_predictions
        ]
        predictions = [
            0 if pred['label'] == real_label else 1
            for pred in test_predictions
        ]
        # 1 where the predicted class matches the target, 0 otherwise.
        accuracies = [
            0 if prediction != y_true else 1 for (prediction, y_true)
            in zip(predictions, df[TARGET_COL_NAME])
        ]
        aurc, risk_list, coverage_list = compute_area_under_risk_coverage_age(confidences, accuracies)
        f1 = f1_score(df[TARGET_COL_NAME], predictions)
        precision = precision_score(df[TARGET_COL_NAME], predictions)
        recall = recall_score(df[TARGET_COL_NAME], predictions)
        acc = accuracy_score(df[TARGET_COL_NAME], predictions)
        results.append({
            "model": model['model'],
            "name": model['name'],
            "aurc": aurc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "acc": acc,
            "dataset": dataset,
            "risk_list": risk_list,
            "coverage_list": coverage_list,
            "predictions": predictions
        })
    # Drop the pipeline and release cached GPU memory before loading the next checkpoint.
    del mdl, all_predictions
    torch.cuda.empty_cache()

In [None]:
# Show only the scalar metrics per (model, dataset) pair. Each record also holds
# per-sample lists (risk_list, coverage_list, predictions) that would flood the output.
pd.DataFrame(results, columns=["name", "dataset", "aurc", "f1", "precision", "recall", "acc"])

In [None]:
# Persist every evaluation record, per-sample lists included, as a CSV next to the notebook.
pd.DataFrame(data=results).to_csv(path_or_buf='./across_dataset_evaluation_results.csv')