In [1]:
import os

import random
import numpy as np
import pandas as pd

from transformers import AutoTokenizer

# from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoModelForSequenceClassification,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)


import torch
import torch.nn as nn
from torch.optim import AdamW, Adam
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Readable aliases for the three plain-string fields describing one model.
Checkpoint = ModelName = TokenizerName = str

# One model entry: (checkpoint, model name, tokenizer name), in the order in
# which evaluate_model unpacks it.
ModelMetadata = tuple[Checkpoint, ModelName, TokenizerName]

In [4]:
# Models to evaluate, as (checkpoint, model name, tokenizer name) triples.
# Every checkpoint listed here uses the same name for its model and its
# tokenizer, so the triples are expanded from (checkpoint, base name) pairs.
models_metadata: list[ModelMetadata] = [
    (checkpoint_dir, base_name, base_name)
    for checkpoint_dir, base_name in (
        ("xlm-roberta-base-finetuned", "xlm-roberta-base"),
        ("xlm-roberta-base-full-training", "xlm-roberta-base"),
        ("xlm-roberta-base-full-training-2", "xlm-roberta-base"),
        ("distilbert-base-uncased-full-training", "distilbert-base-uncased"),
        (
            "distilbert-base-uncased-full-training-gpt2-data-with-twitter",
            "distilbert-base-uncased",
        ),
        ("gpt2-full-training", "gpt2"),
    )
]

In [5]:
# Integer encoding of the `account.type` strings used as the classification target.
mapping = {"human": 0, "bot": 1}

# Read the test split and attach the integer `label` column in one expression.
# Looking the value up with mapping[...] raises KeyError for an account type
# that is not listed in `mapping`.
test = pd.read_csv("data/original/test.csv").assign(
    label=lambda frame: frame["account.type"].apply(lambda kind: mapping[kind])
)

In [6]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one row of a DataFrame per item.

    Args:
        data: DataFrame with a ``text`` column and a ``label`` column. Rows are
            addressed by position (``iloc``), not by index label.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors (Hugging Face style).
        max_len: Length every encoded sequence is padded / truncated to.
        model: Model that will consume the items. It is only touched when the
            tokenizer has no pad token: a ``[PAD]`` token is then added to the
            tokenizer, the model's embeddings are resized to the new vocabulary
            size, and the model's ``config.pad_token_id`` is filled in if it
            was unset.
    """

    def __init__(self, data, tokenizer, max_len, model):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            model.resize_token_embeddings(len(self.tokenizer))

            # Keep model and tokenizer in agreement about padding. Models such
            # as GPT-2 classification heads use config.pad_token_id to locate
            # the last real token of each row. An id that is already configured
            # (e.g. stored in a fine-tuned checkpoint) is left untouched.
            config = getattr(model, "config", None)
            if config is not None and getattr(config, "pad_token_id", None) is None:
                config.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with the raw ``text``, 1-D ``input_ids`` and ``attention_mask``
            tensors of length ``max_len``, and the ``label`` as a long tensor.
        """
        row = self.data.iloc[index]
        text = row["text"]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return dict(
            text=text,
            # The tokenizer returns tensors with a leading batch axis of size
            # one; flatten them so a collator can stack items into a batch.
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            label=torch.tensor(row["label"], dtype=torch.long),
        )

In [7]:
def seed_everything(seed):
    """Seed the global RNGs of ``random``, NumPy and PyTorch.

    Also exports ``PYTHONHASHSEED``. Setting it at runtime does not change the
    hash seed of the interpreter that is already running; it is only picked up
    by Python processes started afterwards.

    Args:
        seed: Seed value applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)


# Single seed for the whole notebook; evaluate_model reads it as a global too.
seed = 1
seed_everything(seed)

In [8]:
def calculate_metrics(y_true, y_pred):
    """Score predictions with the four summary metrics used in this notebook.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``balanced_accuracy``, ``f1_score``, ``precision``
        and ``recall`` (in that order), each holding the value returned by the
        scikit-learn scorer of the same name called as ``scorer(y_true, y_pred)``.
    """
    scorers = {
        "balanced_accuracy": balanced_accuracy_score,
        "f1_score": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

In [9]:
# Module-level hyper-parameters: per-device batch size and learning rate.
batch_size, lr = 8, 2e-5

In [10]:
def _pick_best_checkpoint_dir(checkpoint: str) -> str:
    """Return the path of the saved checkpoint to load from ``checkpoint``.

    ``os.listdir`` returns entries in arbitrary order, so the choice is made
    explicitly instead of taking the first listed entry:

    * only sub-directories are considered;
    * directories named ``checkpoint-<step>`` are preferred and the one with
      the lowest step wins. This assumes only the best and the last checkpoint
      were kept, so the earlier of the two is the best one;
    * if no directory follows that naming, the alphabetically first
      sub-directory is used.

    Raises:
        FileNotFoundError: If ``checkpoint`` contains no sub-directory.
    """
    subdirs = [
        entry
        for entry in os.listdir(checkpoint)
        if os.path.isdir(os.path.join(checkpoint, entry))
    ]
    if not subdirs:
        raise FileNotFoundError(
            f"No saved checkpoint directories found in {checkpoint!r}"
        )

    def step_of(entry):
        # "checkpoint-500" -> 500; anything else -> None.
        prefix, _, step = entry.rpartition("-")
        if prefix == "checkpoint" and step.isdecimal():
            return int(step)
        return None

    stepped = [entry for entry in subdirs if step_of(entry) is not None]
    # Compare steps numerically: as strings "checkpoint-1000" would sort
    # before "checkpoint-500".
    chosen = min(stepped, key=step_of) if stepped else min(subdirs)
    return os.path.join(checkpoint, chosen)


def evaluate_model(
    model_metadata: "ModelMetadata",
    test_data: pd.DataFrame,
    bot_idx: dict[str, pd.Index],
) -> tuple[list[dict], list[dict]]:
    """
    Evaluate the performance of a sequence classification model on test data.

    Args:
        model_metadata (ModelMetadata): Metadata of the model, including checkpoint, model name, and tokenizer name.
            The checkpoint is a directory holding the saved checkpoints; the model name is not used here.
        test_data (pd.DataFrame): Test data for evaluation; must provide the ``text`` and ``label`` columns.
        bot_idx (dict[str, pd.Index]): Dictionary mapping bot names to the positions of their rows in the
            test data. The values index the prediction array directly, so they must be positional.

    Returns:
        tuple[list[dict], list[dict]]: A tuple containing two lists of dictionaries.
            - The first list contains overall evaluation results for the model on the entire test data.
            - The second list contains the accuracy for each bot type separately.

    Raises:
        FileNotFoundError: If the checkpoint directory contains no saved checkpoint sub-directory.
    """
    checkpoint, _model_name, tokenizer_name = model_metadata

    model = AutoModelForSequenceClassification.from_pretrained(
        _pick_best_checkpoint_dir(checkpoint),
        num_labels=2,
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Building the dataset may add a pad token and resize the model embeddings,
    # so it has to happen before the model is used.
    ds_test = TweetDataset(test_data, tokenizer=tokenizer, max_len=512, model=model)

    if torch.cuda.is_available():
        model = model.to("cuda")

    # The Trainer is only used for batched inference, so no optimizer,
    # early-stopping callback or epoch/save strategy is configured: none of
    # them influence predict().
    args = TrainingArguments(
        checkpoint,
        per_device_eval_batch_size=batch_size,
        seed=seed,
    )
    trainer = Trainer(model, args, tokenizer=tokenizer)
    test_results = trainer.predict(ds_test)

    y_true = test_data.label.values
    # Class with the highest logit per row; computed once and reused below.
    y_pred = np.argmax(test_results.predictions, axis=1)

    results = [
        {
            **calculate_metrics(y_true, y_pred),
            "model": checkpoint,
            "dataset": "raw",
        }
    ]

    results_grouped = [
        {
            "accuracy": accuracy_score(y_true[bot_val], y_pred[bot_val]),
            "model": checkpoint,
            "bot_type": bot_nm,
            "type": "raw",
        }
        for bot_nm, bot_val in bot_idx.items()
    ]
    return results, results_grouped

In [11]:
def evaluate_models(
    models_metadata: "list[ModelMetadata]", test_data: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluate multiple models on test data and return the results.

    Args:
        models_metadata (list[ModelMetadata]): A list of metadata for the models to be evaluated.
        test_data (pd.DataFrame): The test data to evaluate the models on. Must contain a
            ``class_type`` column, which defines the groups of the per-type results.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames:
            - results_pd: DataFrame with the evaluation results for each model.
            - results_grouped_pd: DataFrame with the grouped evaluation results for each model.
    """
    class_types = test_data["class_type"]

    # Row *positions* (not index labels) of every class type. evaluate_model
    # uses these to index NumPy arrays, so labels would select the wrong rows
    # (or raise IndexError) whenever test_data does not carry a default
    # 0..n-1 index, e.g. after filtering or sampling.
    bot_idx: dict[str, pd.Index] = {
        bot_type: pd.Index(np.flatnonzero((class_types == bot_type).to_numpy()))
        for bot_type in class_types.unique()
    }

    results = []
    results_grouped = []

    for model_metadata in models_metadata:
        model_results, model_results_grouped = evaluate_model(
            model_metadata, test_data, bot_idx
        )
        results.extend(model_results)
        results_grouped.extend(model_results_grouped)

    # Build each frame once from the collected records.
    return pd.DataFrame(results), pd.DataFrame(results_grouped)

In [12]:
# Evaluate every entry of `models_metadata` on the test split. `res` holds one
# row of overall metrics per model; `res_grouped` holds one accuracy row per
# model and `class_type`.
res, res_grouped = evaluate_models(models_metadata, test)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Persist both result tables. `DataFrame.to_csv` does not create missing parent
# directories, so make sure the target folder exists before writing.
os.makedirs("notebooks/results", exist_ok=True)

res.to_csv("notebooks/results/transformers.csv", index=False)

# NOTE(review): unlike the summary file above, this file is written WITH the
# DataFrame index as its first column (the original had `index=False`
# commented out). Kept as-is; confirm whether downstream readers rely on it.
res_grouped.to_csv("notebooks/results/results_in_depth-transformers.csv")