In [1]:
import os

import random
import numpy as np
import pandas as pd

from transformers import AutoTokenizer

# from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoModelForSequenceClassification,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)


import torch
import torch.nn as nn
from torch.optim import AdamW, Adam
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration: RNG seed, the directory name used for training output and
# checkpoints, and the pretrained ids for the base model and its tokenizer.
seed = 1
model_checkpoint = "xlm-roberta-base-full-training"
model_name = tokenizer_name = "xlm-roberta-base"

In [3]:
# Resume from a checkpoint stored under the training output directory.
# To start from the pretrained base weights instead, load `model_name` here.
# NOTE(review): the checkpoint directory must already exist from an earlier
# run; 2589 looks like one epoch of steps (20712 rows / batch size 8) -- confirm.
checkpoint_dir = f"{model_checkpoint}/checkpoint-2589"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=2)

In [4]:
# Load the tokenizer identified by `tokenizer_name` from the config cell.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [5]:
# Display the module tree of the loaded sequence-classification model.
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [6]:
def freeze_params(model):
    """Disable gradients for every parameter outside the classifier head.

    Parameters whose qualified name starts with ``"classifier"`` are left
    untouched; all others get ``requires_grad = False``. The model is
    modified in place and nothing is returned.
    """
    for param_name, tensor in model.named_parameters():
        if param_name.startswith("classifier"):
            continue
        tensor.requires_grad = False

In [7]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` with the same value (as a string).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

In [8]:
# Seed from the config value so this call and TrainingArguments(seed=seed)
# cannot drift apart when the seed is changed.
seed_everything(seed)

In [9]:
# freeze_params(model) is not called here.
# Move the model to the GPU when CUDA is available; otherwise keep it as is.
use_gpu = torch.cuda.is_available()
model = model.to("cuda") if use_gpu else model

In [10]:
# Read the three data splits from disk.
train, test, valid = map(
    pd.read_csv,
    (
        "data/original/train.csv",
        "data/original/test.csv",
        "data/original/validation.csv",
    ),
)

In [11]:
# Integer class ids assigned to the two account types.
mapping = dict(human=0, bot=1)

In [12]:
# Number of training texts, shown as a 1-D shape tuple.
train["text"].shape

(20712,)

In [13]:
# Add an integer `label` column to each split by looking up `account.type`
# in `mapping`; an account type missing from `mapping` raises KeyError.
for frame in (train, test, valid):
    frame["label"] = frame["account.type"].apply(mapping.__getitem__)

In [14]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one table row per lookup.

    Args:
        data: table with ``text`` and ``label`` columns, accessed
            positionally through ``.iloc``.
        tokenizer: callable invoked on the raw text; its result must expose
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, index):
        """Return raw text, flattened token tensors and the label for row ``index``."""
        row = self.data.iloc[index]
        tweet, target = row["text"], row["label"]

        # Tokenization happens lazily, one sample at a time.
        encoded = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Flatten so each field is 1-D (the encoding is presumably
        # (1, max_len) because of return_tensors="pt" -- confirm).
        return {
            "text": tweet,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "label": torch.tensor(target, dtype=torch.long),
        }

In [15]:
# Per-device batch size and the learning rate handed to the optimizer.
batch_size, lr = 8, 2e-5

In [16]:
# Wrap each split in a TweetDataset sharing the tokenizer and a 512 max length.
ds_train, ds_test, ds_valid = (
    TweetDataset(split, tokenizer=tokenizer, max_len=512)
    for split in (train, test, valid)
)

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    Returns ``{"accuracy": value}`` where ``value`` is a Python float equal
    to the float32 mean of the element-wise matches.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = np.equal(predicted, targets).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [18]:
# Evaluate and checkpoint once per epoch, and reload the best checkpoint when
# training ends. metric_for_best_model is left unset, so the library default
# decides which checkpoint counts as "best".
args = TrainingArguments(
    output_dir=model_checkpoint,
    seed=seed,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # save best and last
    load_best_model_at_end=True,
)

In [19]:
# Build the optimizer explicitly; the scheduler slot is passed as None.
optimizer = AdamW(model.parameters(), lr=lr)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # (optimizer, scheduler)
    # To customise the loss, subclass Trainer and override
    # compute_loss(self, model, inputs, return_outputs=False).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [22]:
# Fine-tune the model: up to 10 epochs, with early stopping (patience 3).
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3148,0.303349,0.870982
2,0.289,0.408895,0.880539
3,0.2761,0.503136,0.890964
4,0.1819,0.547583,0.877063


TrainOutput(global_step=10356, training_loss=0.27838994067071626, metrics={'train_runtime': 5506.4317, 'train_samples_per_second': 37.614, 'train_steps_per_second': 4.702, 'total_flos': 2.179822471446528e+16, 'train_loss': 0.27838994067071626, 'epoch': 4.0})

In [20]:
# Run inference on the test split; the result carries predictions, label ids
# and metrics.
# NOTE(review): the saved execution counts show this cell ran (In [20]) before
# trainer.train() (In [22]), so the stored outputs may describe the loaded
# checkpoint rather than the retrained model -- re-run top to bottom to confirm.
test_results = trainer.predict(ds_test)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [21]:
# Inspect the full prediction output (per-class scores, label ids, metrics).
test_results

PredictionOutput(predictions=array([[ 0.10679162, -0.16207664],
       [ 2.8623366 , -2.9953113 ],
       [-0.45227817,  0.7512854 ],
       ...,
       [-0.22327916,  0.38776678],
       [ 2.8765996 , -3.0160606 ],
       [ 2.6537564 , -2.8882673 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 0, 0]), metrics={'test_loss': 0.2852252125740051, 'test_accuracy': 0.8713839054107666, 'test_runtime': 46.1645, 'test_samples_per_second': 55.411, 'test_steps_per_second': 6.932})

In [23]:
# Show only the metrics dict reported by trainer.predict.
test_results.metrics

{'test_loss': 0.2852252125740051,
 'test_accuracy': 0.8713839054107666,
 'test_runtime': 46.1645,
 'test_samples_per_second': 55.411,
 'test_steps_per_second': 6.932}

In [24]:
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)


def proba_to_pred(proba, threshold=0.5):
    """Binarise probabilities into 0/1 predictions.

    Args:
        proba: array-like of probabilities. Plain lists and tuples are
            converted to a NumPy array first; other inputs (NumPy arrays,
            pandas objects, NumPy scalars) are compared as they are.
        threshold: a probability strictly greater than this value maps to 1,
            anything else to 0. Defaults to 0.5, the previous fixed cut-off.

    Returns:
        Integer predictions with the same shape as ``proba``.
    """
    # Built-in sequences do not support element-wise ``>``, so promote them.
    if isinstance(proba, (list, tuple)):
        proba = np.asarray(proba)
    return (proba > threshold).astype(int)


def calculate_metrics(y_true, y_pred):
    """Score predictions with balanced accuracy, F1, precision and recall.

    Each scorer is called as ``scorer(y_true, y_pred)`` with its default
    settings. Returns a dict keyed by ``"balanced_accuracy"``,
    ``"f1_score"``, ``"precision"`` and ``"recall"``.
    """
    scorers = {
        "balanced_accuracy": balanced_accuracy_score,
        "f1_score": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers.items()}

In [25]:
# Take the highest-scoring class per row, then score against the test labels.
test_pred = test_results.predictions.argmax(axis=1)
calculate_metrics(test["label"].to_numpy(), test_pred)

{'balanced_accuracy': 0.8713382726917058,
 'f1_score': 0.8785529715762274,
 'precision': 0.8327501749475158,
 'recall': 0.9296875}