In [50]:
import os

import random
import numpy as np
import pandas as pd

from transformers import AutoTokenizer

# from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoModelForSequenceClassification,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)


import torch
import torch.nn as nn
from torch.optim import AdamW, Adam
from torch.utils.data import Dataset, DataLoader

In [2]:
# Run configuration shared by the cells below.
# `seed` is forwarded to TrainingArguments(seed=...).
seed = 1
# Checkpoint loaded by AutoModelForSequenceClassification.from_pretrained; also
# used as the prefix of the training output directory.
model_name = "xlm-roberta-base"
# Checkpoint name handed to AutoTokenizer.from_pretrained.
tokenizer_name = "xlm-roberta-base"

In [3]:
# Load the configured checkpoint with a 2-label sequence-classification head.
# The log printed by this cell reports the `classifier.*` weights as newly
# initialized, i.e. they still have to be trained.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Tokenizer for the checkpoint named in `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [5]:
# Display the module tree; parameters whose names start with "classifier" are
# the ones freeze_params leaves untouched.
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [6]:
def freeze_params(model):
    """Turn off gradient tracking for everything outside the classifier head.

    Every parameter whose qualified name does not start with ``"classifier"``
    gets ``requires_grad = False``. Parameters whose names do start with
    ``"classifier"`` are left exactly as they were. The model is modified in
    place and nothing is returned.
    """
    backbone_weights = (
        weight
        for qualified_name, weight in model.named_parameters()
        if not qualified_name.startswith("classifier")
    )
    for weight in backbone_weights:
        weight.requires_grad = False

In [7]:
def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    setting it here presumably only influences child processes started
    afterwards - confirm if hash determinism matters for this run.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

In [24]:
# Seed all RNGs from the shared `seed` setting (the same value that is passed
# to TrainingArguments) instead of a separate hard-coded literal, so the two
# cannot drift apart.
# NOTE(review): in file order the model is instantiated before this call, so
# on a top-to-bottom run the newly initialized classifier weights are
# presumably not covered by this seed - confirm and reorder if that matters.
seed_everything(seed)

In [25]:
# Freeze everything except the classifier head, then move the model to the GPU
# when CUDA is available (otherwise it stays where it is).
freeze_params(model)
model = model.to("cuda") if torch.cuda.is_available() else model

In [8]:
# Read the train, test and validation CSV files into DataFrames (in that order).
train, test, valid = map(
    pd.read_csv,
    (
        "data/original/train.csv",
        "data/original/test.csv",
        "data/original/validation.csv",
    ),
)

In [9]:
# Integer class ids for the values looked up from the "account.type" column.
mapping = {"human": 0, "bot": 1}

In [10]:
# Sanity check: shape of the training "text" column (one entry per row).
train["text"].values.shape

(20712,)

In [12]:
# Add an integer "label" column to every split by looking each "account.type"
# value up in `mapping`; an unknown value raises KeyError.
train["label"] = train["account.type"].apply(mapping.__getitem__)
test["label"] = test["account.type"].apply(mapping.__getitem__)
valid["label"] = valid["account.type"].apply(mapping.__getitem__)

In [13]:
# Re-create the tokenizer from the shared `tokenizer_name` setting rather than
# a repeated string literal, so it cannot drift from the configured checkpoint.
# NOTE(review): this repeats the tokenizer load done earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [14]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one row of a table per lookup.

    Args:
        data: object supporting ``len()`` and ``.iloc[index]``; each selected
            row must expose ``text`` and ``label`` attributes.
        tokenizer: callable invoked as ``tokenizer(text, **options)`` whose
            result provides ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: forwarded to the tokenizer as ``max_length``; sequences are
            padded to and truncated at this length.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the raw text, flattened token tensors and label of one row."""
        row = self.data.iloc[index]
        tweet, target = row.text, row.label

        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_len,
            "return_token_type_ids": False,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(tweet, **tokenizer_options)

        # The tokenizer output is flattened to drop its leading batch axis.
        return {
            "text": tweet,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "label": torch.tensor(target, dtype=torch.long),
        }

In [33]:
# Per-device batch size, used for both training and evaluation.
batch_size = 8
# Learning rate given to the AdamW optimizer that is passed to the Trainer.
lr = 2e-5

In [16]:
# Wrap each split in a TweetDataset that pads/truncates to 512 tokens.
ds_train, ds_test, ds_valid = (
    TweetDataset(split_frame, tokenizer=tokenizer, max_len=512)
    for split_frame in (train, test, valid)
)

In [22]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of ``logits``.
    Returns ``{"accuracy": value}``, where ``value`` is a Python float equal to
    the float32 mean of the per-example matches.
    """
    scores, targets = eval_pred
    hits = np.argmax(scores, axis=-1) == targets
    return dict(accuracy=hits.astype(np.float32).mean().item())

In [30]:
# Training configuration: evaluate and checkpoint once per epoch, and restore
# the best checkpoint when training ends.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    seed=seed,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # save best and last
    load_best_model_at_end=True,
    # metric_for_best_model is not set here, so the library default applies.
)

In [35]:
# Build the Trainer: train on ds_train, evaluate on ds_valid, and stop early
# after 3 evaluations without improvement.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # (optimizer, scheduler) pair; no scheduler is supplied here.
    optimizers=(AdamW(model.parameters(), lr=lr), None),
    # To customise the loss, create a custom Trainer subclass and override
    # compute_loss(self, model, inputs, return_outputs=False).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [36]:
# Run fine-tuning with the configuration above; the table printed below is the
# per-epoch training loss, validation loss and accuracy.
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5782,0.567648,0.735013
2,0.529,0.508257,0.748914
3,0.5329,0.485394,0.762381
4,0.5046,0.484378,0.757602
5,0.5034,0.462335,0.764987
6,0.4763,0.466197,0.765856
7,0.4722,0.456325,0.769331
8,0.4776,0.456408,0.771503
9,0.4896,0.453334,0.771937
10,0.4699,0.454347,0.773675


TrainOutput(global_step=25890, training_loss=0.507957960049956, metrics={'train_runtime': 4542.0297, 'train_samples_per_second': 45.601, 'train_steps_per_second': 5.7, 'total_flos': 5.44955617861632e+16, 'train_loss': 0.507957960049956, 'epoch': 10.0})

In [37]:
# Evaluate on the eval_dataset given to the Trainer (ds_valid).
# The recorded eval_loss/eval_accuracy equal the epoch-9 row of the training
# log (the lowest validation loss), consistent with load_best_model_at_end=True.
trainer.evaluate()

{'eval_loss': 0.45333370566368103,
 'eval_accuracy': 0.7719374299049377,
 'eval_runtime': 41.4939,
 'eval_samples_per_second': 55.478,
 'eval_steps_per_second': 6.941,
 'epoch': 10.0}

In [38]:
# Run the trained model on the test split; the result holds the per-class
# scores (`predictions`), `label_ids` and `metrics`.
test_results = trainer.predict(ds_test)

In [39]:
# Inspect the full PredictionOutput (predictions, label_ids, metrics).
test_results

PredictionOutput(predictions=array([[-0.60016686,  0.34100235],
       [ 1.3083602 , -1.5889796 ],
       [ 0.08203287, -0.34019834],
       ...,
       [-0.6210207 ,  0.3922635 ],
       [ 1.3584753 , -1.6362598 ],
       [ 0.42886525, -0.6967438 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 0, 0]), metrics={'test_loss': 0.44171246886253357, 'test_accuracy': 0.7810789942741394, 'test_runtime': 46.0146, 'test_samples_per_second': 55.591, 'test_steps_per_second': 6.954})

In [40]:
# Show only the aggregate test metrics (loss, accuracy, runtime statistics).
test_results.metrics

{'test_loss': 0.44171246886253357,
 'test_accuracy': 0.7810789942741394,
 'test_runtime': 46.0146,
 'test_samples_per_second': 55.591,
 'test_steps_per_second': 6.954}

In [70]:
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)


def proba_to_pred(proba, threshold=0.5):
    """Binarize probabilities: 1 where ``proba > threshold``, otherwise 0.

    Args:
        proba: array-like supporting elementwise ``>`` and ``.astype``
            (for example a NumPy array).
        threshold: decision cut-off, compared strictly (a value equal to the
            threshold maps to 0). Defaults to 0.5, the previously hard-coded
            value, so existing callers are unaffected.

    Returns:
        An object of the same shape as ``proba`` with integer 0/1 entries.
    """
    return (proba > threshold).astype(int)


def calculate_metrics(y_true, y_pred):
    """Score predictions against the true labels with four sklearn metrics.

    Returns a dict with the keys ``"balanced_accuracy"``, ``"f1_score"``,
    ``"precision"`` and ``"recall"``, each computed by the corresponding
    sklearn function called as ``scorer(y_true, y_pred)``.
    """
    scorers = {
        "balanced_accuracy": balanced_accuracy_score,
        "f1_score": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers.items()}

In [71]:
# Turn the per-class scores into class ids with argmax, then score them
# against the test labels.
calculate_metrics(test.label.values, np.argmax(test_results.predictions, axis=1))

{'balanced_accuracy': 0.7810959507042254,
 'f1_score': 0.7763578274760383,
 'precision': 0.7941176470588235,
 'recall': 0.759375}