<a href="https://colab.research.google.com/github/Denev6/practice/blob/main/transformer/RoBERTa_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuned RoBERTa
- Base model: [EmoBERTa-large](https://github.com/tae898/erc)
- License of the base model's repository: [MIT LICENSE](https://github.com/tae898/erc/blob/main/LICENSE)

In [None]:
# Print the interpreter version of the runtime, so the recorded output documents the environment.
!python --version

Python 3.8.15


In [None]:
!pip install transformers

import nltk

nltk.download("punkt")

In [None]:
import os
import re
import gc
import warnings

import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, RobertaTokenizerFast, RobertaForSequenceClassification
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount("/content/drive")
warnings.filterwarnings("ignore")

In [None]:
def join_path(*args):
    """Build a path underneath the mounted Google Drive root.

    Args:
        *args: Path components appended, in order, to ``/content/drive/MyDrive``.

    Returns:
        str: The joined path, exactly as produced by ``os.path.join``.
    """
    drive_root = "/content/drive/MyDrive"
    components = (drive_root,) + args
    return os.path.join(*components)


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input CSV files live in the "data" folder under the Drive root.
TRAIN_CSV = join_path("data", "train.csv")
TEST_CSV = join_path("data", "test.csv")

# Settings read by the dataset, training and prediction cells.
ARGS = {
    # Pretrained checkpoint used for both the tokenizer and the classifier.
    "model": "tae898/emoberta-large",
    # Where the best checkpoint is written during training.
    "model_path": join_path("model.pth"),
    # Samples per DataLoader batch.
    "batch_size": 8,
    # Number of batches whose gradients are accumulated per optimizer step.
    "grad_step": 8,
    # Upper bound on training epochs.
    "epochs": 10,
    # Token length every utterance is padded / truncated to.
    "max_len": 128,
    # Initial learning rate handed to the optimizer.
    "lr": 3e-6,
    # Patience handed to EarlyStopping.
    "patience": 2,
}

## Utils

In [None]:
class EarlyStopping(object):
    """Checkpoint the model on every new best score and signal when to stop training.

    A score is an improvement only when it is strictly greater than the best
    score seen so far. Every other epoch (worse *or* equal) counts towards
    `patience`, so `counter` is always the number of epochs since the best one.

    Args:
        patience: Number of consecutive non-improving epochs after which
            `should_stop` returns True.
        save_path: File the best `state_dict` is written to and restored from.
    """

    def __init__(self, patience, save_path):
        # Start below every real score so the first epoch always writes a
        # checkpoint, even when its score is exactly 0.
        self._max_score = float("-inf")
        self._patience = patience
        self._path = save_path
        self.__counter = 0

    def should_stop(self, model, score):
        """Record `score` for the current epoch and report whether to stop.

        Args:
            model: Object exposing `state_dict()`; saved when `score` is a new best.
            score: Validation metric for the epoch (higher is better).

        Returns:
            bool: True once `patience` consecutive epochs failed to improve.
        """
        if score > self._max_score:
            self._max_score = score
            self.__counter = 0
            torch.save(model.state_dict(), self._path)
            return False

        # Ties (and NaN scores) are not improvements, so they use up patience too.
        self.__counter += 1
        return self.__counter >= self._patience

    def load(self, model):
        """Restore the best checkpoint into `model` and return it.

        The checkpoint is mapped to CPU first; `load_state_dict` then copies the
        values into the model's existing parameters on whatever device they
        live, so a checkpoint saved on GPU can also be restored on a CPU host.
        """
        state_dict = torch.load(self._path, map_location="cpu")
        model.load_state_dict(state_dict)
        return model

    @property
    def counter(self):
        """Number of epochs since the last improvement."""
        return self.__counter

In [None]:
class LabelEncoder(object):
    """Two-way mapping between emotion names and integer class ids.

    The class id of an emotion is its position in the internal name list.
    """

    def __init__(self):
        names = ("neutral", "joy", "surprise", "anger", "sadness", "disgust", "fear")
        self._targets = list(names)
        self.num_classes = len(names)

    def encode(self, label):
        """Return the class id of the emotion name `label`.

        Raises:
            ValueError: If `label` is not one of the known emotion names.
        """
        class_id = self._targets.index(label)
        return class_id

    def decode(self, label):
        """Return the emotion name stored at position `label`."""
        emotion = self._targets[label]
        return emotion

In [None]:
class DataProcessor(object):
    """Select the model's columns from a raw frame and optionally normalise the utterances.

    Args:
        label_encoder: Object whose `encode(label)` maps a raw "Target" value
            to an integer class id.
    """

    def __init__(self, label_encoder):
        self._twt_tokenizer = TweetTokenizer(
            preserve_case=True, strip_handles=True, reduce_len=True
        )
        self._label_encoder = label_encoder
        # Dash variants that are folded into a plain ASCII hyphen.
        self._char_dict = {
            "–": "-",
            "—": "-",
        }

    def process(self, data, mode, do_preprocess=False):
        """Return a re-indexed copy of `data` reduced to the columns needed for `mode`.

        Args:
            data: DataFrame with an "Utterance" column, plus "Target" for
                `mode="train"` or "ID" for `mode="test"`.
            mode: "train" yields ["Utterance", "Target"] with encoded targets;
                "test" yields ["ID", "Utterance"]. Any other value returns the
                re-indexed frame untouched.
            do_preprocess: When True, normalise every utterance with
                `_process`. Honoured for both "train" and "test".

        Returns:
            A new DataFrame with a fresh 0..n-1 index; `data` itself is not modified.
        """
        data = data.reset_index(drop=True)
        if mode not in ("train", "test"):
            return data

        if do_preprocess:
            data["Utterance"] = data["Utterance"].map(self._process)

        if mode == "train":
            data["Target"] = data["Target"].map(self._label_encoder.encode)
            return data.loc[:, ["Utterance", "Target"]]
        return data.loc[:, ["ID", "Utterance"]]

    def _process(self, sentence):
        """Normalise one utterance: unify dashes, tokenise, de-duplicate, re-join."""
        sentence = self.__unify_char(sentence)
        tokens = self._twt_tokenizer.tokenize(sentence)
        tokens = self.__shorten_repeated_tokens(tokens)
        sentence = self.__tokens_to_sentence(tokens)
        return sentence

    def __unify_char(self, sentence):
        """Replace every character listed in `_char_dict` with its substitute."""
        for char, new in self._char_dict.items():
            sentence = sentence.replace(char, new)
        return sentence

    def __shorten_repeated_tokens(self, tokens):
        """Drop repeated hyphen-separated parts inside each token ("no-no-no" -> "no").

        Tokens made only of hyphens ("-", "--") are kept as they are: splitting
        them yields nothing but empty parts, and de-duplicating those would
        erase the token and leave an empty string in the sentence.
        """
        shortened = []
        for token in tokens:
            if "-" in token:
                parts = token.split("-")
                if any(parts):
                    # dict.fromkeys keeps the first occurrence of each part, in order.
                    token = "-".join(dict.fromkeys(parts))
            shortened.append(token)
        return shortened

    def __tokens_to_sentence(self, tokens):
        """Join tokens with spaces, then tighten spacing around quotes and punctuation."""
        sentence = " ".join(tokens)

        # Remove the padding spaces just inside a pair of quotes: `" hi "` -> `"hi"`.
        patterns = re.findall(r"\" .*? \"", sentence)
        patterns += re.findall(r"“ .*? ”", sentence)
        patterns += re.findall(r"' .*? '", sentence)
        for pattern in patterns:
            sentence = sentence.replace(
                pattern, "".join([pattern[0], pattern[2:-2], pattern[-1]])
            )

        # Attach a space-delimited run of non-word characters to the preceding
        # word by dropping the space in front of it: "way , really" -> "way, really".
        patterns = re.findall(r" \W+ ", sentence)
        for pattern in patterns:
            sentence = sentence.replace(pattern, pattern.lstrip())
        return sentence

In [None]:
class RoBERTaDataset(Dataset):
    """Map-style dataset that tokenises one utterance per item.

    Args:
        data: DataFrame with an "Utterance" column, plus a "Target" column
            when `mode="train"`.
        roberta_tokenizer: Callable tokenizer; invoked with `max_length`,
            `padding="max_length"`, `truncation=True` and `return_tensors="pt"`,
            and expected to return "input_ids" and "attention_mask".
        max_length: Length every sequence is padded / truncated to.
        mode: "train" makes items carry the target as a third element; any
            other value yields inputs only.
    """

    def __init__(
        self,
        data,
        roberta_tokenizer,
        max_length=512,
        mode=None,
    ):
        self._dataset = data
        self._roberta_tokenizer = roberta_tokenizer
        self._max_length = max_length
        self._mode = mode

    def __len__(self):
        return len(self._dataset)

    def __getitem__(self, idx):
        """Return `(input_ids, attention_mask[, target])` for the row at position `idx`."""
        # Samplers hand out positions in range(len(self)), so rows are looked up
        # positionally; a label lookup only works when the frame happens to
        # carry a 0..n-1 index.
        text = self._dataset["Utterance"].iloc[idx]
        inputs = self._roberta_tokenizer(
            text,
            max_length=self._max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output has a leading batch axis of size one; drop it.
        input_ids = inputs["input_ids"][0]
        attention_mask = inputs["attention_mask"][0]

        if self._mode == "train":
            y = self._dataset["Target"].iloc[idx]
            return input_ids, attention_mask, y
        return input_ids, attention_mask

## Dataset

In [None]:
# Shared helpers: label <-> id mapping, text normalisation, and the checkpoint's tokenizer.
label_encoder = LabelEncoder()
processor = DataProcessor(label_encoder)
# Truncation is requested where the tokenizer is called (inside RoBERTaDataset), which is
# where that option takes effect, so it is not passed to `from_pretrained`.
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(ARGS["model"])

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
train_csv = pd.read_csv(TRAIN_CSV)
split_options = {"test_size": 0.2, "shuffle": True, "random_state": 32}
df_train, df_val = train_test_split(train_csv, **split_options)

In [None]:
# Labels are encoded for both splits; only the training utterances are text-normalised.
# NOTE(review): validation (and, further down, test) utterances are left un-normalised —
# confirm this asymmetry is intentional.
df_train = processor.process(df_train, mode="train", do_preprocess=True)
df_val = processor.process(df_val, mode="train")

train_set = RoBERTaDataset(
    df_train,
    roberta_tokenizer,
    max_length=ARGS["max_len"],
    mode="train",
)
val_set = RoBERTaDataset(
    df_val,
    roberta_tokenizer,
    max_length=ARGS["max_len"],
    mode="train",
)

# Reshuffle the training batches every epoch so the model does not see the same batch
# composition and order each time; the seeded generator keeps the shuffling repeatable.
# Validation keeps a fixed order.
train_dataloader = DataLoader(
    train_set,
    batch_size=ARGS["batch_size"],
    shuffle=True,
    generator=torch.Generator().manual_seed(32),
)
val_dataloader = DataLoader(val_set, batch_size=ARGS["batch_size"])

## Train

In [None]:
def evaluate(model, criterion, val_loader, device, mode=None):
    """Run `model` over `val_loader` without gradients and score its predictions.

    Args:
        model: Classifier called as `model(input_ids, attention_mask)`; its
            output must expose `.logits`.
        criterion: Loss applied to `(logits, labels)` for every batch.
        val_loader: Iterable of `(input_ids, attention_mask, label)` batches.
        device: Device the batch tensors are moved to.
        mode: "train" returns the per-batch losses; any other value returns
            the accuracy instead.

    Returns:
        `(batch_losses, macro_f1)` when `mode == "train"`, where `batch_losses`
        is a list with one float per batch; otherwise `(accuracy, macro_f1)`.
    """
    model.eval()

    batch_losses, predictions, references = [], [], []

    with torch.no_grad():
        for input_ids, attention_mask, label in val_loader:
            label = label.to(device)
            logits = model(input_ids.to(device), attention_mask.to(device)).logits

            batch_losses.append(criterion(logits, label.long()).item())
            predictions.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            references.extend(label.detach().cpu().numpy().tolist())

    val_f1 = f1_score(references, predictions, average="macro")

    if mode == "train":
        return batch_losses, val_f1
    return accuracy_score(references, predictions), val_f1


def train(model, optimizer, scheduler, criterion, train_loader, val_loader, device):
    """Fine-tune `model` with gradient accumulation, per-epoch validation and early stopping.

    Reads `ARGS` for the checkpoint path ("model_path"), the accumulation
    length ("grad_step"), the epoch limit ("epochs") and the early-stopping
    patience ("patience").

    Args:
        model: Classifier called as `model(input_ids, attention_mask)`; its
            output must expose `.logits`.
        optimizer: Optimizer stepped once per accumulation group.
        scheduler: Learning-rate scheduler stepped once per completed epoch.
        criterion: Loss applied to `(logits, labels)`.
        train_loader: Iterable of `(input_ids, attention_mask, label)` batches.
        val_loader: Validation batches in the same format.
        device: Device the model, loss and batches are moved to.

    Returns:
        `model` with the best checkpoint (highest validation macro-F1) loaded.
    """
    torch.cuda.empty_cache()
    gc.collect()

    model_path = ARGS["model_path"]
    grad_step = ARGS["grad_step"]
    epoch_progress = trange(1, ARGS["epochs"] + 1)
    early_stopper = EarlyStopping(ARGS["patience"], model_path)

    model.to(device)
    criterion.to(device)
    model.zero_grad()

    for epoch in epoch_progress:

        model.train()
        train_loss = list()
        # True while gradients have been accumulated but not yet applied.
        has_pending_grads = False
        for batch_id, data in enumerate(train_loader, start=1):

            input_ids, attention_mask, train_label = data
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)
            train_label = train_label.to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output.logits, train_label.long())
            train_loss.append(batch_loss.item())

            # Scale so the gradients summed over `grad_step` batches form an average.
            batch_loss /= grad_step
            batch_loss.backward()
            has_pending_grads = True

            if batch_id % grad_step == 0:
                optimizer.step()
                model.zero_grad()
                has_pending_grads = False

        # When the number of batches is not a multiple of `grad_step`, apply the
        # trailing group here. Otherwise its gradients would be skipped for this
        # epoch and silently added to the first group of the next one. Each loss
        # was still divided by `grad_step`, so a short group yields a
        # proportionally smaller update.
        if has_pending_grads:
            optimizer.step()
            model.zero_grad()

        val_loss, val_f1 = evaluate(model, criterion, val_loader, device, mode="train")
        train_loss = np.mean(train_loss)
        val_loss = np.mean(val_loss)
        tqdm.write(
            f"Epoch {epoch}, Train-Loss: {train_loss:.5f},  Val-Loss: {val_loss:.5f},  Val-f1: {val_f1:.5f}"
        )

        # Checkpoints on a new best F1; stops once patience is exhausted.
        if early_stopper.should_stop(model, val_f1):
            break

        scheduler.step()

    tqdm.write(f"\n\n -- EarlyStopping: [Epoch: {epoch - early_stopper.counter}]")
    tqdm.write(f"Model saved at '{model_path}'.")
    model = early_stopper.load(model)

    return model

In [None]:
def _exponential_decay(epoch):
    """Learning-rate multiplier after `epoch` scheduler steps: 0.9 to the power of `epoch`."""
    return 0.9**epoch


# Classifier initialised from the checkpoint named in ARGS["model"].
model = RobertaForSequenceClassification.from_pretrained(ARGS["model"])

# Optimizer with weight decay; bias correction is switched off.
optimizer = AdamW(
    model.parameters(),
    lr=ARGS["lr"],
    weight_decay=0.01,
    correct_bias=False,
)

# The training loop steps the scheduler once per epoch, shrinking the rate by 10% each time.
scheduler = LambdaLR(optimizer, lr_lambda=_exponential_decay)
criterion = nn.CrossEntropyLoss()

In [None]:
# Run the fine-tuning loop; the returned model has the best checkpoint loaded.
best_model = train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    device=DEVICE,
)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1, Train-Loss: 0.86085,  Val-Loss: 0.76575,  Val-f1: 0.63370
Epoch 2, Train-Loss: 0.65933,  Val-Loss: 0.72349,  Val-f1: 0.64779
Epoch 3, Train-Loss: 0.53677,  Val-Loss: 0.73985,  Val-f1: 0.65201
Epoch 4, Train-Loss: 0.46940,  Val-Loss: 0.76987,  Val-f1: 0.64344
Epoch 5, Train-Loss: 0.41825,  Val-Loss: 0.77944,  Val-f1: 0.65447
Epoch 6, Train-Loss: 0.36902,  Val-Loss: 0.80055,  Val-f1: 0.64074
Epoch 7, Train-Loss: 0.33900,  Val-Loss: 0.82009,  Val-f1: 0.63377


 -- EarlyStopping: [Epoch: 5]
Model saved at '/content/drive/MyDrive/model.pth'.


In [None]:
# Release cached GPU memory and unreachable Python objects before the final validation pass.
torch.cuda.empty_cache()
gc.collect()

# Without mode="train", evaluate returns accuracy instead of the per-batch losses.
val_acc, val_f1 = evaluate(best_model, criterion, val_dataloader, DEVICE)
print(f"Accuracy: {val_acc:.5f}", f"F1-macro: {val_f1:.5f}", sep="\n")

Accuracy: 0.76827
F1-macro: 0.65447


## Prediction

In [None]:
# Load the unlabelled test rows and keep only the "ID" and "Utterance" columns.
df_test = processor.process(pd.read_csv(TEST_CSV), mode="test")

# No mode is given, so each item carries inputs only (no target).
test_set = RoBERTaDataset(df_test, roberta_tokenizer, max_length=ARGS["max_len"])

# Keep the file order so predictions line up with the rows of df_test.
test_dataloader = DataLoader(test_set, shuffle=False, batch_size=ARGS["batch_size"])

In [None]:
def predict(model, test_loader, device):
    """Return the predicted class id for every sample in `test_loader`.

    Args:
        model: Classifier called as `model(input_ids, attention_mask)`; its
            output must expose `.logits`.
        test_loader: Iterable of `(input_ids, attention_mask)` batches.
        device: Device the batch tensors are moved to.

    Returns:
        list[int]: Arg-max class ids along dimension 1 of the logits, in loader order.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask in test_loader:
            logits = model(input_ids.to(device), attention_mask.to(device)).logits
            predictions.extend(logits.argmax(1).detach().cpu().numpy().tolist())

    return predictions

In [None]:
# Predicted class ids, in the same order as the rows of df_test.
preds = predict(best_model, test_dataloader, DEVICE)

# Store the predictions as emotion names next to each test ID.
df_test["Target"] = [label_encoder.decode(class_id) for class_id in preds]
res = df_test[["ID", "Target"]]
res.head(10)

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,neutral
5,TEST_0005,joy
6,TEST_0006,joy
7,TEST_0007,joy
8,TEST_0008,joy
9,TEST_0009,neutral
