<a href="https://colab.research.google.com/github/Denev6/practice/blob/main/transformer/RoBERTa_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import os
import gc
import warnings

import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizerFast, RobertaModel
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount("/content/drive")
warnings.filterwarnings("ignore")

Mounted at /content/drive


In [None]:
!python --version

print(f"torch: {torch.__version__}")

Python 3.8.16
torch: 1.13.0+cu116


In [None]:
def join_path(*args):
    return os.path.join("/content/drive/MyDrive", *args)


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TRAIN_CSV = join_path("data", "train.csv")
TEST_CSV = join_path("data", "test.csv")

ARGS = {
    "model": "tae898/emoberta-large",
    "model_path": join_path("model.pth"),
    "batch_size": 64,
    "epochs": 100,
    "max_len": 256,
    "lr": 3e-4
}

In [None]:
class EarlyStopping(object):
    def __init__(self, patience=2, save_path="model.pth"):
        self._min_loss = np.inf
        self._patience = patience
        self._path = save_path
        self.__counter = 0

    def should_stop(self, model, loss):
        if loss < self._min_loss:
            self._min_loss = loss
            self.__counter = 0
            torch.save(model.state_dict(), self._path)
        elif loss > self._min_loss:
            self.__counter += 1
            if self.__counter >= self._patience:
                return True
        return False
   
    def load(self, model):
        model.load_state_dict(torch.load(self._path))
        return model
    
    @property
    def counter(self):
        return self.__counter

In [None]:
class EmoLabelEncoder(object):
    def __init__(self):
        self._targets = [
            "neutral",
            "joy",
            "surprise",
            "anger",
            "sadness",
            "disgust",
            "fear",
        ]
        self.num_classes = len(self._targets)

    def encode(self, label):
        return self._targets.index(label)

    def decode(self, label):
        return self._targets[label]

In [None]:
class EmoDataset(Dataset):
    def __init__(
        self,
        data,
        roberta_tokenizer,
        label_encoder,
        max_length=256,
        mode=None,
    ):
        self._label_encoder = label_encoder
        self._roberta_tokenizer = roberta_tokenizer
        self._max_length = max_length
        self._mode = mode
        self._dataset = self._init_dataset(data)

    def _init_dataset(self, data):
        if self._mode == "train":
            data["Target"] = data["Target"].map(self._label_encoder.encode)
            data = data.loc[:, ["Utterance", "Target"]]
        else:
            data = data.loc[:, "Utterance"]
            data = data.to_frame()
        return data

    def __len__(self):
        return len(self._dataset)

    def __getitem__(self, idx):
        text = self._dataset.loc[idx, "Utterance"]
        inputs = self._roberta_tokenizer(
            text,
            max_length=self._max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"][0]
        attention_mask = inputs["attention_mask"][0]

        if self._mode == "train":
            y = self._dataset.loc[idx, "Target"]
            return input_ids, attention_mask, y
        else:
            return input_ids, attention_mask

In [None]:
class EmoClassifier(nn.Module):
    def __init__(self, num_classes):
        super(EmoClassifier, self).__init__()
        self.hidden_size = 128
        self.emoberta = RobertaModel.from_pretrained(ARGS["model"], add_pooling_layer=False)
        self.gru = nn.GRU(input_size=1024, hidden_size=self.hidden_size, dropout=0.1, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, num_classes)
        self.out = nn.Softmax(dim=-1)

        for _, param in self.emoberta.named_parameters():
            param.requires_grad = False

    def forward(self, id, mask):
        output = self.emoberta(id, mask)
        output, _ = self.gru(output.last_hidden_state)
        output = self.fc(output[:, -1, :])
        output = self.out(output)

        return output

# Dataset

In [None]:
train_csv = pd.read_csv(TRAIN_CSV)

df_train, df_val = train_test_split(
    train_csv, test_size=0.2, shuffle=False
)

In [None]:
label_encoder = EmoLabelEncoder()
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(ARGS["model"], truncation=True)

train_set = EmoDataset(
    df_train.reset_index(drop=True),
    roberta_tokenizer,
    label_encoder,
    max_length=ARGS["max_len"],
    mode="train",
)
val_set = EmoDataset(
    df_val.reset_index(drop=True),
    roberta_tokenizer,
    label_encoder,
    max_length=ARGS["max_len"],
    mode="train",
)

train_dataloader = DataLoader(train_set, batch_size=ARGS["batch_size"])
val_dataloader = DataLoader(val_set, batch_size=ARGS["batch_size"])

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

# Model

In [None]:
def evaluate(model, criterion, val_loader, device, mode=None):
    model.eval()

    val_loss = list()
    model_preds = list()
    true_labels = list()

    with torch.no_grad():
        for input_ids, attention_mask, label in val_loader:
            label = label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, label.long())
            val_loss.append(batch_loss.item())

            if mode != "train":
                model_preds += output.argmax(1).detach().cpu().numpy().tolist()
                true_labels += label.detach().cpu().numpy().tolist()

        if mode != "train":
            val_acc = accuracy_score(true_labels, model_preds)
            val_f1 = f1_score(true_labels, model_preds, average="macro")
            return val_acc, val_f1

        return val_loss


def train(model, optimizer, criterion, train_loader, val_loader, device):
    torch.cuda.empty_cache()
    gc.collect()
    
    early_stopper = EarlyStopping(patience=3, save_path=ARGS["model_path"])
    epoch_progress = trange(1, ARGS["epochs"] + 1)
    criterion.to(device)
    model.to(device)
    model.zero_grad()

    for epoch in epoch_progress:

        model.train()
        train_loss = list()
        for data in train_loader:

            input_ids, attention_mask, train_label = data
            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        val_loss = evaluate(model, criterion, val_loader, device, mode="train")
        train_loss = np.mean(train_loss)
        val_loss = np.mean(val_loss)
        tqdm.write(
            f"Epoch {epoch},  Train-Loss: {train_loss:.5f},  Val-Loss: {val_loss:.5f}"
        )

        if early_stopper.should_stop(model, val_loss):
            break

    model = early_stopper.load(model)
    return model

In [None]:
model = EmoClassifier(label_encoder.num_classes)
optimizer = optim.NAdam(
    model.parameters(), lr=ARGS["lr"], weight_decay=0.01
)
criterion = nn.CrossEntropyLoss()

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at tae898/emoberta-large were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
best_model = train(
    model, optimizer, criterion, train_dataloader, val_dataloader, DEVICE
)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1,  Train-Loss: 1.51887,  Val-Loss: 1.42666
Epoch 2,  Train-Loss: 1.42805,  Val-Loss: 1.41250
Epoch 3,  Train-Loss: 1.41703,  Val-Loss: 1.40578
Epoch 4,  Train-Loss: 1.41550,  Val-Loss: 1.40111
Epoch 5,  Train-Loss: 1.41174,  Val-Loss: 1.39939
Epoch 6,  Train-Loss: 1.41025,  Val-Loss: 1.39736
Epoch 7,  Train-Loss: 1.40745,  Val-Loss: 1.39511
Epoch 8,  Train-Loss: 1.40967,  Val-Loss: 1.39650
Epoch 9,  Train-Loss: 1.41026,  Val-Loss: 1.39645
Epoch 10,  Train-Loss: 1.40993,  Val-Loss: 1.39804


In [None]:
torch.cuda.empty_cache()
gc.collect()

val_acc, val_f1 = evaluate(model, criterion, val_dataloader, DEVICE)
print(f"Accuracy: {val_acc:.5f}")
print(f"F1-macro: {val_f1:.5f}")

Accuracy: 0.77578
F1-macro: 0.59070
