In [None]:
#pip install --upgrade torch==1.11.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
import pandas as pd
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch

from sklearn.metrics import f1_score
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch.nn.functional as F
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import scipy
import re
import math
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
from torch.optim import Adam
from sklearn.metrics import classification_report


from torch.utils.data import Dataset, DataLoader

In [None]:
# Checkpoint identifier; the tokenizer is loaded from the same name that is
# later passed to the model constructor.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

In [None]:
# Load the corpus; later cells read the `text`, `emo_class` and `post_date` columns.
dataset = pd.read_csv(
    "tiny_dataset.csv",
    encoding="utf-8",
)

In [None]:
# Number of rows per emotion class in the frame as loaded.
dataset["emo_class"].value_counts()

In [None]:
# Keep only rows whose post_date lies in [2019, 2020], both bounds inclusive.
# NOTE(review): presumably post_date stores a numeric year - confirm against the CSV.
dataset = dataset.loc[
    (dataset["post_date"] >= 2019) & (dataset["post_date"] <= 2020)
]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every text of a DataFrame up front.

    Labels are read from the ``emo_class`` column and texts from the ``text``
    column. Each item is a ``(encoding, label)`` pair, where ``encoding`` is
    whatever the tokenizer returned for that text and ``label`` is a 0-d
    NumPy array.
    """

    def __init__(self, df, text_tokenizer=None, max_length=512):
        """Tokenize all rows of ``df``.

        Args:
            df: DataFrame with ``text`` and ``emo_class`` columns.
            text_tokenizer: Callable used to encode one text. It is called as
                ``text_tokenizer(text, padding='max_length', max_length=...,
                truncation=True, return_tensors="pt")``. When ``None`` the
                notebook-level ``tokenizer`` is used, so ``Dataset(df)`` keeps
                its previous behaviour.
            max_length: Length every text is padded / truncated to.
        """
        # Resolve the tokenizer once; the global is only touched when nothing was injected.
        encode = text_tokenizer if text_tokenizer is not None else tokenizer

        self.labels = list(df['emo_class'])
        self.texts = [
            encode(text, padding='max_length', max_length=max_length,
                   truncation=True, return_tensors="pt")
            for text in df['text']
        ]

    def classes(self):
        """Return the list of labels, one per row."""
        return self.labels

    def __len__(self):
        """Number of samples (rows of the source DataFrame)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the pre-computed tokenizer output at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for the sample at ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [None]:
class BertBaseline(nn.Module):
    """Pretrained encoder followed by a 1x1 convolution, a GRU and a two-layer head.

    ``forward`` returns raw class scores (logits) of shape
    ``(batch, out_features)``. ``nn.CrossEntropyLoss`` applies log-softmax
    itself, so the scores are deliberately left unnormalised; use
    ``self.softmax(logits)`` when probabilities are needed. ``argmax`` over the
    logits gives the same predictions as ``argmax`` over the probabilities.
    """

    def __init__(self, model_name, inner_features, out_features):
        """Build the network.

        Args:
            model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
            inner_features: Hidden size of the GRU (and input size of the head).
            out_features: Number of output classes.
        """
        super(BertBaseline, self).__init__()

        self.bert = AutoModel.from_pretrained(model_name, return_dict=True)
        # Take the encoder width from its config rather than assuming 768, so
        # checkpoints of other sizes work; 768 remains the fallback.
        encoder_width = getattr(self.bert.config, "hidden_size", 768)
        self.conv = nn.Conv1d(encoder_width, 512, kernel_size=1)  # 1x1 conv: per-token projection to 512 dims
        self.rnn = nn.GRU(512, inner_features, batch_first=True)
        self.linear1 = nn.Linear(inner_features, inner_features // 2)
        self.linear2 = nn.Linear(inner_features // 2, out_features)
        self.dropout = nn.Dropout(0.15)
        self.layer_norm = nn.LayerNorm(inner_features)
        self.relu = nn.ReLU()
        # Not applied inside forward(); kept so callers can turn logits into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Compute class logits.

        Args:
            input_ids: Token ids of shape ``(batch, seq_len)``.
            attention_mask: Mask with 1 for real tokens and 0 for padding, of shape
                ``(batch, seq_len)`` or ``(batch, 1, seq_len)`` (the latter is what
                a DataLoader yields when per-sample tokenizer outputs are stacked).

        Returns:
            Tensor of shape ``(batch, out_features)`` with unnormalised scores.
        """
        # Normalise the mask to (batch, seq_len) so it can be used for indexing below.
        mask = attention_mask.reshape(attention_mask.size(0), -1)

        bert = self.bert(input_ids=input_ids, attention_mask=mask, output_hidden_states=True)
        x = bert.hidden_states[-1]  # output of the last encoder layer: (batch, seq_len, hidden)
        x = x.transpose(1, 2)  # Conv1d expects channels first: (batch, hidden, seq_len)
        x = self.conv(x)
        x = x.transpose(1, 2)  # back to (batch, seq_len, 512)
        x = self.rnn(x)[0]  # GRU outputs for every time step
        x = self.layer_norm(x)
        x = self.dropout(x)
        x = self.relu(x)

        # Select the GRU output at the last *real* token of each sequence. With
        # max_length padding, index -1 is a pad position for every short text,
        # so x[:, -1, :] would read a state polluted by padding.
        positions = torch.arange(mask.size(1), device=mask.device)
        last_real = (mask.long() * positions).argmax(dim=1).to(x.device)
        batch_index = torch.arange(x.size(0), device=x.device)
        x = x[batch_index, last_real]

        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)

        return x

In [None]:
def _collect_predictions(model, dataloader, device, criterion=None):
    """Run ``model`` over ``dataloader`` in eval mode and without gradients.

    Returns ``(labels, predictions, loss_sum)``: the true labels and argmax
    predictions of every sample, plus the sum of per-batch losses (0.0 when no
    ``criterion`` is given). The model's previous train/eval mode is restored.
    """
    was_training = model.training
    model.eval()  # disable dropout so metrics are deterministic
    labels, predictions, loss_sum = [], [], 0.0
    with torch.no_grad():
        for batch_input, batch_label in dataloader:
            batch_label = batch_label.to(device)
            mask = batch_input['attention_mask'].squeeze(1).to(device)
            input_id = batch_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, attention_mask=mask)
            if criterion is not None:
                loss_sum += criterion(output, batch_label.long()).item()

            labels.extend(batch_label.cpu().numpy())
            predictions.extend(output.argmax(dim=1).cpu().numpy())
    model.train(was_training)
    return labels, predictions, loss_sum


def train(model, train_data, val_data, learning_rate, epochs, batch_size=16):
    """Fine-tune ``model`` and checkpoint it whenever the validation loss improves.

    Args:
        model: Module called as ``model(input_ids, attention_mask=mask)`` and
            returning per-class scores of shape ``(batch, n_classes)``.
        train_data: DataFrame wrapped in ``Dataset`` for training.
        val_data: DataFrame wrapped in ``Dataset`` for validation.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size of both data loaders (default 16).

    Returns:
        ``(path, best_epoch, train_reports, val_reports)`` where ``path`` is the
        file of the last saved checkpoint (``None`` if none was saved),
        ``best_epoch`` is its 1-based epoch number (0 if none), and the two
        lists hold one ``classification_report`` string per epoch.

    Note:
        ``nn.CrossEntropyLoss`` expects unnormalised scores; if the model's
        ``forward`` already applies softmax, the loss is computed on
        doubly-normalised values.
    """
    val_reports = []
    train_reports = []
    best_epoch = 0
    path = None  # stays None when no epoch is ever checkpointed (epochs == 0, NaN loss)

    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    min_val_loss = float('inf')
    for epoch_num in range(epochs):
        model.train()  # enable dropout for the optimisation pass
        total_loss_train = 0.0

        train_dataloader_desc = f"Training (Epoch {epoch_num+1}/{epochs})"
        train_dataloader_tqdm = tqdm(train_dataloader, desc=train_dataloader_desc, leave=False)

        # Count steps explicitly: tqdm's own counter lags behind the loop body.
        for step, (train_input, train_label) in enumerate(train_dataloader_tqdm, start=1):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, attention_mask=mask)
            batch_loss = criterion(output, train_label.long())

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            total_loss_train += batch_loss.item()
            train_dataloader_tqdm.set_postfix({'Loss': total_loss_train / step})

        avg_train_loss = total_loss_train / len(train_dataloader)

        # One validation pass yields both the loss and the predictions for the report.
        val_labels, val_predictions, total_loss_val = _collect_predictions(
            model, val_dataloader, device, criterion)
        avg_val_loss = total_loss_val / len(val_dataloader)

        if avg_val_loss <= min_val_loss:
            min_val_loss = avg_val_loss
            best_epoch = epoch_num + 1
            print('best epoch', best_epoch)
            path = 'tiny_best_rnn_cnn__extra_emo' + str(best_epoch) + '.pth'
            torch.save(model.state_dict(), path)

        # Training-set report uses the end-of-epoch weights, evaluated without dropout.
        train_labels, train_predictions, _ = _collect_predictions(model, train_dataloader, device)

        train_report = classification_report(train_labels, train_predictions)
        train_reports.append(train_report)
        validation_report = classification_report(val_labels, val_predictions)
        val_reports.append(validation_report)

        print(f'Epoch: {epoch_num + 1} | Train Loss: {avg_train_loss:.3f} | Val Loss: {avg_val_loss:.3f}')
        print('train report')
        print(train_report)
        print('validation report')
        print(validation_report)

    return path, best_epoch, train_reports, val_reports


In [None]:
def evaluate(model, test_data, batch_size=2):
    """Print a classification report and the accuracy of ``model`` on ``test_data``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores of shape ``(batch, n_classes)``.
        test_data: DataFrame wrapped in ``Dataset`` for evaluation.
        batch_size: Batch size of the evaluation loader (default 2).

    Returns:
        None. Results are printed.
    """
    test_set = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Evaluate with dropout disabled; a freshly constructed nn.Module is in
    # training mode, which would make the test metrics stochastic.
    was_training = model.training
    model.eval()

    total_acc_test = 0
    test_labels = []
    test_predictions = []
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            predict = output.argmax(dim=1)

            total_acc_test += (predict == test_label).sum().item()
            test_labels.extend(test_label.cpu().numpy())
            test_predictions.extend(predict.cpu().numpy())

    model.train(was_training)  # leave the caller's model in the mode it arrived in

    test_report = classification_report(test_labels, test_predictions)
    print(test_report)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70 / 15 / 15 split: first carve off 30 % as a hold-out set, then
# halve the hold-out into validation and test.
df_train, df_holdout = train_test_split(dataset, test_size=0.3, stratify=dataset['emo_class'],
                                        random_state=42)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, stratify=df_holdout['emo_class'],
                                   random_state=42)

# Duplicate the training rows of class 2 once (oversampling). This happens
# after the split, so no duplicated row can appear in validation or test.
# NOTE(review): presumably class 2 is the under-represented class - confirm with value_counts().
low_class = df_train[df_train.emo_class == 2]
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_train = pd.concat([df_train, low_class]).reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# Training hyper-parameters, followed by the model that will be fine-tuned
# (GRU hidden size 512, three output classes).
EPOCHS, LR = 10, 1e-6
model = BertBaseline(
    model_name,
    inner_features=512,
    out_features=3,
)


In [None]:
# Run fine-tuning; `path` is the checkpoint file of the best epoch.
path, best_epoch, train_reports, val_reports = train(
    model, df_train, df_val, learning_rate=LR, epochs=EPOCHS
)

In [None]:
# best_epoch is 1-based, the report list is 0-based.
print(train_reports[best_epoch - 1])

In [None]:
# Validation report of the checkpointed epoch (best_epoch is 1-based).
print(val_reports[best_epoch - 1])

In [None]:
# Fresh instance to receive the best checkpoint. Built from `model_name` so its
# architecture always matches the tokenizer and the model that was trained.
model = BertBaseline(model_name, inner_features=512, out_features=3)

In [None]:
# map_location lets a checkpoint saved from a CUDA run be restored on a
# CPU-only machine (and vice versa).
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Move the restored weights onto the selected device.
model = model.to(device=device)

In [None]:
# Final metrics on the held-out test split; evaluate() prints its results.
evaluate(model, test_data=df_test)