In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import logging
import string
import re
import os
from tqdm.notebook import tqdm
from random import randint, shuffle

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    ConfusionMatrixDisplay,
    accuracy_score,
    f1_score
)

import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'
sns.set_theme()
logging.basicConfig(filename='training_log.log',
                    format='%(asctime)s - %(message)s', 
                    level=logging.INFO,
                    filemode='w')
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='training_log.log', mode='a')
formatter = logging.Formatter('%(asctime)s: %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)


class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, num_classes):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers
        )
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x.transpose(0, 1))
        x = x.mean(dim=0)
        x = self.fc(x)
        return x

class Tokenizer:
    def __init__(self, text_list, max_vocab_size=20000, n=3):
        self.max_vocab_size = max_vocab_size
        self.oov_token = "<OOV>"
        self.n = n
        self.build_vocab(text_list)

    def build_vocab(self, text_list):
        all_text = " ".join(self.preprocess(text) for text in text_list)
        ngrams = [self.ngramify(word) for word in all_text.split()]
        ngrams_flat = [ngram for word_ngrams in ngrams for ngram in word_ngrams] 
        ngram_count = Counter(ngrams_flat)
        vocab = [self.oov_token] + sorted(ngram_count, key=ngram_count.get, reverse=True)
        vocab = vocab[:self.max_vocab_size]
        self.word_to_id = {word: i for i, word in enumerate(vocab)}
        self.id_to_word = {i: word for word, i in self.word_to_id.items()}
        self.vocab_size = len(self.word_to_id)

    def preprocess(self, text):
        return text.lower()

    def ngramify(self, word):
        return [word[i:i + self.n] for i in range(len(word) - self.n + 1)]

    def encode(self, text):
        text = self.preprocess(text)
        words = text.split()
        ids = []
        for word in words:
            ngrams = self.ngramify(word)
            word_ids = [self.word_to_id.get(ngram, self.word_to_id[self.oov_token]) for ngram in ngrams]
            ids.extend(word_ids)
        return ids

    def decode(self, ids):
        ngrams = [self.id_to_word.get(id, self.oov_token) for id in ids]
        return "".join(ngrams)

    def __len__(self):
        return self.vocab_size


device = 'cuda' if torch.cuda.is_available() else 'cpu'


def pad_sequences(sequences, max_length):
    padded_sequences = []
    for seq in sequences:
        if len(seq) > max_length:
            seq = seq[:max_length]
        padded_seq = seq + [0] * (max_length - len(seq))
        padded_sequences.append(padded_seq)
    return padded_sequences

class DS(Dataset):
    def __init__(self, data_path, tokenizer, max_token_length=512):
        super().__init__()
        self.max_token_length = max_token_length

        self.labels = [
            'hate',
            'privacy',
            'sexual',
            'impersonation',
            'illegal',
            'advertisement',
            'ai',
            'neutral'
        ]

        self.data = pd.read_csv(data_path)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.loc[idx, :]


        comment = row['body']
        comment = comment.split()

        if len(comment) > self.max_token_length:
            comment = comment[:self.max_token_length]
        comment = ' '.join(comment)
        comment = comment.replace("\\", "")

        emotion = row['label']
        emotion = self.labels.index(emotion)

        return f"{comment}", emotion

    def choose(self):
        return self[randint(0, len(self)-1)]

    def get_tokenizer_size(self):
        return len(self.tokenizer)

    def decode(self, input_id):
        return self.tokenizer.decode(input_id)

    def collate_fn(self, data):
        comments, emotions = zip(*data)
        comments = [self.tokenizer.encode(comment) for comment in comments]
        comments = pad_sequences(comments, self.max_token_length)
        comments = torch.tensor(comments, dtype=torch.long, device=device)
        emotions = torch.tensor(emotions).long().to(device)
        return comments, emotions

In [3]:
val_path = 'https://drive.google.com/uc?export=download&id=1ZMJI7DyKMLHpHp-HBUO64kWbP6A-qj9k'
train_path = 'https://drive.google.com/uc?export=download&id=1ZTfYOXeZLW57mLR7IegIFovi7FW1chS6'
train_df_orig = pd.read_csv(train_path)
val_df_orig = pd.read_csv(val_path)

train_df = train_df_orig.copy()
val_df = val_df_orig.copy()
print(f'Size of training data: {len(train_df)}')
print(f'Size of validation data: {len(val_df)}')

tokenizer = Tokenizer(train_df["body"].tolist())
batch_size = 16

train_ds = DS(train_path, tokenizer)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=train_ds.collate_fn)
val_ds = DS(val_path, tokenizer)
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=val_ds.collate_fn)

Size of training data: 44188
Size of validation data: 4910


In [4]:
model = Transformer(vocab_size=train_ds.get_tokenizer_size(), d_model=128, nhead=4, num_layers=4, num_classes=len(train_ds.labels)).to(device)

In [None]:
def compute_metrics(targets, preds):
    targets = targets.cpu().detach()
    preds = preds.cpu().detach()
    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, average='weighted')
    return {'acc': acc, 'f1': f1, 'preds': preds, 'targets':targets}

def focal_loss(gamma=2):
    def compute_loss(preds, targets):
        ce_loss = F.cross_entropy(preds, targets, reduction='none')
        pt = torch.exp(-ce_loss) 
        return ((1-pt)**gamma * ce_loss).mean()
    return compute_loss

def train(data, model, optimizer, label_loss_fn, ft_lambda=1e-4):
    model.train()
    comments, labels = data
    label_outputs = model(comments)
    label_loss = label_loss_fn(label_outputs, labels)
    loss = label_loss
    model.zero_grad()
    loss.backward()
    optimizer.step()

    label_preds = label_outputs.argmax(-1)
    label_metrics = compute_metrics(labels, label_preds)
    return loss, label_metrics

@torch.no_grad()
def validate(data, model, label_loss_fnn):
    model.eval()
    comments, labels = data
    label_outputs = model(comments)
    label_loss = label_loss_fnn(label_outputs, labels)
    loss = label_loss

    label_preds = label_outputs.argmax(-1)
    label_metrics = compute_metrics(labels, label_preds)
    return loss, label_metrics

n_epoch = 100
loss_fn = focal_loss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [5]:


mem = {
    'train_loss': [],
    'train_acc': [],
    'train_f1': [],
    'val_loss': [],
    'val_acc': [],
    'val_f1': []
}

cur_best_f1 = 0

for epoch in tqdm(range(n_epoch), desc='Training'):

    n_batch = len(train_dl)
    train_losses = []
    train_accs = []
    train_f1s = []

    for i, data in enumerate(train_dl):
        train_loss, train_metrics = train(data, model, optimizer, loss_fn, loss_fn)
        pos = epoch + ((i+1)/n_batch)
        train_losses.append(train_loss.cpu().detach())
        train_accs.append(train_metrics['acc'])
        train_f1s.append(train_metrics['f1'])
        log = f"epoch: {pos:.3f}\ttrain loss: {train_loss:.3f}\ttrain_acc: {train_metrics['acc']:.3f}\ttrain_f1: {train_metrics['f1']:.3f}"
        print('\r', log, end='')
    
    mem['train_loss'].append(np.mean(train_losses))
    mem['train_acc'].append(np.mean(train_accs))
    mem['train_f1'].append(np.mean(train_f1s))

    n_batch = len(val_dl)
    val_losses = []
    val_accs = []
    val_f1s = []

    for i, data in enumerate(val_dl):
        val_loss, val_metrics = validate(data, model, loss_fn)
        pos = epoch + ((i+1)/n_batch)
        val_losses.append(val_loss.cpu().detach())
        val_accs.append(val_metrics['acc'])
        val_f1s.append(val_metrics['f1'])
        log = f"epoch: {pos:.3f}\tval loss: {val_loss:.3f}\tval_acc: {val_metrics['acc']:.3f}\tval_f1: {val_metrics['f1']:.3f}"
        print('\r', log, end='')
    
    print('\r', end='')
    mem['val_loss'].append(np.mean(val_losses))
    mem['val_acc'].append(np.mean(val_accs))
    mem['val_f1'].append(np.mean(val_f1s))

    log = f"epoch: {epoch+1}\ntrain loss: {mem['train_loss'][-1]:.3f}\ttrain_acc: {mem['train_acc'][-1]:.3f}\ttrain_f1: {mem['train_f1'][-1]:.3f}"
    log = log + f"\nval loss:   {mem['val_loss'][-1]:.3f}\tval_acc:   {mem['val_acc'][-1]:.3f}\tval_f1:   {mem['val_f1'][-1]:.3f}\n"
    logger.info(log)
    print(log)
    scheduler.step()    

    if mem['val_f1'][-1] > cur_best_f1:
        torch.save(model.state_dict(), f"transformer_weights.pth")
        cur_best_f1 = mem['val_f1'][-1]

Training:   0%|          | 0/100 [00:00<?, ?it/s]

epoch: 1
train loss: 0.411	train_acc: 0.692	train_f1: 0.660
val loss:   0.290	val_acc:   0.817	val_f1:   0.802

epoch: 2
train loss: 0.289	train_acc: 0.815	train_f1: 0.798
val loss:   0.258	val_acc:   0.835	val_f1:   0.818

epoch: 3
train loss: 0.258	train_acc: 0.831	train_f1: 0.815
val loss:   0.250	val_acc:   0.838	val_f1:   0.827

epoch: 4
train loss: 0.234	train_acc: 0.840	train_f1: 0.825
val loss:   0.229	val_acc:   0.845	val_f1:   0.837

epoch: 5
train loss: 0.215	train_acc: 0.848	train_f1: 0.835
val loss:   0.208	val_acc:   0.854	val_f1:   0.844

epoch: 6
train loss: 0.198	train_acc: 0.858	train_f1: 0.845
val loss:   0.195	val_acc:   0.864	val_f1:   0.855

epoch: 7
train loss: 0.185	train_acc: 0.863	train_f1: 0.852
val loss:   0.190	val_acc:   0.874	val_f1:   0.863

epoch: 8
train loss: 0.174	train_acc: 0.869	train_f1: 0.858
val loss:   0.187	val_acc:   0.862	val_f1:   0.855

epoch: 9
train loss: 0.164	train_acc: 0.873	train_f1: 0.864
val loss:   0.180	val_acc:   0.875	val_f1:  

In [5]:
model_weight = torch.load("transformer_weights.pth")
model.load_state_dict(model_weight)

label_preds, label_targets = [], []

with torch.no_grad():
    for i, data in tqdm(enumerate(val_dl), total=len(val_dl)):
        comments, labels = data
        label_outputs = model(comments)
        label_preds.extend(label_outputs.argmax(-1).cpu().numpy())
        label_targets.extend(labels.cpu().numpy())
    print(classification_report(label_targets, label_preds))

  0%|          | 0/307 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1958
           1       0.48      0.46      0.47        26
           2       0.59      0.22      0.32        46
           3       0.75      0.12      0.20        26
           4       0.20      0.04      0.06        28
           5       0.76      0.62      0.68        47
           6       0.88      0.86      0.87       604
           7       0.86      0.90      0.88      2175

    accuracy                           0.88      4910
   macro avg       0.68      0.52      0.55      4910
weighted avg       0.88      0.88      0.88      4910

