In [1]:

!pip install transformers

import os
import logging
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from random import randint, shuffle

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    ConfusionMatrixDisplay,
    accuracy_score,
    f1_score
)

import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sns.set_theme()

logging.basicConfig(filename='training_log.log',
                    format='%(asctime)s - %(message)s', 
                    level=logging.INFO,
                    filemode='w')
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='training_log.log', mode='a')
formatter = logging.Formatter('%(asctime)s: %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)

In [3]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.W = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden_states):
        attn_weights = F.softmax(self.v(torch.tanh(self.W(hidden_states))), dim=1)
        return torch.sum(hidden_states * attn_weights, dim=1)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, hidden_size, ff_dim, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(hidden_size, ff_dim)
        self.fc2 = nn.Linear(ff_dim, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.fc2(self.dropout(F.relu(self.fc1(x))))

class RedditCop(nn.Module):
    def __init__(self, model_ckpt, num_labels, tokenizer_size=30523, hidden_size_ffn=2048):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_ckpt)
        self.encoder.resize_token_embeddings(tokenizer_size)
        encoder_config = self.encoder.config
        self.attention = Attention(encoder_config.hidden_size)
        self.positionwise_ffn = PositionwiseFeedForward(encoder_config.hidden_size, hidden_size_ffn)
        self.layer_norm1 = nn.LayerNorm(encoder_config.hidden_size)
        self.layer_norm2 = nn.LayerNorm(encoder_config.hidden_size)
        self.dropout = nn.Dropout(0.3)

        self.classifier = nn.Sequential(
            nn.Linear(encoder_config.hidden_size, hidden_size_ffn),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size_ffn, num_labels)
        )

    def forward(self, x):
        x = self.encoder(**x)
        x = x.last_hidden_state

        x = self.attention(x)
        x = self.layer_norm1(x + self.dropout(self.positionwise_ffn(x)))
        x = self.layer_norm2(x)
        x = self.classifier(x)

        return x

In [4]:
# dataset and training code from alson's hatebert
class DS(Dataset):
    def __init__(self, data_path, model_ckpt, max_token_length=50):
        super().__init__()
        self.max_token_length = max_token_length

        self.labels = [
            'hate',
            'privacy',
            'sexual',
            'impersonation',
            'illegal',
            'advertisement',
            'ai',
            'neutral'
        ]

        self.data = pd.read_csv(data_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.loc[idx, :]


        comment = row['body']
        comment = comment.split()

        if len(comment) > self.max_token_length:
            comment = comment[:45]

        comment = ' '.join(comment)
        comment = comment.replace("\\", "")
        emotion = row['label']
        emotion = self.labels.index(emotion)

        return f"{comment}", emotion

    def choose(self):
        return self[randint(0, len(self)-1)]

    def get_tokenizer_size(self):
        return len(self.tokenizer)

    def decode(self, input_id):
        return self.tokenizer.decode(input_id)

    def collate_fn(self, data):
        comments, emotions = zip(*data)
        comments = self.tokenizer(comments,
                                  padding=True,
                                  return_tensors='pt')
        comments = {k:v.to(device) for k, v in comments.items()}
        emotions = torch.tensor(emotions).long().to(device)
        return comments, emotions

In [5]:
def compute_metrics(targets, preds):
    targets = targets.cpu().detach()
    preds = preds.cpu().detach()
    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, average='weighted')
    return {'acc': acc, 'f1': f1, 'preds': preds, 'targets':targets}

def focal_loss(gamma=2):
    def compute_loss(preds, targets):
        ce_loss = F.cross_entropy(preds, targets, reduction='none')
        pt = torch.exp(-ce_loss) 
        return ((1-pt)**gamma * ce_loss).mean()
    return compute_loss

def train(data, model, optimizer, label_loss_fn, ft_lambda=1e-4):
    model.train()
    comments, labels = data
    label_outputs = model(comments)
    label_loss = label_loss_fn(label_outputs, labels)
    loss = label_loss
    model.zero_grad()
    loss.backward()
    optimizer.step()
    label_preds = label_outputs.argmax(-1)
    label_metrics = compute_metrics(labels, label_preds)
    return loss, label_metrics

@torch.no_grad()
def validate(data, model, label_loss_fnn):
    model.eval()
    comments, labels = data
    label_outputs = model(comments)
    label_loss = label_loss_fnn(label_outputs, labels)
    loss = label_loss
    label_preds = label_outputs.argmax(-1)
    label_metrics = compute_metrics(labels, label_preds)
    return loss, label_metrics

In [6]:
val_path = 'https://drive.google.com/uc?export=download&id=1ZMJI7DyKMLHpHp-HBUO64kWbP6A-qj9k'
train_path = 'https://drive.google.com/uc?export=download&id=1ZTfYOXeZLW57mLR7IegIFovi7FW1chS6'
train_df_orig = pd.read_csv(train_path)
val_df_orig = pd.read_csv(val_path)

train_df = train_df_orig.copy()
val_df = val_df_orig.copy()
print(f'Size of training data: {len(train_df)}')
print(f'Size of validation data: {len(val_df)}')

Size of training data: 44188
Size of validation data: 4910


In [7]:
model_ckpt = "GroNLP/hateBERT"
batch_size = 16
train_ds = DS(train_path, model_ckpt)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=train_ds.collate_fn)
val_ds = DS(val_path, model_ckpt)
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=val_ds.collate_fn)

In [8]:
model = RedditCop(model_ckpt, num_labels=8).to(device)

Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
n_epoch = 30
loss_fn = focal_loss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [11]:
mem = {
    'train_loss': [],
    'train_acc': [],
    'train_f1': [],
    'val_loss': [],
    'val_acc': [],
    'val_f1': []
}

cur_best_f1 = 0

for epoch in tqdm(range(n_epoch), desc='Training'):

    n_batch = len(train_dl)
    train_losses = []
    train_accs = []
    train_f1s = []

    for i, data in enumerate(train_dl):
        train_loss, train_metrics = train(data, model, optimizer, loss_fn, loss_fn)
        pos = epoch + ((i+1)/n_batch)
        train_losses.append(train_loss.cpu().detach())
        train_accs.append(train_metrics['acc'])
        train_f1s.append(train_metrics['f1'])
        log = f"epoch: {pos:.3f}\ttrain loss: {train_loss:.3f}\ttrain_acc: {train_metrics['acc']:.3f}\ttrain_f1: {train_metrics['f1']:.3f}"
        print('\r', log, end='')
    
    mem['train_loss'].append(np.mean(train_losses))
    mem['train_acc'].append(np.mean(train_accs))
    mem['train_f1'].append(np.mean(train_f1s))

    n_batch = len(val_dl)
    val_losses = []
    val_accs = []
    val_f1s = []

    for i, data in enumerate(val_dl):
        val_loss, val_metrics = validate(data, model, loss_fn)
        pos = epoch + ((i+1)/n_batch)
        val_losses.append(val_loss.cpu().detach())
        val_accs.append(val_metrics['acc'])
        val_f1s.append(val_metrics['f1'])
        log = f"epoch: {pos:.3f}\tval loss: {val_loss:.3f}\tval_acc: {val_metrics['acc']:.3f}\tval_f1: {val_metrics['f1']:.3f}"
        print('\r', log, end='')
    
    print('\r', end='')
    mem['val_loss'].append(np.mean(val_losses))
    mem['val_acc'].append(np.mean(val_accs))
    mem['val_f1'].append(np.mean(val_f1s))

    log = f"epoch: {epoch+1}\ntrain loss: {mem['train_loss'][-1]:.3f}\ttrain_acc: {mem['train_acc'][-1]:.3f}\ttrain_f1: {mem['train_f1'][-1]:.3f}"
    log = log + f"\nval loss:   {mem['val_loss'][-1]:.3f}\tval_acc:   {mem['val_acc'][-1]:.3f}\tval_f1:   {mem['val_f1'][-1]:.3f}\n"
    logger.info(log)
    print(log)
    scheduler.step()    

    if mem['val_f1'][-1] > cur_best_f1:
        torch.save(model.state_dict(), f"reddit_cop_weights.pth")
        cur_best_f1 = mem['val_f1'][-1]

Training:   0%|          | 0/30 [00:00<?, ?it/s]

epoch: 1
train loss: 0.129	train_acc: 0.905	train_f1: 0.900
val loss:   0.084	val_acc:   0.926	val_f1:   0.924

epoch: 2
train loss: 0.059	train_acc: 0.941	train_f1: 0.940
val loss:   0.080	val_acc:   0.925	val_f1:   0.924

epoch: 3
train loss: 0.037	train_acc: 0.955	train_f1: 0.955
val loss:   0.090	val_acc:   0.930	val_f1:   0.926

epoch: 4
train loss: 0.026	train_acc: 0.965	train_f1: 0.965
val loss:   0.104	val_acc:   0.932	val_f1:   0.931

epoch: 5
train loss: 0.019	train_acc: 0.973	train_f1: 0.973
val loss:   0.108	val_acc:   0.924	val_f1:   0.923

epoch: 6
train loss: 0.013	train_acc: 0.982	train_f1: 0.982
val loss:   0.124	val_acc:   0.928	val_f1:   0.926

epoch: 7
train loss: 0.009	train_acc: 0.989	train_f1: 0.989
val loss:   0.133	val_acc:   0.923	val_f1:   0.923

epoch: 8
train loss: 0.007	train_acc: 0.992	train_f1: 0.992
val loss:   0.157	val_acc:   0.930	val_f1:   0.927

epoch: 9
train loss: 0.005	train_acc: 0.994	train_f1: 0.994
val loss:   0.158	val_acc:   0.934	val_f1:  

In [11]:
model_weight = torch.load("reddit_cop_weights.pth")
model.load_state_dict(model_weight)

<All keys matched successfully>

In [12]:
label_preds, label_targets = [], []

with torch.no_grad():
    for i, data in tqdm(enumerate(val_dl), total=len(val_dl)):
        comments, labels = data
        label_outputs = model(comments)
        label_preds.extend(label_outputs.argmax(-1).cpu().numpy())
        label_targets.extend(labels.cpu().numpy())
    print(classification_report(label_targets, label_preds))
    

  0%|          | 0/307 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1958
           1       0.86      0.73      0.79        26
           2       0.87      0.57      0.68        46
           3       0.76      0.62      0.68        26
           4       0.58      0.50      0.54        28
           5       0.90      0.81      0.85        47
           6       0.97      0.99      0.98       604
           7       0.94      0.93      0.94      2175

    accuracy                           0.94      4910
   macro avg       0.85      0.76      0.80      4910
weighted avg       0.94      0.94      0.94      4910

