This notebook is an end-to-end example of the classifier pipeline: data preparation, tokenization, model definition, training and evaluation.

The data used in this example comes from: https://zenodo.org/record/3520150

The code is based on the tutorial at: https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

In [None]:
cd ..

In [None]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import pandas as pd
import torch
import torch.nn as nn
import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import BertTokenizer, AutoModel, AdamW

# Settings

## Data sources

In [None]:
# Pipe-separated CSV files holding the train and test messages
# (read by prepare_data_dachs_messages).
TRAIN_FILE = "data/dachs/Train_Hate_Messages.csv"
TEST_FILE = "data/dachs/Test_Hate_Messages.csv"
# Seed for the row shuffle applied right after loading each file.
SEED_SHUFFLE = 31

## Data preparation

In [None]:
# Fraction of the training file held out as the validation set.
VALIDATION_SIZE = 0.3
# Seed for the train/validation split, so the split is reproducible.
SEED_TRAIN_VALIDATION_SEPARATION = 31

## Model

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the encoder.
BERT_MODEL = "dccuchile/bert-base-spanish-wwm-cased"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Mini-batch size used by the train and validation dataloaders.
BATCH_SIZE = 32
# Number of passes over the training set.
EPOCHS = 5
# Learning rate handed to the optimizer.
LEARNING_RATE = 1e-3
# Where the weights with the lowest validation loss are saved.
OUTPUT_FILE = "models/dachs/best_model.pt"

## Tokenizer

In [None]:
# Every message is padded or truncated to this many tokens by the tokenizer.
SEQUENCE_LENGTH = 30

# Data preparation

In [None]:
# torch.device built from the DEVICE setting; the model and every batch are moved to it.
device = torch.device(DEVICE)

## Read data source

In [None]:
def prepare_data_dachs_messages(data_file):
    """Load a pipe-separated messages file and return it cleaned and shuffled.

    Rows containing a missing value in any column are dropped, the
    ``Hate_Speech`` column is renamed to ``label`` and only the ``message``
    and ``label`` columns are kept. The rows are then shuffled with the
    ``SEED_SHUFFLE`` seed and the index is rebuilt from zero.

    Args:
        data_file: path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the columns ``message`` and ``label``.
    """
    complete_rows = pd.read_csv(data_file, sep="|").dropna()
    renamed = complete_rows.rename(columns={"Hate_Speech": "label"})
    selected = renamed[["message", "label"]]
    # frac=1 samples every row, i.e. a full shuffle
    shuffled = selected.sample(frac=1, random_state=SEED_SHUFFLE)
    return shuffled.reset_index(drop=True)

In [None]:
# Load the training set, then show its size, label distribution and first two rows.
train_df = prepare_data_dachs_messages(TRAIN_FILE)
print(len(train_df["label"]))
print(train_df["label"].value_counts())
train_df.head(2)

In [None]:
# Load the test set, then show its size, label distribution and first two rows.
test_df = prepare_data_dachs_messages(TEST_FILE)
print(len(test_df))
print(test_df["label"].value_counts())
test_df.head(2)

## Train / validation split

In [None]:
# Stratified split of the training file, so the train and validation sets
# keep the same label proportions.
(
    train_messages,
    validation_messages,
    train_labels,
    validation_labels,
) = train_test_split(
    train_df["message"],
    train_df["label"],
    test_size=VALIDATION_SIZE,
    random_state=SEED_TRAIN_VALIDATION_SEPARATION,
    stratify=train_df["label"],
)

In [None]:
# The test file is used whole: just separate the messages from their labels.
test_messages = test_df["message"]
test_labels = test_df["label"]

## Tokenization

In [None]:
# Tokenizer loaded from the same checkpoint name (BERT_MODEL) as the encoder.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

In [None]:
def tokenize_messages(messages):
    """Tokenize a collection of messages with the notebook-level ``tokenizer``.

    Every message is padded or truncated to ``SEQUENCE_LENGTH`` tokens,
    special tokens are added and token type ids are not returned.

    Args:
        messages: object exposing ``tolist()`` that yields the message strings.

    Returns:
        The result of ``tokenizer.batch_encode_plus`` for those messages.
    """
    encode_options = {
        "max_length": SEQUENCE_LENGTH,
        "padding": "max_length",
        "truncation": True,
        "add_special_tokens": True,
        "return_token_type_ids": False,
    }
    return tokenizer.batch_encode_plus(messages.tolist(), **encode_options)

def tokenize_and_create_tensors(messages, labels):
    """Tokenize ``messages`` and pack the result, together with ``labels``, into tensors.

    Args:
        messages: message collection accepted by ``tokenize_messages``.
        labels: object exposing ``tolist()`` that yields one label per message.

    Returns:
        dict with the keys ``"token_ids"``, ``"attention_mask"`` and
        ``"labels"``, each mapping to a ``torch.Tensor``.
    """
    encoded = tokenize_messages(messages)
    token_ids = torch.tensor(encoded["input_ids"])
    attention_mask = torch.tensor(encoded["attention_mask"])
    label_tensor = torch.tensor(labels.tolist())
    return {
        "token_ids": token_ids,
        "attention_mask": attention_mask,
        "labels": label_tensor,
    }

In [None]:
# Tokenize each split once up front; every result is a dict with
# "token_ids", "attention_mask" and "labels" tensors.
train_tensors = tokenize_and_create_tensors(train_messages, train_labels)
validation_tensors = tokenize_and_create_tensors(validation_messages, validation_labels)
test_tensors = tokenize_and_create_tensors(test_messages, test_labels)

## Dataset, Sampler and Dataloader

### Train

In [None]:
# Each dataset item is a (token_ids, attention_mask, label) triple.
train_dataset = TensorDataset(train_tensors["token_ids"], train_tensors["attention_mask"], train_tensors["labels"])
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

### Validation

In [None]:
# Each dataset item is a (token_ids, attention_mask, label) triple.
validation_dataset = TensorDataset(validation_tensors["token_ids"], validation_tensors["attention_mask"], validation_tensors["labels"])
# Validation batches are visited in dataset order.
validation_sampler = SequentialSampler(validation_dataset)
validation_dataloader = DataLoader(validation_dataset, sampler=validation_sampler, batch_size=BATCH_SIZE)

# Model

## Load pretrained

In [None]:
# Pretrained encoder loaded from the BERT_MODEL checkpoint.
bert = AutoModel.from_pretrained(BERT_MODEL)

### Freeze all parameters

In [None]:
# Turn off gradient tracking for every encoder parameter, so the backward
# pass computes no gradients for the pretrained weights.
for param in bert.parameters():
    param.requires_grad = False

## Custom classifier model

In [None]:
class ClassifierModel(nn.Module):
    """Encoder followed by a small feed-forward classification head.

    The head maps the encoder's pooled output to 512 units (ReLU + dropout)
    and then to 2 classes, returning log-probabilities via ``LogSoftmax``.
    """

    def __init__(self, bert):
        """Build the head on top of ``bert``.

        Args:
            bert: callable encoder whose output, indexed with
                ``"pooler_output"``, is a ``(batch, hidden_size)`` tensor.
                When it exposes ``config.hidden_size`` that value sizes the
                first linear layer; otherwise 768 is assumed.
        """
        super().__init__()
        self.bert = bert
        # Read the pooled-output width from the encoder instead of
        # hard-coding it, so encoders with a different hidden size also work.
        encoder_config = getattr(bert, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 768)
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 2)
        # LogSoftmax over the class dimension: outputs are log-probabilities
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token id tensor passed to the encoder.
            mask: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor of shape ``(batch, 2)`` with log-probabilities.
        """
        cls_hs = self.bert(sent_id, attention_mask=mask)["pooler_output"]
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [None]:
# Wrap the pretrained encoder with the classification head.
model = ClassifierModel(bert)

### Device

In [None]:
# Move the model's parameters to the configured device.
model = model.to(device)

### Optimizer

In [None]:
# All model parameters are handed to the optimizer; parameters with
# requires_grad=False never receive a gradient during backward().
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

### Loss function

In [None]:
# "balanced" weights are inversely proportional to class frequency in the
# training labels, so the minority class is not drowned out in the loss.
# classes/y are passed by keyword: they are keyword-only in current
# scikit-learn releases, where the positional form raises a TypeError.
class_weights = compute_class_weight("balanced", classes=np.unique(train_labels), y=train_labels)
weights = torch.tensor(class_weights, dtype=torch.float)
# the loss weights must live on the same device as the model outputs
weights = weights.to(device)

In [None]:
# NLLLoss expects log-probabilities, which ClassifierModel produces with its
# final LogSoftmax; the class weights are applied per target class.
cross_entropy = nn.NLLLoss(weight=weights) 

### Train and evaluate functions

In [None]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Works on the notebook-level ``model``, ``optimizer``, ``cross_entropy``
    and ``device``.

    Returns:
        tuple ``(avg_loss, total_preds)``: the mean loss per batch, and a
        numpy array with the model outputs of every batch concatenated
        along axis 0.
    """
    model.train()
    running_loss = 0
    batch_outputs = []

    for batch in tqdm(train_dataloader):
        # move the batch tensors to the configured device
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # drop the gradients left over from the previous step
        model.zero_grad()

        # forward pass and loss for this batch
        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, labels)
        running_loss += loss.item()

        # backward pass
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # keep a detached CPU copy of the outputs as a numpy array
        batch_outputs.append(outputs.detach().cpu().numpy())

    # mean loss per batch for the epoch
    avg_loss = running_loss / len(train_dataloader)

    # per-batch arrays -> one (number of samples, number of classes) array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
def evaluate():
    """Compute the loss and model outputs over ``validation_dataloader``.

    Works on the notebook-level ``model``, ``cross_entropy`` and ``device``.
    The model is put in eval mode and no gradients are tracked.

    Returns:
        tuple ``(avg_loss, total_preds)``: the mean loss per batch, and a
        numpy array with the model outputs of every batch concatenated
        along axis 0.
    """
    print("\nEvaluating...")

    # eval mode deactivates the dropout layers
    model.eval()

    running_loss = 0
    batch_outputs = []

    for batch in tqdm(validation_dataloader):
        # move the batch tensors to the configured device
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # no autograd bookkeeping is needed for evaluation
        with torch.no_grad():
            outputs = model(sent_id, mask)
            loss = cross_entropy(outputs, labels)
            running_loss += loss.item()
            batch_outputs.append(outputs.detach().cpu().numpy())

    # mean loss per batch over the validation set
    avg_loss = running_loss / len(validation_dataloader)

    # per-batch arrays -> one (number of samples, number of classes) array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

## Training loop

In [None]:
# Make sure the checkpoint directory exists before training starts:
# torch.save does not create missing parent directories, and failing at the
# end of the first epoch would throw that epoch's work away.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# lowest validation loss seen so far; infinity so the first epoch always saves
best_valid_loss = float("inf")

# training and validation loss of each epoch
train_losses = []
valid_losses = []

for epoch in range(EPOCHS):

    print(f"\n Epoch {epoch + 1} / {EPOCHS}")

    # one pass over the training data, then one over the validation data
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # keep only the weights with the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), OUTPUT_FILE)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f"\nTraining Loss: {train_loss:.3f}")
    print(f"Validation Loss: {valid_loss:.3f}")

## Performance report

In [None]:
# Restore the weights with the lowest validation loss. map_location remaps
# the stored tensors to the configured device, so a checkpoint saved on a
# GPU can still be loaded when that GPU is not available.
model.load_state_dict(torch.load(OUTPUT_FILE, map_location=device))

In [None]:
# Switch to eval mode explicitly so dropout is off even when this cell is run
# without a preceding evaluate() call (e.g. cells executed out of order).
model.eval()

# Score the first 1000 test samples in a single forward pass, without autograd.
with torch.no_grad():
    predictions = model(test_tensors["token_ids"][0:1000].to(DEVICE), test_tensors["attention_mask"][0:1000].to(DEVICE))
    predictions = predictions.detach().cpu().numpy()

In [None]:
# The predicted class is the column with the highest log-probability.
predictions = predictions.argmax(axis=1)
# Compare against the labels of the same first 1000 test samples.
expected_labels = test_tensors["labels"][0:1000]
print(classification_report(expected_labels, predictions))