# Multi-label text classification using BERT

In [1]:
!nvidia-smi

Thu Oct 10 09:02:56 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   27C    P0             25W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install transformers



In [3]:
# ====================================================
# Required Libraries
# ====================================================

import os
import gc
import re
import sys
import time
import math
import random
import warnings
import json
import shutil
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score, classification_report

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when PyTorch can see one; otherwise everything stays on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

import os
import pandas as pd
import numpy as np
import shutil
import sys
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn


tokenizers.__version__: 0.20.0
transformers.__version__: 4.45.1
env: TOKENIZERS_PARALLELISM=true
cuda


In [4]:
def seed_everything(seed):
    """Seed every RNG this notebook relies on and make cuDNN reproducible.

    Args:
        seed: Integer applied to Python's `random`, NumPy and PyTorch
            (CPU generator and every CUDA device). It is also exported as
            PYTHONHASHSEED.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune kernels per run, which can pick
    # different algorithms between runs and defeat the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
seed_everything(42)

In [5]:
# data_dir = "/content/drive/MyDrive/Notebooks_BERT/data"
# df_data = pd.read_csv(os.path.join(data_dir,"input","arxiv-preproc-data.tsv"), sep="\t")

In [6]:
# Train / validation splits. The `.columns` outputs further down show both
# frames carry a 'Text' column plus one column per hate-speech category.
# NOTE(review): these are absolute Kaggle input paths; the notebook will not
# run elsewhere without pointing them at a local copy of the data.
df_train = pd.read_csv('/kaggle/input/banth-dataset/train.csv')
df_valid = pd.read_csv('/kaggle/input/banth-dataset/val.csv')
# The test frame comes from a different Kaggle dataset and includes a
# `Pred_label` column that a later cell filters on.
df_test = pd.read_csv('/kaggle/input/banth-hate-multilabel/tbbert-hate-test.csv')

In [7]:
# Hyperparameters
# Sequence length handed to the tokenizer as `max_length` (with padding='max_length').
MAX_LEN = 512
# Batch sizes passed to the train / validation / test DataLoaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
# Number of passes of the training loop.
EPOCHS = 5
# Learning rate given to the AdamW optimizer.
LEARNING_RATE = 2e-5
# NOTE(review): not read by any cell shown in this notebook; the train / eval /
# prediction code thresholds the sigmoid output with `.round()` instead.
THRESHOLD = 0.5 # threshold for the sigmoid
# Pooling selector read by BERTClass.forward, which compares it against
# 'attention_based' and 'add_cls_based'. '_based' equals neither, so the
# else-branch (features of the first token) is what runs.
mode = '_based'
# Hugging Face model id used for both the tokenizer and the encoder weights.
model_name = 'aplycaebous/tb-BERT-fpt'

In [8]:
from transformers import BertTokenizer, BertModel, AutoModelForPreTraining, AutoModelForMaskedLM

In [9]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

In [10]:
# Test the tokenizer
# test_text = "We are testing BERT tokenizer."
# # generate encodings
# encodings = tokenizer.encode_plus(test_text, 
#                                   add_special_tokens = True,
#                                   max_length = 50,
#                                   truncation = True,
#                                   padding = "max_length", 
#                                   return_attention_mask = True, 
#                                   return_tensors = "pt")
# # we get a dictionary with three keys (see: https://huggingface.co/transformers/glossary.html) 
# encodings

In [11]:
df_train['Text']

0                       Dakha mon tah vorah galoh... ❤❤🎉🎉🎉
1        BGB er moton ei gulare Chakritheke obbahoti de...
2        Or 2 hat kata hok  \nAwamiliger ekta kormio je...
3                       Janwar ta k dore bichar kora hok🤬🤬
4        shorkar er podotag to Pura Bangladesh chai. 18...
                               ...                        
29874                          Shala ekta choddobeshi vong
29875                            bebsa bondo kore dewa hok
29876                      Ruja thake gali detha perlam na
29877    12 vatari Bangladeshe asho tmk modir sathe biy...
29878                     Hasina asle take dim tharapi dow
Name: Text, Length: 29879, dtype: object

In [12]:
df_train.columns

Index(['Text', 'Label', 'Political', 'Religious', 'Gender', 'Personal Offense',
       'Abusive/Violence', 'Origin', 'Body Shaming', 'Misc', 'bangla',
       'english'],
      dtype='object')

In [13]:
target_list = ['Political', 'Religious', 'Gender', 'Personal Offense',
       'Abusive/Violence', 'Origin', 'Body Shaming', 'Misc']

In [14]:
df_valid.columns

Index(['Text', 'Label', 'Political', 'Religious', 'Gender', 'Personal Offense',
       'Abusive/Violence', 'Origin', 'Body Shaming', 'Misc', 'bangla',
       'english'],
      dtype='object')

In [15]:
df_test.Pred_label.value_counts()

Pred_label
0    2798
1     937
Name: count, dtype: int64

In [16]:
df_test = df_test[df_test['Pred_label']==1]
df_test.shape

(937, 13)

In [17]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing one tokenized text with its label vector.

    Args:
        df: DataFrame with a 'Text' column and every column in `target_list`.
        tokenizer: Object exposing `encode_plus` (a Hugging Face tokenizer
            in this notebook).
        max_len: Value forwarded to the tokenizer as `max_length`.
        target_list: Names of the label columns, in output order.
    """

    def __init__(self, df, tokenizer, max_len, target_list):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = list(df['Text'])
        self.targets = self.df[target_list].values

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tensors, float targets and cleaned text for one row."""
        # Collapse every run of whitespace (newlines included) to one space.
        title = " ".join(str(self.title[index]).split())
        encoded = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis; flatten() removes it
        # so the DataLoader can stack samples itself.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['targets'] = torch.FloatTensor(self.targets[index])
        item['title'] = title
        return item

In [18]:
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN, target_list)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN, target_list)
# test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN, target_list)

In [19]:
# Data loaders
# Training batches are reshuffled every epoch; validation keeps dataset order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

# test_data_loader = torch.utils.data.DataLoader(test_dataset, 
#     batch_size=TEST_BATCH_SIZE,
#     shuffle=False,
#     num_workers=0
# )

In [20]:
class BERTClass(torch.nn.Module):
    """Pretrained encoder with a linear multi-label head and selectable pooling.

    The pooling strategy is chosen by the module-level `mode` string:
      * 'attention_based' - learned attention-weighted sum over token states
      * 'add_cls_based'   - that weighted sum plus the first-token state
      * any other value   - first-token state only

    The head emits one raw logit per entry of `target_list`; no sigmoid is
    applied inside the module.
    """

    def __init__(self):
        super(BERTClass, self).__init__()
        # Hidden states and attention maps are requested from the encoder
        # because forward() reads both of them off the model output.
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True, output_attentions=True)
        # The masked-LM wrapper has no `last_hidden_state` field, which is why
        # forward() takes `hidden_states[-1]` instead.
        self.bert_model = AutoModelForMaskedLM.from_pretrained(model_name, config=self.config)

        self.linear = torch.nn.Linear(self.config.hidden_size, len(target_list))

        # Scores each token state, then normalises the scores across dim=1
        # (the token axis of the (batch, tokens, hidden) encoder output).
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1))

        # Not used by forward(); kept so the module's parameter set (and thus
        # any saved state_dict) stays the same.
        self.concat_pool = nn.Linear(self.config.hidden_size*2, self.config.hidden_size)

    def _attention_pool(self, hidden_states):
        """Collapse the token axis with the learned attention weights."""
        # NOTE(review): the attention mask is not applied here, so padded
        # positions also receive weight - confirm whether that is intended.
        weights = self.attention(hidden_states)
        return torch.sum(weights * hidden_states, dim=1)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Encode a batch and return `(logits, attentions)`.

        Args:
            input_ids: Token ids passed to the encoder.
            attn_mask: Passed to the encoder as `attention_mask`.
            token_type_ids: Segment ids passed to the encoder.

        Returns:
            Tuple of the linear head's output for the pooled features and the
            encoder's `attentions` field.
        """
        encoder_out = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        last_hidden_states = encoder_out.hidden_states[-1]
        attentions = encoder_out.attentions

        # One exclusive chain: previously the first test was a stand-alone
        # `if`, so for 'attention_based' the trailing `else` replaced the
        # attention-pooled features with the first-token features.
        if mode == 'attention_based':
            pooled = self._attention_pool(last_hidden_states)
        elif mode == 'add_cls_based':
            pooled = self._attention_pool(last_hidden_states) + last_hidden_states[:, 0, :]
        else:
            pooled = last_hidden_states[:, 0, :]

        return self.linear(pooled), attentions

model = BERTClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model.bert_model.parameters():
#     param.requires_grad = False

model = model.to(device)

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

In [21]:
# BCEWithLogitsLoss combines a Sigmoid layer and the BCELoss in one single class. 
# This version is more numerically stable than using a plain Sigmoid followed 
# by a BCELoss as, by combining the operations into one layer, 
# we take advantage of the log-sum-exp trick for numerical stability.
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 float targets.

    Args:
        outputs: Unnormalised model scores (logits).
        targets: Float tensor of the same shape holding the label indicators.

    Returns:
        Scalar tensor: the loss averaged over every element.
    """
    return F.binary_cross_entropy_with_logits(outputs, targets)

In [22]:
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so bind the name to torch.optim.AdamW explicitly.
from torch.optim import AdamW

# define the optimizer
# eps and weight_decay are spelled out to keep the values the transformers
# AdamW defaulted to (torch.optim.AdamW defaults: eps=1e-8, weight_decay=1e-2).
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

In [23]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):
    """Run one optimisation pass over `training_loader`.

    Args:
        training_loader: Iterable of batches with 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets' entries.
        model: Callable returning `(logits, attentions)` for a batch.
        optimizer: Optimizer stepped once per batch.

    Returns:
        `(model, accuracy, mean_loss)`. Accuracy is the share of individual
        label decisions (sigmoid output rounded) that equal the targets, and
        mean_loss is the average of the per-batch losses.
    """
    batch_losses = []
    n_correct = 0
    n_labels = 0

    # training mode (activates dropout)
    model.train()

    progress = tq.tqdm(enumerate(training_loader), total=len(training_loader),
                       leave=True, colour='steelblue')
    for _step, batch in progress:
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # forward pass: one row of logits per sample
        logits, _ = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, targets)
        batch_losses.append(loss.item())

        # running accuracy: sigmoid then round to the nearest of 0 / 1
        predicted = torch.sigmoid(logits).cpu().detach().numpy().round()
        expected = targets.cpu().detach().numpy()
        n_correct += np.sum(predicted == expected)
        n_labels += expected.size  # every element of the 2D array counts

        # backward pass with gradient-norm clipping, then the parameter update
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    return model, float(n_correct)/n_labels, np.mean(batch_losses)

In [24]:
def eval_model(validation_loader, model, optimizer=None):
    """Evaluate `model` on `validation_loader` without updating it.

    Args:
        validation_loader: Iterable of batches with 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets' entries.
        model: Callable returning `(logits, attentions)` for a batch.
        optimizer: Unused; accepted (now optionally) so existing
            three-argument calls keep working.

    Returns:
        `(accuracy, mean_loss, output_array, target_array)`. Accuracy is the
        share of individual label decisions that equal the targets. The two
        arrays hold the rounded sigmoid outputs and the targets for every
        sample, in loader order.

    Raises:
        ValueError: If the loader yields no batches.
    """
    losses = []
    # Per-batch results are gathered in lists and joined once at the end. This
    # removes the hard-coded label count the pre-allocated (0, 8) arrays needed
    # and avoids re-copying the growing arrays on every batch.
    output_batches = []
    target_batches = []

    # eval mode (turns dropout off)
    model.eval()

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs, _ = model(ids, mask, token_type_ids)

            losses.append(loss_fn(outputs, targets).item())

            # The loss works on raw logits, so the sigmoid is applied here
            # before rounding to the nearest of 0 / 1.
            output_batches.append(torch.sigmoid(outputs).cpu().numpy().round())
            target_batches.append(targets.cpu().numpy())

    if not output_batches:
        raise ValueError("validation_loader yielded no batches")

    # Cast to float64 to keep the dtype the returned arrays had before.
    output_array = np.concatenate(output_batches, axis=0).astype(np.float64)
    target_array = np.concatenate(target_batches, axis=0).astype(np.float64)

    correct_predictions = np.sum(output_array == target_array)
    accuracy = float(correct_predictions) / target_array.size
    return accuracy, np.mean(losses), output_array, target_array

In [25]:
class CustomTestDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing one tokenized test text with its label vector.

    Args:
        df: DataFrame with a 'Text' column and every column in `target_list`.
        tokenizer: Object exposing `encode_plus` (a Hugging Face tokenizer
            in this notebook).
        max_len: Value forwarded to the tokenizer as `max_length`.
        target_list: Names of the label columns, in output order.
    """

    def __init__(self, df, tokenizer, max_len, target_list):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = list(df['Text'])
        self.targets = self.df[target_list].values

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tensors, float targets and cleaned text for one row."""
        # Collapse every run of whitespace (newlines included) to one space.
        title = " ".join(str(self.title[index]).split())
        encoded = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis; flatten() removes it
        # so the DataLoader can stack samples itself.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['targets'] = torch.FloatTensor(self.targets[index])
        item['title'] = title
        return item

In [26]:
# Wrap the filtered test frame; order is preserved so predictions can later be
# lined up with the rows of df_test.
test_dataset = CustomTestDataset(df_test, tokenizer, MAX_LEN, target_list)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0
)

## Model Training

In [27]:
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss, outputs, targets = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    for _metric_name, _metric_value in (
        ('train_acc', train_acc),
        ('train_loss', train_loss),
        ('val_acc', val_acc),
        ('val_loss', val_loss),
    ):
        history[_metric_name].append(_metric_value)

    # Checkpoint only when validation accuracy strictly improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), "MLTC_model_state.bin")
        best_accuracy = val_acc

Epoch 1/10


  0%|          | 0/1868 [00:00<?, ?it/s]



ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 5 and the array at index 1 has size 8

In [None]:
import matplotlib.pyplot as plt

In [None]:
# plt.rcParams["figure.figsize"] = (10,7)
# plt.plot(history['train_acc'], label='train accuracy')
# plt.plot(history['val_acc'], label='validation accuracy')
# plt.title('Training history')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend()
# plt.ylim([0, 1]);
# plt.grid()

In [None]:
torch.cuda.empty_cache()

## Evaluation of the model

In [None]:
# Rebuild the network and restore the best checkpoint written by the training loop.
model = BERTClass()
# map_location remaps the saved tensors onto this session's `device`, so a
# checkpoint written on a GPU can still be loaded by a CPU-only kernel.
model.load_state_dict(torch.load("MLTC_model_state.bin", map_location=device))
model = model.to(device)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def get_predictions(model, data_loader):
    """Run `model` over `data_loader` without gradients and collect the results.

    Args:
        model: Callable returning `(logits, attentions)` for a batch.
        data_loader: Iterable of batches with 'title', 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets' entries.

    Returns:
        titles: List with every batch's 'title' entries, in loader order.
        predictions: Sigmoid outputs rounded to 0 / 1, one row per sample.
        prediction_probs: The sigmoid outputs before rounding.
        target_values: The batches' 'targets' as float tensors on the CPU.
    """
    model = model.eval()

    titles = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device, dtype=torch.long)
            mask = batch["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits, _ = model(ids, mask, token_type_ids)
            # The model emits raw logits, so the sigmoid is applied here.
            probs = torch.sigmoid(logits).detach().cpu()

            titles.extend(batch["title"])
            pred_batches.append(probs.round())  # round to the nearest of 0 / 1
            prob_batches.append(probs)
            target_batches.append(targets.detach().cpu())

    # Join the per-batch tensors along the sample axis.
    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    target_values = torch.cat(target_batches)

    return titles, predictions, prediction_probs, target_values


In [None]:
titles, predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)

In [None]:
print(classification_report(target_values, predictions, target_names=target_list, digits=4))
# For 2-D multi-label indicators accuracy_score is the subset (exact-match)
# accuracy: a sample counts as correct only when all of its labels are right.
# The printed label now says so; the variable name is left unchanged in case a
# later cell reads it.
micro_avg_accuracy = accuracy_score(target_values, predictions)
print(f'Subset (exact-match) accuracy: {micro_avg_accuracy:.4f}')

In [None]:
# Prediction column names are derived from target_list so the two can never
# drift apart (this yields 'Pred_Political', ..., 'Pred_Misc' as before).
pred_target_list = [f'Pred_{label}' for label in target_list]
# Convert the tensor to a NumPy array explicitly instead of relying on how
# pandas interprets a torch.Tensor.
df_predictions = pd.DataFrame(predictions.cpu().numpy(), columns=pred_target_list)
# df_test was filtered earlier, so its index has gaps; reset it so the
# column-wise concat pairs rows by position.
df_test_temp = df_test.reset_index(drop=True)
assert len(df_predictions) == len(df_test_temp), "predictions and test rows are misaligned"
merged_df = pd.concat([df_test_temp, df_predictions], axis=1)
merged_df.to_csv('tbbert_multilabel_predictions.csv', index=False)