# Installing and Importing necessary modules

In [4]:
!pip install transformers gdown -q

In [48]:
import numpy as np
from nltk.tokenize import word_tokenize
import pandas as pd
import nltk
from nltk.corpus import stopwords
import torch
import torch.nn as nn
import torch.optim as optim
import gdown
import re
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Reading Data


In [49]:
output = 'test.tsv'
gdown.download(f'https://drive.google.com/file/d/19hRm27ej0uHfYP_N5lMkPQEfEVTHy1HL/view?usp=sharing', output, quiet=True,fuzzy=True)

output = 'train.tsv'
gdown.download(f'https://drive.google.com/file/d/1x6KGEdVo2UwcLgOoUYbUXTcC1zEM4iDW/view?usp=sharing', output, quiet=True,fuzzy=True)

output = 'valid.tsv'
gdown.download(f'https://drive.google.com/file/d/1VncNy3EZrak1-lh0G5ne2hBCSpPmIJqN/view?usp=sharing', output, quiet=True,fuzzy=True)

'valid.tsv'

In [50]:
test_cols = ['Comment', 'Comment Label']
df_test = pd.read_csv('test.tsv',sep='\t',names=test_cols)
df_train = pd.read_csv('train.tsv',sep='\t',names=test_cols)
df_valid = pd.read_csv('valid.tsv', sep='\t',names =test_cols)
df_train.head(1)

Unnamed: 0,Comment,Comment Label
0,we cannot continue calling ourselves feminists if the rights of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum,normal


1. Preprocess the data, remove unnecessary symbols, stop words,
etc.

In [51]:
def preprocess_text(sentence,token=None):
    sentence = sentence.lower()
    sentence = re.sub(r'\d+', '', sentence) #Remove numbers
    # There is no need to preprocess other things because bert can handle all the other things internally
    return sentence

In [52]:
df_train['Comment'] = df_train['Comment'].apply(preprocess_text)
df_test['Comment'] = df_test['Comment'].apply(preprocess_text)
df_valid['Comment'] = df_valid['Comment'].apply(preprocess_text)
df_train.head(1)

Unnamed: 0,Comment,Comment Label
0,we cannot continue calling ourselves feminists if the rights of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum,normal


2. Load the Model & Tokenizer from the huggingface library.

In [53]:
# For colab

model_name = 'bert-base-uncased'

bert = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

3. Design the dataloader class for tokenising and batch-processing
the input data.

In [54]:
class MyDataset(Dataset):
    def __init__(self, tokenizer, texts, labels,max_length,label_encoder):
        super(MyDataset, self).__init__()
        self.max_length = max_length
        self.tokenizer = tokenizer
        # self.texts = torch.tensor([self.tokenizer.encode(text,add_special_tokens=True) for text in texts])
        self.texts = texts
        self.labels = label_encoder.transform(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [55]:
def find_max_seq_len(df):
    max_len = 0
    mat = df.to_numpy()[:,0].astype(str)
    for string in mat:
        tokens = string.split()
        max_len = max(max_len,len(tokens))

    return max_len

max_seq_len = find_max_seq_len(df_train)
# max_seq_len = 50 #Custom max length

In [56]:
num_worker = 2
batch_size = 128
shuffle=True

classes = ['normal', 'hatespeech', 'offensive']
label_encoder = LabelEncoder()
label_encoder.fit(classes)

train_dataset = MyDataset(tokenizer, df_train.to_numpy()[:,0].astype(str), df_train.to_numpy()[:,1].astype(str),max_seq_len,label_encoder)
train_loader = DataLoader(train_dataset,batch_size=batch_size,num_workers=num_worker,shuffle=shuffle)

valid_dataset = MyDataset(tokenizer, df_valid.to_numpy()[:,0].astype(str), df_valid.to_numpy()[:,1].astype(str),max_seq_len,label_encoder)
valid_loader = DataLoader(valid_dataset,batch_size=batch_size,num_workers=num_worker,shuffle=shuffle)

test_dataset = MyDataset(tokenizer, df_test.to_numpy()[:,0].astype(str), df_test.to_numpy()[:,1].astype(str),max_seq_len,label_encoder)
test_loader = DataLoader(test_dataset,batch_size=batch_size,num_workers=num_worker,shuffle=shuffle)

4. Create a model class; here, you can use the classification model
from the huggingface library or the normal model without a
classification head and add your own classification layer above it.
Also, the dropouts and activation functions should be defined
here only.

In [57]:
class BERT(nn.Module):
    def __init__(self,bert, num_classes,dropout):
        super(BERT, self).__init__()
        self.bert = bert
        # Define additional layers for custom classification head
        self.additional_layers = nn.Sequential(
            nn.Linear(768,512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout),
            nn.Linear(512,256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.15),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128,num_classes)
        )

        # Weight initialization (XavierGlorot initialization)
        for layer in self.additional_layers:
            if isinstance(layer, nn.Linear):
                torch.nn.init.xavier_uniform_(layer.weight)

    def forward(self,input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.additional_layers(outputs[1])
        return logits


num_classes = 3
hidden_dim = 512
dropout = 0.5

model = BERT(bert,num_classes,dropout)
model= nn.DataParallel(model)
model.to(device)
print(model)

DataParallel(
  (module): BERT(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
              

5. Design the train loop and the optimizers, schedulers, and loss
functions.

In [58]:
from sklearn.metrics import accuracy_score, f1_score

def compute_accuracy(model, data_loader):
    model.eval()
    with torch.no_grad():
        all_targets = []
        all_predictions = []

        for i, batch in enumerate(data_loader):

            features = batch['input_ids'].long()
            attention_mask = batch['attention_mask'].long()
            targets = batch['labels'].long()

            features = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)


            logits = model(features,attention_mask)
            _, predicted_labels = torch.max(logits, 1)

            all_targets.extend(targets.cpu().numpy())
            all_predictions.extend(predicted_labels.cpu().numpy())

    accuracy = accuracy_score(all_targets, all_predictions)
    f1 = f1_score(all_targets, all_predictions, average='macro')
    return (accuracy * 100, f1)

In [59]:
class_wts = compute_class_weight('balanced', classes = [0,1,2],y=train_loader.dataset.labels)

class_wts = torch.tensor(class_wts, dtype=torch.float)
class_wts = class_wts.to(device)

6. Train the model on the training data given.
7. Find validation set performance at the end of each epoch.
8. If validation set performance doesn’t improve over k iterations
(called patience), stop training(early stopping).

In [60]:
from torch.optim.lr_scheduler import OneCycleLR

num_epochs = 20
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

lr_scheduler = OneCycleLR(
        optimizer = optimizer,
        max_lr = 0.005,
        epochs = num_epochs,
        steps_per_epoch = len(train_loader)
    )

criterion = nn.CrossEntropyLoss(weight=class_wts)

for param in model.module.bert.parameters():
    param.requires_grad = False

best_model_state_dict = model.state_dict()
val_acc = 0
val_acc_max = 0
patience = 5

for epoch in range(num_epochs):
    model.train()
    batch_idx = 0

    if epoch== num_epochs // 3:
        for param in model.module.bert.parameters():
            param.requires_grad = True

        optimizer = optim.AdamW(model.parameters(), lr=0.00001)
        lr_scheduler = OneCycleLR(
                optimizer = optimizer,
                max_lr = 0.0001,
                epochs = num_epochs,
                steps_per_epoch = len(train_loader)
            )

    for batch in train_loader:
        batch_idx+=1

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids,attention_mask)

        loss = criterion(outputs,labels)
        optimizer.zero_grad()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    val_acc =  compute_accuracy(model, valid_loader)[0]
    val_acc_max = max(val_acc, val_acc_max)

    if val_acc_max > val_acc:
        k +=1
    else:
        best_model_state_dict = model.state_dict()
        torch.save(best_model_state_dict,'classifier_weights.pth')
        k = 0

    with torch.set_grad_enabled(False):
        print(f'Epoch: {epoch+1}/{num_epochs} | Patience: {k}/{patience}',
              f'\nTraining accuracy: '
              f'{compute_accuracy(model, train_loader)[0]:.2f}%'
              f'\nValid accuracy: '
              f'{val_acc:.2f}%')

    if k >= patience:
        break


Epoch: 1/20 | Patience: 0/5 
Training accuracy: 42.20%
Valid accuracy: 41.52%
Epoch: 2/20 | Patience: 0/5 
Training accuracy: 45.51%
Valid accuracy: 44.12%
Epoch: 3/20 | Patience: 0/5 
Training accuracy: 47.29%
Valid accuracy: 46.46%
Epoch: 4/20 | Patience: 0/5 
Training accuracy: 48.23%
Valid accuracy: 47.71%
Epoch: 5/20 | Patience: 0/5 
Training accuracy: 49.24%
Valid accuracy: 49.53%
Epoch: 6/20 | Patience: 1/5 
Training accuracy: 47.97%
Valid accuracy: 46.57%
Epoch: 7/20 | Patience: 0/5 
Training accuracy: 55.81%
Valid accuracy: 55.36%
Epoch: 8/20 | Patience: 0/5 
Training accuracy: 60.58%
Valid accuracy: 57.96%
Epoch: 9/20 | Patience: 0/5 
Training accuracy: 63.52%
Valid accuracy: 61.13%
Epoch: 10/20 | Patience: 0/5 
Training accuracy: 65.50%
Valid accuracy: 61.55%
Epoch: 11/20 | Patience: 0/5 
Training accuracy: 66.61%
Valid accuracy: 63.48%
Epoch: 12/20 | Patience: 0/5 
Training accuracy: 67.80%
Valid accuracy: 63.89%
Epoch: 13/20 | Patience: 0/5 
Training accuracy: 69.15%
Valid

9. Load best model weights based on the validation set
performance.
10. Find performance on the test set.

In [61]:
model.load_state_dict(torch.load(r'classifier_weights.pth'))
model.eval()

print(f'The Accuracy of the model on the Test set is: {compute_accuracy(model, test_loader)[0]}%',
      f'\nThe macro-f1 score of the model on test set is: {compute_accuracy(model, test_loader)[1]}')

The Accuracy of the model on the Test set is: 67.87941787941789% 
The macro-f1 score of the model on test set is: 0.6685580368108469


In [62]:
torch.save(model,'classifier.pth')

In [67]:
model = torch.load('classifier.pth')

test_sent = "<user> i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani 🤔 🤔 🤔"

inputs = tokenizer(test_sent, return_tensors = 'pt')
with torch.no_grad():
    model.eval()
    logits = model(inputs['input_ids'],inputs['attention_mask'])

print((logits.cpu().numpy()))

[[0.45174727 0.7216576  0.46821246]]


In [68]:
out_class = np.argmax(logits.cpu().numpy())
classes_dict = {0 : 'normal', 1: 'hatespeech', 2 : 'offensive'}
print(classes_dict[out_class])

hatespeech
