In [1]:
# Small Libs
import pandas as pd
import numpy as np
from collections import Counter
import random
import string
import copy
import warnings
import re
warnings.filterwarnings("ignore")

# Scikit Learn
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import Vocab, GloVe
from torch.optim import RMSprop

# NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as nltk_tokenizer
from nltk.stem import WordNetLemmatizer

# HuggingFace for BERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, set_seed

# Plotting library
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dipta007/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dipta007/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dipta007/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Set Random seed for PyTorch, Random Function, Transformer
RANDOM_SEED = 4

torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
set_seed(RANDOM_SEED)

In [3]:
# Global Variables
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Check for GPU
BERT_MODEL = 'bert-base-uncased' # BERT model name for Transformer

In [4]:
data = pd.read_csv('./data/compiled.csv')

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,target,is_it_hate
0,0,u really think i would not have been raped by ...,True,1
1,1,the uk has threatened to return radioactive wa...,True,1
2,2,if english is not imposition then hindi is als...,True,1
3,3,no liberal congratulated hindu refugees post c...,True,1
4,4,he said bro even your texts sound redneck what...,True,1


In [6]:
# split all data into 80-20 for training and validation set
train, val = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
# split teraining data into 90-10 for training and testing
train, test = train_test_split(train, test_size=0.1, random_state=RANDOM_SEED, shuffle=True)

len(train), len(val), len(test)

(62580, 17384, 6954)

In [7]:
train.head()

Unnamed: 0.1,Unnamed: 0,label,target,is_it_hate
72735,28723,@user and why are you still hung up about this...,False,0
40808,21579,The lies on this here Twitter &#8220;@Libra_DT...,True,1
57757,13745,repost: two years ago he made the decision to ...,False,0
78855,2881,It is disgusting .,False,0
16979,16979,<user> who cares which out burkha hijab its ha...,True,1


In [8]:
train['length'] = train['label'].apply(lambda x: len(x))
train.head()

Unnamed: 0.1,Unnamed: 0,label,target,is_it_hate,length
72735,28723,@user and why are you still hung up about this...,False,0,118
40808,21579,The lies on this here Twitter &#8220;@Libra_DT...,True,1,134
57757,13745,repost: two years ago he made the decision to ...,False,0,92
78855,2881,It is disgusting .,False,0,18
16979,16979,<user> who cares which out burkha hijab its ha...,True,1,118


In [9]:
np.mean(train['length'])

94.71944710770214

In [18]:
train['is_it_hate'].value_counts()

0    37140
1    25440
Name: is_it_hate, dtype: int64

In [19]:
val['is_it_hate'].value_counts()

0    10266
1     7118
Name: is_it_hate, dtype: int64

In [20]:
test['is_it_hate'].value_counts()

0    4039
1    2915
Name: is_it_hate, dtype: int64

In [10]:
def get_model_tokenizer(name):
  model = AutoModelForSequenceClassification.from_pretrained(
            name,
            num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
          )
  tokenizer = AutoTokenizer.from_pretrained(name)

  return model, tokenizer

bert_model, tokenizer = get_model_tokenizer(BERT_MODEL)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
bert_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
def evaluate(type_of_data, dataloader, model, criterion, softmax=False):
  predictions, actuals = torch.as_tensor([]).to(device), torch.as_tensor([]).to(device)
  total_loss = 0
  steps = 0
  for batch in dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    y = labels

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    yhat = outputs[1]
    loss = outputs[0]

    steps += 1
    total_loss += loss.item()
    
    if not softmax:
      yhat = F.softmax(yhat, dim=1)
      yhat = torch.argmax(yhat, dim=1)
    # store
    predictions = torch.cat([predictions, yhat])
    actuals = torch.cat([actuals, y])
      
  actuals = actuals.cpu()
  predictions = predictions.cpu()

  acc = accuracy_score(actuals, predictions)
  f1_micro = f1_score(actuals, predictions, average="micro")
  f1_macro = f1_score(actuals, predictions, average="macro")
  avg_loss = total_loss / steps

  print(type_of_data, 'Acc', round(acc, 3), "f1 micro", round(f1_micro, 3), "f1 macro", round(f1_macro, 3), "loss", round(avg_loss, 3), end=" || ")

  return round(acc, 3), round(avg_loss, 3)

In [13]:
BATCH_SiZE = 16
MAX_LENGTH = 90

class HateBERT(Dataset):
  def __init__(self, df):
    self.df = df

  def __getitem__(self, idx):
    # padding and truncate to maintain same length to 512
    encodings = tokenizer(
        self.df.iloc[idx]["label"],
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt"
    )
    # get the inputs and targets
    item = {key: torch.tensor(val[0]) for key, val in encodings.items()}
    item['labels'] = self.df.iloc[idx]['is_it_hate']
        
    return item

  def __len__(self):
    return len(self.df)

# form pytorch dataset for train, validation and test
train_dataset = HateBERT(train)
val_dataset = HateBERT(val)
test_dataset = HateBERT(test)

# form pytorch dataloader for debug, train, validation and test
debug_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SiZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SiZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SiZE, shuffle=True)

In [14]:
for x in train_dataset:
    print(x)
    break

{'input_ids': tensor([ 101, 1030, 5310, 1998, 2339, 2024, 2017, 2145, 5112, 2039, 2055, 2023,
        1029, 2245, 2017, 2052, 2031, 2593, 2272, 2000, 2115, 9456, 2030, 2333,
        2006, 2011, 2085,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [15]:
class BERT(nn.Module):
    def __init__(self, layer_size):
        super(BERT, self).__init__()
        # Pretrained bert model from transformer with last layer to layer_size
        self.bert = AutoModelForSequenceClassification.from_pretrained(
                        BERT_MODEL,
                        num_labels = layer_size, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
                        output_attentions = False, # Whether the model returns attentions weights.
                        output_hidden_states = False, # Whether the model returns all hidden-states.
                      )
        # Last layer is the linear layer
        self.last_layer = nn.Linear(layer_size, 6)
        # use sigmoid function for multi label classification
        self.sigmoid = nn.Sigmoid()
        
        for param in self.bert.bert.parameters():
            param.requires_grad = False
        
    def forward(self, x, attention_mask):
        y = self.bert(x, attention_mask=attention_mask)
        y = self.last_layer(y[0])
        y = self.sigmoid(y)
        
        return y

In [16]:
EPOCHS = 100
print(device)

# form BERT model and transfer to available device (GPU/CPU)
# model = BERT()
model = bert_model
model = model.to(device)

# Early Stopping Variables
es_score = float('inf')
es_counter = 0
es_patience = 2
best_model = None

# Binary Cross Entropy loss for multi-label classification
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(EPOCHS):
    model.train()
    tot_loss = 0
    # Train with the training dataset
    for batch in train_dataloader:
        # Zero grad before training
        optimizer.zero_grad()

        # Get input and output
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        y = labels

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        y_pred = outputs[1]
        loss = outputs[0]

        # propagate backward
        loss.backward()
        optimizer.step()

    # Evaluate for the training and validation dataset
    model.eval()
    with torch.no_grad():
        print('Step', epoch + 1, end=": ")
        # evaluate using training dataset
        evaluate("Training", train_dataloader, model, criterion, False)
        # evaluate using validation dataset
        val_acc, val_loss = evaluate("Validation", val_dataloader, model, criterion)
        print()

        # Early stopping configuration
        if val_loss < es_score:
            es_score = val_loss
            es_counter = 0
            best_model = copy.deepcopy(model.state_dict())
        else:
            es_counter += 1
            if es_counter > es_patience:
                print(f'Loop terminated at {epoch+1} with val_loss {val_loss} and val_acc {val_acc}')
                model.load_state_dict(best_model)
                break
        # print("\n--------------------------------------------------------------------------------------------------------")

cuda
Step 1: Training Acc 0.932 f1 micro 0.932 f1 macro 0.929 loss 0.17 || Validation Acc 0.912 f1 micro 0.912 f1 macro 0.909 loss 0.216 || 
Step 2: Training Acc 0.959 f1 micro 0.959 f1 macro 0.958 loss 0.109 || Validation Acc 0.913 f1 micro 0.913 f1 macro 0.91 loss 0.215 || 
Step 3: Training Acc 0.98 f1 micro 0.98 f1 macro 0.98 loss 0.056 || Validation Acc 0.913 f1 micro 0.913 f1 macro 0.911 loss 0.254 || 
Step 4: Training Acc 0.996 f1 micro 0.996 f1 macro 0.996 loss 0.02 || Validation Acc 0.911 f1 micro 0.911 f1 macro 0.909 loss 0.283 || 
Step 5: Training Acc 0.997 f1 micro 0.997 f1 macro 0.997 loss 0.01 || Validation Acc 0.911 f1 micro 0.911 f1 macro 0.908 loss 0.351 || 
Loop terminated at 5 with val_loss 0.351 and val_acc 0.911


In [46]:
# Test with the test dataset
model.eval()
with torch.no_grad():
    print()
    evaluate("Testing", test_dataloader, model, criterion)
#     plot_confusion_matrix(actuals, y_pred)
    print()


Testing Acc 0.905 f1 micro 0.905 f1 macro 0.902 loss 0.23 || 


In [69]:
def get_output(texts):
    tokens = tokenizer(
            texts,
            padding='max_length',
            max_length=MAX_LENGTH,
            truncation=True,
            return_tensors="pt"
        )

    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)
    # labels = tokens['labels'].to(device)

    input_ids, attention_mask
    output = model(input_ids, attention_mask=attention_mask)
    yhat = output.logits

    yhat = F.softmax(yhat, dim=1)
    yhat = torch.argmax(yhat, dim=1)

    return ["Hate" if x else "Not Hate" for x in yhat.tolist()]

In [70]:
get_output(["a", "b"])

['Not Hate', 'Not Hate']