<a href="https://colab.research.google.com/github/dipta007/hate-speech-election-2020/blob/main/Hate_Speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install pyspellchecker



In [2]:
# Small Libs
import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter
import random
import string
from spellchecker import SpellChecker
import copy
import warnings
warnings.filterwarnings("ignore")

# Scikit Learn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab, GloVe

# NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize as nltk_tokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# HuggingFace
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, set_seed
from datasets import load_dataset

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
RANDOM_SEED = 4

torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
set_seed(RANDOM_SEED)

In [5]:
def get_model_tokenizer(name):
  model = AutoModelForSequenceClassification.from_pretrained(
            name,
            # num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
          )
  tokenizer = AutoTokenizer.from_pretrained(name)
  return model, tokenizer

In [6]:
model, tokenizer = get_model_tokenizer("mrm8488/distilroberta-finetuned-tweets-hate-speech")

In [7]:
# for param in model.roberta.parameters():
#     param.requires_grad = False

# for param in model.classifier.parameters():
#     print(param.requires_grad)

In [8]:
dataset = load_dataset("hatexplain")

Reusing dataset hatexplain (/root/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/802fcd855438812094e336cea509c99b04b890e4e0846c0385877ee2c7361e93)


In [9]:
" ".join(dataset["train"][0]["post_tokens"])

'u really think i would not have been raped by feral hindu or muslim back in india or bangladesh and a neo nazi would rape me as well just to see me cry'

In [10]:
dataset["train"][0]["annotators"]["label"]

[0, 2, 2]

In [11]:
for i in range(40, 50):
  print(dataset["train"][i]["annotators"]["label"])
  print(" ".join(dataset["train"][i]["post_tokens"]))
  print("")

[0, 0, 0]
a nigger bich will never have strait hair so keep wearing dat horsehair weave

[0, 0, 0]
apparently some vile nigger animals kidnapped a white special needs teen and tortured and scalped him while shouting fuck white people fuck trump are you being armed and are you shooting in the head when these zombies approach

[0, 0, 0]
<user> you need potassium eat a banana you fucking nigger

[2, 0, 0]
well then send some coon estrogen to my wife she turning more liberal breath by breath due to fake news i do not know how much more i can take hell between her and my two daughters hormones i have started growing breast tissue like pseudo naturally becoming bruce jenner

[0, 0, 0]
it high time we hindus should think that these filthy muslims should be allowed to live here or not kamleshtiwari ypg yourprophetisgay

[1, 1, 1]
<user> why should you ban immigrants from operating small business every business start somewhere and tomorrow that will give people job

[2, 0, 2]
i am sure you are 

In [12]:
def join_all(arr):
  ret = []
  for v in arr:
    ret.append(" ".join(v))
  return ret

train_encodings = tokenizer(join_all(dataset["train"]["post_tokens"]), padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(join_all(dataset["validation"]["post_tokens"]), padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(join_all(dataset["test"]["post_tokens"]), padding=True, truncation=True, return_tensors="pt")

In [13]:
len(dataset["train"])

15383

In [14]:
class HateModel(torch.nn.Module):
  def __init__(self, bert):
    super(HateModel, self).__init__()
    self.bert = bert
    self.linear1 = torch.nn.Linear(2, 128)
    self.linear2 = torch.nn.Linear(128, 3)

  def forward(self, x, attention_mask, labels):
    x = self.bert(x, attention_mask=attention_mask, labels=labels)
    print(x)
    x = F.relu(self.linear1(x[1]))
    x = self.linear2(x)
    return x

In [15]:
class HateExplain(torch.utils.data.Dataset):
  def __init__(self, encodings, dataset):
    self.dataset = dataset
    self.encodings = encodings


  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = self.get_label(self.dataset[idx]["annotators"]["label"])
    return item

  def __len__(self):
    return len(self.dataset)

  def get_label(self, arr):
    cnt = [0, 0, 0]
    for v in arr:
      cnt[v] += 1
    
    # 0 - hate
    # 1 - normal
    # 2 - offensive
    return 1 if torch.argmax(torch.as_tensor(cnt)) != 1 else 0

train_dataset = HateExplain(train_encodings, dataset["train"])
val_dataset = HateExplain(val_encodings, dataset["validation"])
test_dataset = HateExplain(test_encodings, dataset["test"])

In [16]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [17]:
def evaluate(type_of_data, dataloader, model, criterion, softmax=False):
  predictions, actuals = torch.as_tensor([]).to(device), torch.as_tensor([]).to(device)
  total_loss = 0
  steps = 0
  for batch in dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    y = labels

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    yhat = outputs[1]
    loss = outputs[0]

    # yhat = model(input_ids, attention_mask, labels)
    # loss = criterion(yhat, y)

    steps += 1
    total_loss += loss.item()
    
    if not softmax:
      yhat = F.softmax(yhat, dim=1)
      yhat = torch.argmax(yhat, dim=1)
    # store
    predictions = torch.cat([predictions, yhat])
    actuals = torch.cat([actuals, y])
      
  actuals = actuals.cpu()
  predictions = predictions.cpu()

  acc = accuracy_score(actuals, predictions)
  f1_micro = f1_score(actuals, predictions, average="micro")
  f1_macro = f1_score(actuals, predictions, average="macro")
  avg_loss = total_loss / steps

  print(type_of_data, 'Acc', round(acc, 3), "f1 micro", round(f1_micro, 3), "f1 macro", round(f1_macro, 3), "loss", round(avg_loss, 3), end=" || ")

  return round(acc, 3), round(avg_loss, 3)

In [None]:
print(device)
# device = torch.device('cpu')

model.to(device)
# hate_model = HateModel(model)
# hate_model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

EPOCH = 400

# Early Stopping
es_score = float('inf')
es_counter = 0
es_patience = 2
best_model = None

for epoch in range(EPOCH):
  model.train(True)
  for batch in train_loader:
    break
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    y_pred = outputs[1]
    loss = outputs[0]
    
    loss.backward()
    optimizer.step()

  model.eval()
  with torch.no_grad():
    print('Step', epoch + 1, end=": ")
    evaluate("Training", train_loader, model, criterion, False)
    val_acc, val_loss = evaluate("Validation", val_loader, model, criterion, False)
    print()

    if val_loss < es_score:
      es_score = val_loss
      es_counter = 0
      best_model = copy.deepcopy(model.state_dict())
    else:
      es_counter += 1
      if es_counter > es_patience:
        print(f'Loop terminated at {epoch+1} with val_loss {val_loss} and val_acc {val_acc}')
        model.load_state_dict(best_model)
        break

cuda
Training Acc 0.522 f1 micro 0.522 f1 macro 0.521 loss 1.908 || Validation Acc 0.53 f1 micro 0.53 f1 macro 0.529 loss 1.856 || 
Step 2: 

In [None]:
model.eval()
with torch.no_grad():
  evaluate("Testing", test_loader, model, criterion, False)
  print()