# Twitter hate speech detection 
## DistilBERT Model

### December 8, 2021
### Clare Garberg (cag199) & Abby Fremaux (amf338)

### Objectives:
1. Split into train and test dataframes
2. Build and run a DistilBERT model
3. Print evaluative metrics such as accuracy and F1 Score

In [2]:
## implementing a BERT model

## importing necessary libraries

import pandas as pd
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from spacy.language import Language
import numpy as np

## Setting working directory

os.chdir("/Users/claregarberg/Documents/Graduate School/Fall 2021 Semester/580 NLP for Data Analytics/")

### importing data

pre_cv_clean_tweets = pd.read_csv('pre_cv_clean_tweets_b.csv', lineterminator='\n')

In [3]:
## splitting into training and testing dataframes
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(pre_cv_clean_tweets, test_size = 0.2, random_state=43)

df_train = df_train.dropna()
df_test = df_test.dropna()

In [4]:
print(df_train.shape)
print(df_test.shape)

(1599, 3)
(399, 3)


In [10]:
## training BERT on tweets

# hyperparamters
batch_size = 32
epochs = 4
lr_init = 1e-5
max_len = 256
warmup_steps = 3
K = 2

In [11]:
## functions from lab 09

## creating batched inputs

from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

def batch_data(data, bsize):
    batches = []
    sentences = data['tweet'].tolist()
    labels = data['label'].tolist()
    for i in range(0, len(sentences), bsize):
        s = sentences[i: i + bsize]
        Y = labels[i: i + bsize]
        X = tokenizer.batch_encode_plus(
            s, max_length=max_len, padding='longest', truncation=True,
            return_attention_mask=True, return_token_type_ids=False)
        batches.append((X, Y, s))
    return batches

train_batches = batch_data(df_train, bsize=batch_size)
test_batches = batch_data(df_test, bsize=batch_size)

In [12]:
## load pretrained distilbert model

from tqdm import tqdm
import torch
from transformers import DistilBertForSequenceClassification, \
  AdamW, get_linear_schedule_with_warmup


model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=K, 
    output_hidden_states=True)

if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
else:
    device = torch.device('cpu') # CPU
model.to(device)

optimizer = AdamW(model.parameters(), lr=lr_init)
lr = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, 
    num_training_steps=len(train_batches))

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier

In [13]:
## training loop

from datasets import load_metric

def runner(batches, desc: str, train=True):
    
    grad_mode = torch.enable_grad if train else torch.no_grad
    preds = []
    
    if train:
        model.train()
    else:
        model.eval()
    
    for epoch in range(epochs if train else 1):
        
        acc = load_metric("accuracy", keep_in_memory=True)
        f1 = load_metric("f1", keep_in_memory=True)
        cumloss = 0.0
        embeds = []
        
        with tqdm(total=len(batches)) as bar:

            for i, batch in enumerate(batches):
                X, Y, _ = batch
                inputs = torch.tensor(X['input_ids'], device=device)
                attmsk = torch.tensor(X['attention_mask'], device=device)
                labels = torch.tensor(Y, device=device)
                batch = {'input_ids': inputs,
                         'attention_mask': attmsk,
                         'labels': labels}
                with grad_mode():
                    outputs = model(**batch)
                    embeds.append(outputs[-1][1][:, 0, :].squeeze().detach().cpu())
                    loss = outputs.loss
                    if train:
                        loss.backward()
                        optimizer.step()
                        lr.step()
                        optimizer.zero_grad()
                    logits = outputs.logits
                    Yhat = torch.argmax(logits, dim=-1)
                    preds.append(Yhat)
                    cumloss += loss.clone().detach().cpu().item()
                    acc.add_batch(predictions=Yhat, references=Y)
                    f1.add_batch(predictions=Yhat, references=Y)

                bar.update(1)
            bar.set_description('epoch: %s, %s loss: %.5f, f1-score: %.5f, accuracy: %.5f' %
                                (epoch + 1, desc,
                                 cumloss / (i + 1),
                                 f1.compute(average="macro")['f1'],
                                 acc.compute()['accuracy']))
                
    embeds = torch.cat(embeds, dim=0)
        
    return preds, embeds

In [14]:
# Train model
runner(train_batches, 'train');

epoch: 1, train loss: 0.67527, f1-score: 0.58846, accuracy: 0.60663: 100%|█| 50/
epoch: 2, train loss: 0.65319, f1-score: 0.73095, accuracy: 0.73296: 100%|█| 50/
epoch: 3, train loss: 0.65189, f1-score: 0.73359, accuracy: 0.73546: 100%|█| 50/
epoch: 4, train loss: 0.65473, f1-score: 0.71571, accuracy: 0.71795: 100%|█| 50/


In [15]:
# Evaluate training set
runner(train_batches, 'train', train=False);

epoch: 1, train loss: 0.64969, f1-score: 0.77562, accuracy: 0.77736: 100%|█| 50/


In [16]:
# Evaluate test set
runner(test_batches, 'test', train=False);

epoch: 1, test loss: 0.65198, f1-score: 0.79644, accuracy: 0.80201: 100%|█| 13/1
