In [None]:
# Mount Google Drive at /content/drive/ so the project files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
cd /content/drive/MyDrive/hate_speech/models

/content/drive/MyDrive/hate_speech/models


# LSTM for "offensive.language"

In [None]:
# Name of the label column to classify. It is also used to name the saved
# checkpoint ("LSTM.<label>") and the prediction column / CSV file.
label_to_class = "offensive.language" # enter the label to be classified

In [None]:
import torch.nn as nn
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report, auc
from sklearn.model_selection import train_test_split
import time
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import nltk

In [None]:
# build the model
class LSTM(nn.Module):
  """Bidirectional LSTM binary classifier with max pooling over time.

  Pipeline: embedding -> dropout -> BiLSTM over packed sequences ->
  max pooling over the time axis -> dropout -> linear -> dropout ->
  linear -> sigmoid.

  Args:
    vocab_size: number of rows in the embedding table.
    emb_size: embedding dimension.
    lstm_size: hidden size of each LSTM direction.
    hidden_size: width of the intermediate linear layer.
    dropout: dropout probability, applied to the embeddings and before each
      linear layer.
  """

  def __init__(self,vocab_size, emb_size, lstm_size, hidden_size, dropout):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_size)
    self.lstm = nn.LSTM(emb_size, lstm_size, bidirectional=True, batch_first=True)
    # The BiLSTM concatenates both directions, hence lstm_size*2 input features.
    self.hidden = nn.Linear(lstm_size*2, hidden_size)
    self.linear = nn.Linear(hidden_size, 1)
    self.sigmoid = nn.Sigmoid()
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, lengths):
    """Return one probability per sequence in the batch.

    Args:
      input: token ids, shape (batch_size, texts_length).
      lengths: unpadded length of every row of `input`; passed straight to
        `pack_padded_sequence`, so each length must be > 0.

    Returns:
      Tensor of shape (batch_size,) holding sigmoid outputs.
    """
    emb = self.dropout(self.emb(input))  # (batch_size, texts_length, emb_size)
    # Packing makes the LSTM skip the padded time steps of shorter sequences.
    packed = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
    lstm_out, _ = self.lstm(packed.float())
    # Pad with -inf instead of the default 0.0: with zero padding the max pooling
    # below would pick the padding whenever a feature is negative at every real
    # time step, so a sample's score would depend on the other sequences it
    # happens to be batched with.
    padded, _ = pad_packed_sequence(lstm_out, batch_first=True, padding_value=float("-inf"))
    pooled = torch.max(padded, dim=1).values  # max pooling, (batch_size, lstm_size*2)
    hidden = self.hidden(self.dropout(pooled))  # (batch_size, hidden_size)
    logits = self.linear(self.dropout(hidden))  # (batch_size, 1)
    probs = self.sigmoid(logits)
    # Squeeze only the trailing feature axis. A bare squeeze() would also drop the
    # batch axis for a single-sample batch and return a 0-dim tensor.
    return probs.squeeze(-1)  # (batch_size)

In [None]:
# Training hyperparameters and runtime configuration.
EPOCHS = 15
BATCH_SIZE = 64
EMB_SIZE = 256
LSTM_SIZE = 256
HIDDEN_SIZE = 128
DROPOUT = 0.3
VOCAB_SIZE=5000  # upper bound on the vocabulary size, also the embedding table size
LEARNING_RATE = 0.00001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook touches so that weight initialisation, dropout and
# batch shuffling are repeatable across "Restart & Run All".
# 42 matches the random_state already used for the train/dev split.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Read the training split. header=0 consumes the file's own header row and
# `names` relabels the columns as (label, text).
csv = pd.read_csv(
    '../data/ourdata/full_train.csv',
    header=0,
    names=[label_to_class, 'tweet_hashed'],
)


In [None]:
# Build a separate frame holding only the label and text columns.
csv_new = pd.DataFrame(
    csv,
    columns=[label_to_class, "tweet_hashed"],
)
# Remove every row in which either of the two columns is missing.
csv_new_clean = csv_new.dropna(how="any", axis=0)

In [None]:
# Hold out 20% of the cleaned rows as the dev set; the fixed random_state keeps
# the split identical between runs.
train_csv, dev_csv = train_test_split(
    csv_new_clean,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Tokenise every tweet into a list of word tokens.
# NOTE(review): word_tokenize needs NLTK tokenizer data and no cell shown here
# downloads it — confirm it is available on a fresh runtime.
X_train = list(map(nltk.word_tokenize, train_csv["tweet_hashed"]))
X_dev = list(map(nltk.word_tokenize, dev_csv["tweet_hashed"]))

# Gold labels, in the same row order as the token lists above.
y_train = list(train_csv[label_to_class])
y_dev = list(dev_csv[label_to_class])

In [None]:
# Each dataset is a list of (tokens, label) pairs, the sample format that
# `collate` consumes.
train_data = [(tokens, label) for tokens, label in zip(X_train, y_train)]
dev_data = [(tokens, label) for tokens, label in zip(X_dev, y_dev)]

In [None]:
# Build the vocabulary from the training tokens only, capped at VOCAB_SIZE
# entries including the two special tokens.
vocab = build_vocab_from_iterator(
    X_train,
    specials=["<unk>", "<pad>"],
    max_tokens=VOCAB_SIZE,
)
# Unknown words fall back to '<unk>': index 0 is reserved for '<unk>' and
# index 1 for '<pad>'.
vocab.set_default_index(vocab["<unk>"])

In [None]:
def collate(batch, vocab, device):
    """Convert a list of (tokens, label) samples into padded model inputs.

    NOTE: a later cell defines `collate` again; when the notebook is run top to
    bottom, that later definition is the one the DataLoaders end up calling.

    Args:
        batch: list of samples; sample[0] is a list of tokens, sample[1] its label.
        vocab: token -> id lookup, indexed as vocab[word].
        device: device the id and label tensors are moved to.

    Returns:
        Tuple (texts, labels, lengths):
        texts   - LongTensor (batch_size, max_len) on `device`, padded with 1,
                  the '<pad>' index of the vocabulary built in this notebook;
        labels  - float tensor (batch_size,) on `device`;
        lengths - LongTensor (batch_size,) of unpadded lengths, left on the CPU.
    """
    lengths = []
    word_ids = []
    labels = []
    for sample in batch:
        text, label = sample[0], sample[1]
        lengths.append(len(text))
        labels.append(label)
        word_ids.append(torch.LongTensor([vocab[word] for word in text]))
    texts = pad_sequence(word_ids, batch_first=True, padding_value=1)
    # Force float labels: dtype inference would turn integer 0/1 labels into a
    # Long tensor, which nn.BCELoss does not accept as a target.
    labels = torch.tensor(labels, dtype=torch.float)
    return texts.to(device), labels.to(device), torch.LongTensor(lengths)

In [None]:
def collate(batch, vocab, device):
    """Convert a list of (tokens, label) samples into padded model inputs.

    Args:
        batch: non-empty list of (tokens, label) pairs.
        vocab: token -> id lookup, indexed as vocab[word].
        device: device the id and label tensors are moved to.

    Returns:
        Tuple (texts, labels, lengths):
        texts   - LongTensor (batch_size, max_len) on `device`, padded with 1,
                  the '<pad>' index of the vocabulary built in this notebook;
        labels  - float tensor (batch_size,) on `device`;
        lengths - LongTensor (batch_size,) of unpadded lengths, left on the CPU.
    """
    texts, labels = zip(*batch)
    lengths = torch.LongTensor([len(text) for text in texts])
    id_tensors = [torch.LongTensor([vocab[word] for word in text]) for text in texts]
    padded = pad_sequence(id_tensors, batch_first=True, padding_value=1)
    # Force float labels: dtype inference would turn integer 0/1 labels into a
    # Long tensor, which nn.BCELoss does not accept as a target.
    targets = torch.tensor(labels, dtype=torch.float)
    return padded.to(device), targets.to(device), lengths

In [None]:
# Training batches are reshuffled every epoch; `collate` pads each batch and
# moves it to DEVICE.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda samples: collate(samples, vocab, DEVICE),
)

In [None]:
# Dev batches keep the dataset order (no shuffling requested).
dev_loader = DataLoader(
    dataset=dev_data,
    batch_size=BATCH_SIZE,
    collate_fn=lambda samples: collate(samples, vocab, DEVICE),
)

In [None]:
# Instantiate the classifier on DEVICE, then the optimiser over its parameters.
model = LSTM(
    vocab_size=VOCAB_SIZE,
    emb_size=EMB_SIZE,
    lstm_size=LSTM_SIZE,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Binary cross-entropy on probabilities: the model already applies a sigmoid.
criterion = nn.BCELoss()

In [None]:
# Train for EPOCHS epochs, evaluate on the dev set after each one, and keep the
# checkpoint with the best dev accuracy.
best_dev_acc = 0
best_epoch = 0
print(f'Start training...')
# Start the clock once, before the first epoch, so the final report covers the
# whole run and not just the last epoch.
start_time = time.time()
for epoch in range(EPOCHS):
    # train
    train_loss = 0
    train_correct = 0
    train_seen = 0
    model.train()
    for texts, labels, lengths in tqdm(train_loader):
        optimizer.zero_grad()
        # reshape(-1) keeps the output 1-D like `labels`, even for a one-sample batch.
        output = model(texts, lengths).reshape(-1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        n_samples = labels.size(0)
        # The criterion returns a per-batch mean; weighting by the batch size
        # gives per-sample averages, so a short final batch is not over-weighted.
        train_loss += loss.item() * n_samples
        train_correct += torch.eq(labels, torch.round(output)).sum().item()
        train_seen += n_samples
    train_loss, train_acc = train_loss / train_seen, train_correct / train_seen

    # dev
    dev_loss = 0
    dev_correct = 0
    dev_seen = 0
    model.eval()
    with torch.no_grad():
        for texts, labels, lengths in tqdm(dev_loader):
            output = model(texts, lengths).reshape(-1)
            loss = criterion(output, labels)
            n_samples = labels.size(0)
            dev_loss += loss.item() * n_samples
            dev_correct += torch.eq(labels, torch.round(output)).sum().item()
            dev_seen += n_samples
    dev_loss, dev_acc = dev_loss / dev_seen, dev_correct / dev_seen

    print(f'Epoch {epoch + 1}: train loss: {train_loss:.4f}, train acc: {train_acc:.4f}')
    print(f'Epoch {epoch + 1}: dev loss: {dev_loss:.4f}, dev acc: {dev_acc:.4f}')
    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        best_epoch = epoch + 1
        # Saves the whole module object (not just a state_dict) under "LSTM.<label>".
        torch.save(model, "LSTM."+label_to_class)
        print(f'*** Epoch {epoch + 1}: dev acc higher than best dev acc, model saved!')
    print()
sec = time.time()-start_time
print(f'Training finished! Best epoch is {best_epoch}, best dev acc is {best_dev_acc:.4f}, {sec} seconds used.')

Start training...


100%|██████████| 832/832 [00:13<00:00, 59.65it/s]
100%|██████████| 208/208 [00:01<00:00, 187.22it/s]


Epoch 1: train loss: 0.6391, train acc: 0.6565
Epoch 1: dev loss: 0.6009, dev acc: 0.7039
*** Epoch 1: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.59it/s]
100%|██████████| 208/208 [00:01<00:00, 185.51it/s]


Epoch 2: train loss: 0.5915, train acc: 0.6991
Epoch 2: dev loss: 0.5533, dev acc: 0.7288
*** Epoch 2: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.54it/s]
100%|██████████| 208/208 [00:01<00:00, 186.18it/s]


Epoch 3: train loss: 0.5576, train acc: 0.7199
Epoch 3: dev loss: 0.5242, dev acc: 0.7458
*** Epoch 3: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:15<00:00, 54.21it/s]
100%|██████████| 208/208 [00:01<00:00, 189.91it/s]


Epoch 4: train loss: 0.5338, train acc: 0.7406
Epoch 4: dev loss: 0.5027, dev acc: 0.7692
*** Epoch 4: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 63.26it/s]
100%|██████████| 208/208 [00:01<00:00, 186.08it/s]


Epoch 5: train loss: 0.5152, train acc: 0.7560
Epoch 5: dev loss: 0.4844, dev acc: 0.7827
*** Epoch 5: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.71it/s]
100%|██████████| 208/208 [00:01<00:00, 182.04it/s]


Epoch 6: train loss: 0.5015, train acc: 0.7692
Epoch 6: dev loss: 0.4699, dev acc: 0.7932
*** Epoch 6: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.89it/s]
100%|██████████| 208/208 [00:01<00:00, 190.03it/s]


Epoch 7: train loss: 0.4876, train acc: 0.7797
Epoch 7: dev loss: 0.4554, dev acc: 0.8021
*** Epoch 7: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.71it/s]
100%|██████████| 208/208 [00:01<00:00, 184.94it/s]


Epoch 8: train loss: 0.4774, train acc: 0.7862
Epoch 8: dev loss: 0.4432, dev acc: 0.8111
*** Epoch 8: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.86it/s]
100%|██████████| 208/208 [00:01<00:00, 190.20it/s]


Epoch 9: train loss: 0.4666, train acc: 0.7952
Epoch 9: dev loss: 0.4342, dev acc: 0.8149
*** Epoch 9: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 63.19it/s]
100%|██████████| 208/208 [00:01<00:00, 187.88it/s]


Epoch 10: train loss: 0.4600, train acc: 0.8006
Epoch 10: dev loss: 0.4268, dev acc: 0.8200
*** Epoch 10: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.56it/s]
100%|██████████| 208/208 [00:01<00:00, 185.23it/s]


Epoch 11: train loss: 0.4510, train acc: 0.8068
Epoch 11: dev loss: 0.4219, dev acc: 0.8229
*** Epoch 11: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.85it/s]
100%|██████████| 208/208 [00:01<00:00, 183.85it/s]


Epoch 12: train loss: 0.4458, train acc: 0.8101
Epoch 12: dev loss: 0.4165, dev acc: 0.8273
*** Epoch 12: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 61.49it/s]
100%|██████████| 208/208 [00:01<00:00, 185.27it/s]


Epoch 13: train loss: 0.4387, train acc: 0.8115
Epoch 13: dev loss: 0.4136, dev acc: 0.8299
*** Epoch 13: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.27it/s]
100%|██████████| 208/208 [00:01<00:00, 189.02it/s]


Epoch 14: train loss: 0.4347, train acc: 0.8156
Epoch 14: dev loss: 0.4111, dev acc: 0.8313
*** Epoch 14: dev acc higher than best dev acc, model saved!



100%|██████████| 832/832 [00:13<00:00, 62.29it/s]
100%|██████████| 208/208 [00:01<00:00, 186.10it/s]


Epoch 15: train loss: 0.4302, train acc: 0.8189
Epoch 15: dev loss: 0.4085, dev acc: 0.8338
*** Epoch 15: dev acc higher than best dev acc, model saved!

Training finished! Best epoch is 15, best dev acc is 0.8338, 14.539275646209717 seconds used.


# test the model

In [None]:
# Load the held-out test split.
test_csv = pd.read_csv('../data/ourdata/full_test.csv', header=0)

# Keep the metadata, label and text columns in a separate frame.
test_csv_new = pd.DataFrame(
    test_csv,
    columns=["id", "version", "batch.tweet", label_to_class, "tweet.id", "tweet_hashed"],
)
# Remove every row with a missing value in any of those columns.
test_csv_new_clean = test_csv_new.dropna(how="any", axis=0)

# Tokenise and pair with labels exactly as was done for the train/dev data.
X_test = list(map(nltk.word_tokenize, test_csv_new_clean["tweet_hashed"]))
y_test = list(test_csv_new_clean[label_to_class])

test_data = [(tokens, label) for tokens, label in zip(X_test, y_test)]

# No shuffling: predictions must stay aligned with the rows of test_csv_new_clean.
test_loader = DataLoader(
    dataset=test_data,
    batch_size=BATCH_SIZE,
    collate_fn=lambda samples: collate(samples, vocab, DEVICE),
)

In [None]:
# Reload the checkpoint written by the training loop and run it over the test set.
best_model = torch.load("LSTM."+label_to_class, map_location=DEVICE)
best_model.eval()
preds_list = []
labels_list = []
with torch.no_grad():
    for texts, labels, lengths in tqdm(test_loader):
        output = best_model(texts, lengths)
        # Round probabilities to hard 0/1 predictions. reshape(-1) keeps them 1-D:
        # if the model returns a 0-dim tensor for a single-sample final batch,
        # .tolist() would give a bare float and extend() would raise.
        preds = torch.round(output).reshape(-1)
        labels_list.extend(labels.tolist())
        preds_list.extend(preds.tolist())

100%|██████████| 349/349 [00:01<00:00, 224.68it/s]


In [None]:
# Per-class precision / recall / F1 on the test set; class 0 is reported as
# "non-<label>" and class 1 as "<label>".
print(
    classification_report(
        labels_list,
        preds_list,
        target_names=["non-"+label_to_class, label_to_class],
    )
)

                        precision    recall  f1-score   support

non-offensive.language       0.75      0.75      0.75      9832
    offensive.language       0.80      0.80      0.80     12450

              accuracy                           0.78     22282
             macro avg       0.78      0.78      0.78     22282
          weighted avg       0.78      0.78      0.78     22282



In [None]:
# Same metrics as raw arrays, displayed as the cell output:
# (precision, recall, f-score, support), one entry per class.
precision_recall_fscore_support(labels_list, preds_list)

(array([0.74885752, 0.80233213]),
 array([0.75      , 0.80136546]),
 array([0.74942832, 0.8018485 ]),
 array([ 9832, 12450]))

In [None]:
# Attach the model predictions as the last column of the test frame.
preds_column = label_to_class+"_preds_lstm_full"
if preds_column in test_csv_new_clean.columns:
    # The cell was already run: drop the stale column first, because insert()
    # raises ValueError when the column name already exists.
    test_csv_new_clean = test_csv_new_clean.drop(columns=preds_column)
# Insert after the current last column (position 6 for the six selected columns).
test_csv_new_clean.insert(len(test_csv_new_clean.columns), preds_column, preds_list)

In [None]:
# Display the test frame together with the newly added prediction column.
test_csv_new_clean

Unnamed: 0,id,version,batch.tweet,offensive.language,tweet.id,tweet_hashed,offensive.language_preds_lstm_full
0,125,E,R1,0.0,1,@###### bro that hoe live,1.0
1,173,E,R1,1.0,1,@###### bro that hoe live,1.0
2,219,E,R1,1.0,1,@###### bro that hoe live,1.0
3,223,E,R1,1.0,1,@###### bro that hoe live,1.0
4,228,E,R1,1.0,1,@###### bro that hoe live,1.0
...,...,...,...,...,...,...,...
22297,1315,A,R43,1.0,2993,Lol that's some hoe shit,1.0
22298,1324,A,R43,1.0,2993,Lol that's some hoe shit,1.0
22299,1312,A,R49,1.0,2999,RT @###### My favorite episode of Friends is t...,0.0
22300,1315,A,R49,0.0,2999,RT @###### My favorite episode of Friends is t...,0.0


In [None]:
# Persist the test rows plus predictions under ../data/preds/. No `index`
# argument is passed, so pandas' default applies and the row index is written too.
test_csv_new_clean.to_csv("../data/preds/" + label_to_class+ "_preds_lstm_full.csv")