# LSTM

In [None]:
label_to_class = "hate.speech" # enter the label to be classified
version = "A" # enter the version


In [3]:
import torch.nn as nn
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import time
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import nltk
from collections import Counter

In [4]:
# build the model
class LSTM(nn.Module):
  def __init__(self,vocab_size, emb_size, lstm_size, hidden_size, dropout):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_size)
    self.lstm = nn.LSTM(emb_size, lstm_size, bidirectional=True, batch_first=True)
    self.hidden = nn.Linear(lstm_size*2, hidden_size)
    self.linear = nn.Linear(hidden_size, 1)
    self.sigmoid = nn.Sigmoid()
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, lengths): # input.shape: (batch_size, texts_length)
    emb = self.emb(input) # (batch_size, texts_length, emb_size)
    emb = self.dropout(emb)
    packed = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)  #PackedSequence: data (packed length, lstm_size*2)
    lstm, _ = self.lstm(packed.float())  # (batch_size, texts_length, lstm_size*2)
    padded, _ = pad_packed_sequence(lstm, batch_first=True)
    output = torch.max(padded, dim=1).values # max pooling, (batch_size, lstm_size*2)
    output = self.hidden(self.dropout(output)) # (batch_size, hidden_size)
    output = self.linear(self.dropout(output)) # (batch_size, 1)
    output = self.sigmoid(output)
    return output.squeeze() # (batch_size)

In [5]:
EPOCHS = 20
BATCH_SIZE = 64
EMB_SIZE = 512
LSTM_SIZE = 512
HIDDEN_SIZE = 256
DROPOUT = 0.3
VOCAB_SIZE=5000
LEARNING_RATE = 5e-05
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = nltk.tokenize.TweetTokenizer()

In [None]:
# no drops
# csv = pd.read_csv('../data/version'+version+'_train.csv',header=0)

# # create a new csv df
# csv_new = pd.DataFrame(csv, columns=[label_to_class, "tweet_hashed"])
# # drop all rows that have any NaN values
# csv_new_clean = csv_new.dropna(axis=0,how="any")

# drops to 3 examples per tweet
csv = pd.read_csv('../data/version'+version+'_train.csv',header=0)
# create a new csv df
csv_new = pd.DataFrame(csv, columns=[label_to_class, "tweet_hashed"])
# drop all rows that have any NaN values
csv_new_clean = csv_new.dropna(axis=0,how="any")
# save tweets into a list
tweet=list(csv_new_clean['tweet_hashed'])
# count tweet freqs
tweet_count=Counter(tweet)
for t, c in tweet_count.items():
    if c > 3:
        # get the index for a specific tweet into a list
        index = csv_new_clean[csv_new_clean.tweet_hashed == t].index.tolist()
        # randomly choose index to drop
        index_to_drop = random.sample(index, c-3)
        csv_new_clean = csv_new_clean.drop(index_to_drop, axis=0)

In [None]:
train_csv, dev_csv = train_test_split(csv_new_clean, test_size=0.2, random_state=42)

In [None]:
X_train = [tokenizer.tokenize(text.lower()) for text in list(train_csv["tweet_hashed"])]
X_dev = [tokenizer.tokenize(text.lower()) for text in list(dev_csv["tweet_hashed"])]

y_train = list(train_csv[label_to_class])
y_dev = list(dev_csv[label_to_class])

In [None]:
train_data = list(zip(X_train, y_train))
dev_data = list(zip(X_dev, y_dev))

In [None]:
vocab = build_vocab_from_iterator(X_train, max_tokens=VOCAB_SIZE, specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])  # index 0 reserved for '<unk>' as default, 1 reserved for '<pad>'
torch.save(vocab, "LSTM.vocab."+label_to_class+version)

In [None]:
def collate(batch, vocab, device):
    texts, labels = zip(*batch)
    lengths = [len(text) for text in texts]
    word_ids = [[vocab[word] for word in text] for text in texts]
    texts = pad_sequence([torch.LongTensor(ids) for ids in word_ids], batch_first=True, padding_value=1)
    return texts.to(device), torch.tensor(labels).to(device), torch.LongTensor(lengths)

In [None]:
train_loader = DataLoader(dataset=train_data,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn=lambda batch: collate(batch, vocab, DEVICE))
dev_loader = DataLoader(dataset=dev_data,
                        batch_size=BATCH_SIZE,
                        collate_fn=lambda batch: collate(batch, vocab, DEVICE))

In [None]:
model = LSTM(VOCAB_SIZE, EMB_SIZE, LSTM_SIZE, HIDDEN_SIZE, DROPOUT).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()

In [None]:
best_dev_acc = 0
best_epoch = 0
print(f'Start training...')
start_time = time.time()
for epoch in range(EPOCHS):
    # train
    train_loss = 0
    train_acc = 0
    model.train()
    for texts, labels, lengths in tqdm(train_loader):
        output = model(texts, lengths)
        preds = torch.round(output)
        #acc = (output.ge(0.5) == labels).sum().item() / labels.size(0)
        acc = torch.eq(labels, preds).sum().item() / labels.size(0)
        model.zero_grad()
        loss = criterion(output, labels.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += acc
    train_loss, train_acc = train_loss / len(train_loader), train_acc / len(train_loader)
    
    # dev
    dev_loss = 0
    dev_acc = 0
    model.eval()
    with torch.no_grad():
        for texts, labels, lengths in tqdm(dev_loader):
            output = model(texts, lengths)
            loss = criterion(output, labels.float())
            preds = torch.round(output)
            #acc = (output.ge(0.5) == labels).sum().item() / labels.size(0)
            acc = torch.eq(labels, preds).sum().item() / labels.size(0)
            dev_loss += loss.item()
            dev_acc += acc
    dev_loss, dev_acc = dev_loss / len(dev_loader), dev_acc / len(dev_loader)

    print(f'Epoch {epoch + 1}: train loss: {train_loss:.4f}, train acc: {train_acc:.4f}')
    print(f'Epoch {epoch + 1}: dev loss: {dev_loss:.4f}, dev acc: {dev_acc:.4f}')
    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        best_epoch = epoch + 1
        torch.save(model, "LSTM.model."+label_to_class+version)
        print(f'*** Epoch {epoch + 1}: dev acc higher than best dev acc, model saved!')
    print()
sec = time.time()-start_time
print(f'Training finished! Best epoch is {best_epoch}, best dev acc is {best_dev_acc:.4f}, {sec} seconds used.')

Start training...


100%|██████████| 118/118 [00:03<00:00, 37.81it/s]
100%|██████████| 30/30 [00:00<00:00, 138.98it/s]


Epoch 1: train loss: 0.6235, train acc: 0.6780
Epoch 1: dev loss: 0.6184, dev acc: 0.6775
*** Epoch 1: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 36.06it/s]
100%|██████████| 30/30 [00:00<00:00, 109.23it/s]


Epoch 2: train loss: 0.6068, train acc: 0.6859
Epoch 2: dev loss: 0.6081, dev acc: 0.6858
*** Epoch 2: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 38.57it/s]
100%|██████████| 30/30 [00:00<00:00, 145.65it/s]


Epoch 3: train loss: 0.5888, train acc: 0.6965
Epoch 3: dev loss: 0.5796, dev acc: 0.7004
*** Epoch 3: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 38.95it/s]
100%|██████████| 30/30 [00:00<00:00, 141.14it/s]


Epoch 4: train loss: 0.5702, train acc: 0.7142
Epoch 4: dev loss: 0.5543, dev acc: 0.7390
*** Epoch 4: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 38.81it/s]
100%|██████████| 30/30 [00:00<00:00, 131.64it/s]


Epoch 5: train loss: 0.5405, train acc: 0.7349
Epoch 5: dev loss: 0.5309, dev acc: 0.7437
*** Epoch 5: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 35.93it/s]
100%|██████████| 30/30 [00:00<00:00, 109.96it/s]


Epoch 6: train loss: 0.5154, train acc: 0.7515
Epoch 6: dev loss: 0.5091, dev acc: 0.7656
*** Epoch 6: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 38.75it/s]
100%|██████████| 30/30 [00:00<00:00, 135.47it/s]


Epoch 7: train loss: 0.4957, train acc: 0.7695
Epoch 7: dev loss: 0.4983, dev acc: 0.7676
*** Epoch 7: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 38.52it/s]
100%|██████████| 30/30 [00:00<00:00, 134.35it/s]


Epoch 8: train loss: 0.4689, train acc: 0.7884
Epoch 8: dev loss: 0.5206, dev acc: 0.7576



100%|██████████| 118/118 [00:03<00:00, 38.59it/s]
100%|██████████| 30/30 [00:00<00:00, 135.93it/s]


Epoch 9: train loss: 0.4556, train acc: 0.7863
Epoch 9: dev loss: 0.5114, dev acc: 0.7587



100%|██████████| 118/118 [00:03<00:00, 35.21it/s]
100%|██████████| 30/30 [00:00<00:00, 117.70it/s]


Epoch 10: train loss: 0.4461, train acc: 0.7925
Epoch 10: dev loss: 0.5089, dev acc: 0.7691
*** Epoch 10: dev acc higher than best dev acc, model saved!



100%|██████████| 118/118 [00:03<00:00, 38.82it/s]
100%|██████████| 30/30 [00:00<00:00, 130.06it/s]


Epoch 11: train loss: 0.4352, train acc: 0.7999
Epoch 11: dev loss: 0.5229, dev acc: 0.7623



100%|██████████| 118/118 [00:03<00:00, 38.34it/s]
100%|██████████| 30/30 [00:00<00:00, 138.31it/s]


Epoch 12: train loss: 0.4235, train acc: 0.8053
Epoch 12: dev loss: 0.5312, dev acc: 0.7618



100%|██████████| 118/118 [00:03<00:00, 38.90it/s]
100%|██████████| 30/30 [00:00<00:00, 145.65it/s]


Epoch 13: train loss: 0.4144, train acc: 0.8104
Epoch 13: dev loss: 0.5377, dev acc: 0.7623



100%|██████████| 118/118 [00:03<00:00, 36.12it/s]
100%|██████████| 30/30 [00:00<00:00, 115.28it/s]


Epoch 14: train loss: 0.4098, train acc: 0.8146
Epoch 14: dev loss: 0.5465, dev acc: 0.7587



100%|██████████| 118/118 [00:03<00:00, 38.65it/s]
100%|██████████| 30/30 [00:00<00:00, 141.85it/s]


Epoch 15: train loss: 0.4007, train acc: 0.8148
Epoch 15: dev loss: 0.5814, dev acc: 0.7514



100%|██████████| 118/118 [00:03<00:00, 38.85it/s]
100%|██████████| 30/30 [00:00<00:00, 142.42it/s]


Epoch 16: train loss: 0.3979, train acc: 0.8144
Epoch 16: dev loss: 0.5512, dev acc: 0.7623



100%|██████████| 118/118 [00:03<00:00, 39.01it/s]
100%|██████████| 30/30 [00:00<00:00, 142.48it/s]


Epoch 17: train loss: 0.3964, train acc: 0.8128
Epoch 17: dev loss: 0.5691, dev acc: 0.7603



100%|██████████| 118/118 [00:03<00:00, 36.61it/s]
100%|██████████| 30/30 [00:00<00:00, 117.68it/s]


Epoch 18: train loss: 0.3943, train acc: 0.8155
Epoch 18: dev loss: 0.5711, dev acc: 0.7624



100%|██████████| 118/118 [00:03<00:00, 38.42it/s]
100%|██████████| 30/30 [00:00<00:00, 146.26it/s]


Epoch 19: train loss: 0.3822, train acc: 0.8179
Epoch 19: dev loss: 0.5740, dev acc: 0.7566



100%|██████████| 118/118 [00:03<00:00, 39.00it/s]
100%|██████████| 30/30 [00:00<00:00, 141.28it/s]

Epoch 20: train loss: 0.3781, train acc: 0.8186
Epoch 20: dev loss: 0.5935, dev acc: 0.7519

Training finished! Best epoch is 10, best dev acc is 0.7691, 67.89233040809631 seconds used.





# test the models

In [18]:
test_data_version = "A"
BATCH_SIZE = 64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = nltk.tokenize.TweetTokenizer()

def collate_for_test(batch, vocab, device):
    texts, labels = zip(*batch)
    lengths = [len(text) for text in texts]
    word_ids = [[vocab[word] for word in text] for text in texts]
    texts = pad_sequence([torch.LongTensor(ids) for ids in word_ids], batch_first=True, padding_value=1)
    return texts.to(device), torch.LongTensor(lengths)

# init test file
test_csv = pd.read_csv('./data/version'+test_data_version+'_test.csv', header=0)

# create a new csv df with all the original columns
test_csv_new = pd.DataFrame(test_csv, columns=["id",	"version",	"batch.tweet", "tweet.id", "tweet_hashed", "hate.speech", "offensive.language"])

# preprocess the test data

X_test = [tokenizer.tokenize(text.lower()) for text in list(test_csv_new["tweet_hashed"])]

In [19]:
# choose trained model version to test

for label_to_test in ["hate.speech", "offensive.language"]:
  for version_to_test in ["A","B","C","D","E"]:

    vocab_test = torch.load("LSTM.vocab."+label_to_test+version_to_test, map_location=DEVICE)
    model_test = torch.load("LSTM.model."+label_to_test+version_to_test, map_location=DEVICE)
    
    y_test = list(test_csv_new[label_to_test])
    test_data = list(zip(X_test, y_test))

    test_loader = DataLoader(dataset=test_data,
                            batch_size=BATCH_SIZE,
                            collate_fn=lambda batch: collate_for_test(batch, vocab_test, DEVICE))

    model_test.eval()
    preds_list = []
    preds_scores = []
    with torch.no_grad():
        for texts, lengths in tqdm(test_loader):
            output = model_test(texts, lengths)
            preds = torch.round(output)
            preds_list.extend(preds.tolist())
            preds_scores.extend(output.tolist())

    preds_list = [int(i) for i in preds_list]

    if label_to_test == "hate.speech":
      if version_to_test == "A":
        column = 7
      elif version_to_test == "B":
        column = 9
      elif version_to_test == "C":
        column = 11
      elif version_to_test == "D":
        column = 13
      elif version_to_test == "E":
        column = 15
      else:
        raise KeyError
    elif label_to_test == "offensive.language":
      if version_to_test == "A":
        column = 17
      elif version_to_test == "B":
        column = 19
      elif version_to_test == "C":
        column = 21
      elif version_to_test == "D":
        column = 23
      elif version_to_test == "E":
        column = 25
      else:
        raise KeyError
    else:
      raise KeyError

    test_csv_new.insert(column,label_to_test+"_preds_"+version_to_test,preds_list)
    test_csv_new.insert(column+1,label_to_test+"_preds_"+version_to_test+"_scores",preds_scores)

test_csv_new

100%|██████████| 53/53 [00:00<00:00, 126.03it/s]
100%|██████████| 53/53 [00:00<00:00, 140.73it/s]
100%|██████████| 53/53 [00:01<00:00, 48.13it/s]
100%|██████████| 53/53 [00:00<00:00, 119.10it/s]
100%|██████████| 53/53 [00:00<00:00, 137.36it/s]
100%|██████████| 53/53 [00:00<00:00, 155.78it/s]
100%|██████████| 53/53 [00:00<00:00, 165.49it/s]
100%|██████████| 53/53 [00:00<00:00, 166.30it/s]
100%|██████████| 53/53 [00:00<00:00, 163.35it/s]
100%|██████████| 53/53 [00:00<00:00, 162.83it/s]


Unnamed: 0,id,version,batch.tweet,tweet.id,tweet_hashed,hate.speech,offensive.language,hate.speech_preds_A,hate.speech_preds_A_scores,hate.speech_preds_B,...,offensive.language_preds_A,offensive.language_preds_A_scores,offensive.language_preds_B,offensive.language_preds_B_scores,offensive.language_preds_C,offensive.language_preds_C_scores,offensive.language_preds_D,offensive.language_preds_D_scores,offensive.language_preds_E,offensive.language_preds_E_scores
0,125,E,R1,1,@###### bro that hoe live,1.0,0,0,0.062568,0,...,1,0.903320,1,0.844781,1,0.873763,1,0.785041,1,0.873017
1,173,E,R1,1,@###### bro that hoe live,0.0,1,0,0.062568,0,...,1,0.903320,1,0.844781,1,0.873763,1,0.785041,1,0.873017
2,219,E,R1,1,@###### bro that hoe live,0.0,1,0,0.062568,0,...,1,0.903320,1,0.844781,1,0.873763,1,0.785041,1,0.873017
3,223,E,R1,1,@###### bro that hoe live,0.0,1,0,0.062568,0,...,1,0.903320,1,0.844781,1,0.873763,1,0.785041,1,0.873017
4,228,E,R1,1,@###### bro that hoe live,0.0,1,0,0.062568,0,...,1,0.903320,1,0.844781,1,0.873763,1,0.785041,1,0.873017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3330,996,E,R49,2999,RT @###### My favorite episode of Friends is t...,0.0,1,0,0.255284,1,...,0,0.216544,0,0.307929,1,0.506910,1,0.766770,1,0.983291
3331,1073,E,R49,2999,RT @###### My favorite episode of Friends is t...,0.0,0,0,0.255284,1,...,0,0.216544,0,0.307929,1,0.506910,1,0.766770,1,0.983291
3332,1472,E,R49,2999,RT @###### My favorite episode of Friends is t...,0.0,0,0,0.255284,1,...,0,0.216544,0,0.307929,1,0.506910,1,0.766770,1,0.983291
3333,1481,E,R49,2999,RT @###### My favorite episode of Friends is t...,0.0,0,0,0.255284,1,...,0,0.216544,0,0.307929,1,0.506910,1,0.766770,1,0.983291


In [20]:
test_csv_new.to_csv("./preds/lstm_test"+test_data_version+".csv")