### Train and test with LSTM model

In [None]:
import torch.nn as nn
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
import numpy as np
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import nltk

In [4]:
# Result containers: one training-log frame plus one prediction frame per test version.
# The log has a label column ("Epoch") followed by one column per epoch, "1" .. "20".
l = ["Epoch"] + [str(epoch_no) for epoch_no in range(1, 21)]
train_log = pd.DataFrame(columns=l)

# Raw test files, keyed by data version.
test_csv = {
    test_data_version: pd.read_csv(f'../data/version{test_data_version}_test_s.csv', header=0)
    for test_data_version in ["A","B","C","D","E"]
}
# Working copies restricted to the original columns; prediction columns are added to these later.
# Built through the DataFrame constructor (not plain column indexing), as in the original cell.
test_csv_new = {
    test_data_version: pd.DataFrame(
        raw_frame,
        columns=["id","version","batch.tweet","tweet.id", "tweet_hashed", "hate.speech", "offensive.language"],
    )
    for test_data_version, raw_frame in test_csv.items()
}

In [None]:
for SEED in [10,42,84,420,567,888,1100,1234,5566,7890]: ### train with seeds
    random.seed(SEED)
    for label_to_class in ["hate.speech", "offensive.language"]:
        for version in ["A","B","C", "D","E"]:
            #### build the model
            class LSTM(nn.Module):
                """Bidirectional LSTM binary classifier with max-over-time pooling.

                Pipeline: embedding -> dropout -> BiLSTM on the packed batch ->
                max over the time axis -> dropout -> linear -> dropout -> linear -> sigmoid.

                Args:
                    vocab_size: number of rows in the embedding table.
                    emb_size: embedding dimension.
                    lstm_size: hidden size of each LSTM direction.
                    hidden_size: width of the intermediate linear layer.
                    dropout: probability used by the single shared dropout module.
                """
                def __init__(self,vocab_size, emb_size, lstm_size, hidden_size, dropout):
                    super().__init__()
                    self.emb = nn.Embedding(vocab_size, emb_size)
                    self.lstm = nn.LSTM(emb_size, lstm_size, bidirectional=True, batch_first=True)
                    self.hidden = nn.Linear(lstm_size*2, hidden_size)
                    self.linear = nn.Linear(hidden_size, 1)
                    self.sigmoid = nn.Sigmoid()
                    self.dropout = nn.Dropout(dropout)

                def forward(self, input, lengths): # input.shape: (batch_size, texts_length)
                    """Score a padded batch of token ids.

                    Args:
                        input: token-id tensor of shape (batch_size, texts_length).
                        lengths: unpadded length of every text, handed to
                            ``pack_padded_sequence`` (no sorting required).

                    Returns:
                        Sigmoid outputs of shape (batch_size,). The batch dimension is
                        kept even when the batch holds a single sample.
                    """
                    emb = self.emb(input) # (batch_size, texts_length, emb_size)
                    emb = self.dropout(emb)
                    packed = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
                    lstm, _ = self.lstm(packed.float())  # PackedSequence of BiLSTM outputs
                    padded, _ = pad_packed_sequence(lstm, batch_first=True) # (batch_size, texts_length, lstm_size*2)
                    # NOTE(review): pad_packed_sequence fills padded steps with 0.0 by default, so
                    # those zeros take part in the max below. Left unchanged to keep results
                    # comparable with earlier runs - confirm whether this is intended.
                    output = torch.max(padded, dim=1).values # max pooling, (batch_size, lstm_size*2)
                    output = self.hidden(self.dropout(output)) # (batch_size, hidden_size)
                    output = self.linear(self.dropout(output)) # (batch_size, 1)
                    output = self.sigmoid(output)
                    # Squeeze only the trailing feature axis. A bare squeeze() also removed the
                    # batch axis for a one-sample batch and returned a 0-d tensor, which does not
                    # match the (1,) label tensor and cannot be extended into a list.
                    return output.squeeze(-1) # (batch_size)
            # ---- hyper-parameters of this run ----
            EPOCHS = 20
            BATCH_SIZE = 64
            EMB_SIZE = LSTM_SIZE = 512
            HIDDEN_SIZE = 256
            DROPOUT = 0.3
            VOCAB_SIZE = 5000
            LEARNING_RATE = 5e-05
            DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            tokenizer = nltk.tokenize.TweetTokenizer()

            # ---- train/dev split belonging to this (version, label) pair ----
            train_csv = pd.read_csv(
                f"../data_sampled/train_dev_split/version{version}{label_to_class}_train_sampled_train.csv",
                header=0,
            )
            dev_csv = pd.read_csv(
                f"../data_sampled/train_dev_split/version{version}{label_to_class}_train_sampled_dev.csv",
                header=0,
            )

            # Lower-case and tokenize every tweet; labels come from the column named after the task.
            X_train = [tokenizer.tokenize(text.lower()) for text in train_csv["tweet_hashed"]]
            y_train = list(train_csv[label_to_class])
            train_data = list(zip(X_train, y_train))

            X_dev = [tokenizer.tokenize(text.lower()) for text in dev_csv["tweet_hashed"]]
            y_dev = list(dev_csv[label_to_class])
            dev_data = list(zip(X_dev, y_dev))

            # Vocabulary is built from the training tokens only and capped at VOCAB_SIZE entries.
            # Specials come first: index 0 is '<unk>' (also the default for unknown tokens),
            # index 1 is '<pad>'.
            vocab = build_vocab_from_iterator(X_train, max_tokens=VOCAB_SIZE, specials=["<unk>", "<pad>"])
            vocab.set_default_index(vocab["<unk>"])
            # Stored on disk so the test stage can reload it for this label/version.
            torch.save(vocab, f"lstm.vocab.{label_to_class}{version}")

            def collate(batch, vocab, device):
                """Convert a list of (tokens, label) pairs into padded model inputs.

                Args:
                    batch: sequence of (token list, label) pairs.
                    vocab: object mapping a token to its integer id via ``vocab[token]``.
                    device: device the id tensor and the label tensor are moved to.

                Returns:
                    Tuple ``(ids, labels, lengths)``: ids padded with value 1 to the longest
                    text in the batch, the label tensor (both on ``device``), and a
                    LongTensor of unpadded lengths that is not moved to ``device``.
                """
                token_lists, targets = zip(*batch)
                seq_lengths = []
                id_tensors = []
                for tokens in token_lists:
                    seq_lengths.append(len(tokens))
                    id_tensors.append(torch.LongTensor([vocab[token] for token in tokens]))
                # padding_value=1 is the pad id used throughout this script
                padded_ids = pad_sequence(id_tensors, batch_first=True, padding_value=1)
                label_tensor = torch.tensor(targets)
                return padded_ids.to(device), label_tensor.to(device), torch.LongTensor(seq_lengths)

            # Seed torch's global RNG before anything random is created for this run.
            # The enclosing loop only calls random.seed(SEED), which seeds Python's `random`
            # module; weight initialisation, dropout and DataLoader shuffling draw from
            # torch's generator, so without this line SEED had no effect on training.
            torch.manual_seed(SEED)

            # Training batches are reshuffled every epoch; dev batches keep file order.
            train_loader = DataLoader(dataset=train_data,
                                    batch_size=BATCH_SIZE,
                                    shuffle=True,
                                    collate_fn=lambda batch: collate(batch, vocab, DEVICE))
            dev_loader = DataLoader(dataset=dev_data,
                                    batch_size=BATCH_SIZE,
                                    collate_fn=lambda batch: collate(batch, vocab, DEVICE))

            model = LSTM(VOCAB_SIZE, EMB_SIZE, LSTM_SIZE, HIDDEN_SIZE, DROPOUT).to(DEVICE)
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            # The model already ends in a sigmoid, hence BCELoss rather than a logits loss.
            criterion = nn.BCELoss()
            # Best dev accuracy seen so far and the (1-based) epoch that produced it.
            best_dev_acc = 0
            best_epoch = 0
            print(f'Start training seed {SEED}, {label_to_class}, {version}...')
            # Each list becomes one row of train_log: a row label followed by one value per epoch.
            train_acc_list = ["seed"+str(SEED)+"_"+label_to_class+version+"_train"]
            dev_acc_list = ["seed"+str(SEED)+"_"+label_to_class+version+"_dev"]
            train_loss_list = ["seed"+str(SEED)+"_"+label_to_class+version+"_train_loss"]
            # NOTE(review): unlike the three labels above, this one has no "_" after the seed.
            # Kept as-is so the saved log stays compatible with earlier outputs - confirm
            # before normalising.
            dev_loss_list = ["seed"+str(SEED)+label_to_class+version+"_dev_loss"]
            
            # One pass over the training data and one over the dev data per epoch; the model
            # with the highest dev accuracy so far is written to disk.
            for epoch in range(EPOCHS):
                ### train
                train_loss = 0
                train_true_list=[]
                train_preds_list=[]
                train_preds_score_list=[]
                model.train()
                for texts, labels, lengths in tqdm(train_loader):
                    model.zero_grad()
                    # reshape(-1) guarantees a 1-d (batch_size,) output. The model squeezes its
                    # output, so a one-sample batch came back 0-d; such batches used to be
                    # skipped silently by a shape check and are now trained on like any other.
                    output = model(texts, lengths).reshape(-1)
                    loss = criterion(output, labels.float())
                    loss.backward()
                    optimizer.step()
                    preds = torch.round(output)  # threshold the sigmoid output at 0.5
                    train_loss += loss.item()
                    train_preds_list.extend(preds.tolist())
                    train_true_list.extend(labels.tolist())
                    train_preds_score_list.extend(output.tolist())
                # mean of the per-batch losses
                train_loss = train_loss / len(train_loader)
                train_loss_list.append(train_loss)
                train_acc = accuracy_score(train_true_list,train_preds_list)
                train_acc_list.append(train_acc)

                ### dev
                dev_loss = 0
                dev_true_list=[]
                dev_preds_list=[]
                dev_preds_score_list=[]
                model.eval()
                with torch.no_grad():
                    for texts, labels, lengths in tqdm(dev_loader):
                        # Same reshape as in training: without it a one-sample final batch
                        # made BCELoss fail on mismatching input/target shapes.
                        output = model(texts, lengths).reshape(-1)
                        loss = criterion(output, labels.float())
                        preds = torch.round(output)
                        dev_loss += loss.item()
                        dev_preds_list.extend(preds.tolist())
                        dev_true_list.extend(labels.tolist())
                        dev_preds_score_list.extend(output.tolist())
                dev_loss = dev_loss / len(dev_loader)
                dev_loss_list.append(dev_loss)
                dev_acc = accuracy_score(dev_true_list,dev_preds_list)
                dev_acc_list.append(dev_acc)


                print(f'Epoch {epoch + 1}: train loss: {train_loss:.4f}, train acc: {train_acc:.4f}')
                print(f'Epoch {epoch + 1}: dev loss: {dev_loss:.4f}, dev acc: {dev_acc:.4f}')
                # Strict ">" keeps the earliest epoch when several epochs tie on dev accuracy.
                if dev_acc > best_dev_acc:
                    best_dev_acc = dev_acc
                    best_epoch = epoch + 1
                    # The file name carries label and version but not the seed, so every seed
                    # overwrites the checkpoint written by the previous one.
                    torch.save(model, "lstm.model."+label_to_class+version)
                    print(f'*** Epoch {epoch + 1}: dev metric higher than best dev metric, model saved!')
                print()
            print(f'Training finished! Best epoch is {best_epoch}, best dev acc is {best_dev_acc:.4f}')

            # Append the four per-epoch series of this run as new rows at the end of the
            # training log, in the order: train acc, dev acc, train loss, dev loss.
            for log_row in (train_acc_list, dev_acc_list, train_loss_list, dev_loss_list):
                train_log.loc[len(train_log)] = log_row
    
    ### test
    # Score every test version with every saved (label, version) model of the current seed
    # and store hard predictions plus raw scores as new columns of the result frames.
    for test_data_version in ["A","B","C","D","E"]:
        BATCH_SIZE = 64
        DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = nltk.tokenize.TweetTokenizer()
        def collate_for_test(batch, vocab, device):
            """Convert a list of (tokens, label) pairs into padded ids and lengths.

            Args:
                batch: sequence of (token list, label) pairs; the labels are ignored.
                vocab: object mapping a token to its integer id via ``vocab[token]``.
                device: device the padded id tensor is moved to.

            Returns:
                Tuple ``(ids, lengths)``: ids padded with value 1 (on ``device``) and a
                LongTensor of unpadded lengths that is not moved to ``device``.
            """
            texts, _ = zip(*batch)  # labels are not needed for prediction
            lengths = [len(text) for text in texts]
            word_ids = [[vocab[word] for word in text] for text in texts]
            texts = pad_sequence([torch.LongTensor(ids) for ids in word_ids], batch_first=True, padding_value=1)
            return texts.to(device), torch.LongTensor(lengths)
        # preprocess the test data
        X_test = [tokenizer.tokenize(text.lower()) for text in list(test_csv_new[test_data_version]["tweet_hashed"])]
        # choose trained model version to test
        for label_to_test in ["hate.speech", "offensive.language"]:
            for version_to_test in ["A","B","C","D","E"]:
                # NOTE: torch.load unpickles whole Python objects here (vocab and full model).
                # These files are written earlier by this script; do not point these paths
                # at files from an untrusted source.
                vocab_test = torch.load("lstm.vocab."+label_to_test+version_to_test, map_location=DEVICE)
                model_test = torch.load("lstm.model."+label_to_test+version_to_test, map_location=DEVICE)
                y_test = list(test_csv_new[test_data_version][label_to_test])
                test_data = list(zip(X_test, y_test))
                test_loader = DataLoader(dataset=test_data,
                                batch_size=BATCH_SIZE,
                                collate_fn=lambda batch: collate_for_test(batch, vocab_test, DEVICE))
                model_test.eval()
                preds_list = []
                preds_scores = []
                with torch.no_grad():
                    for texts, lengths in tqdm(test_loader):
                        # reshape(-1) keeps the output 1-d even for a one-sample final batch;
                        # a 0-d output would turn tolist() into a bare float and make the
                        # extend() calls below raise TypeError.
                        output = model_test(texts, lengths).reshape(-1)
                        preds = torch.round(output)
                        preds_list.extend(preds.tolist())
                        preds_scores.extend(output.tolist())
                preds_list = [int(i) for i in preds_list]
                # Column names encode seed, label and the data version the model was trained on.
                test_csv_new[test_data_version]["seed"+str(SEED)+"_"+label_to_test+"_preds_"+version_to_test] = preds_list
                test_csv_new[test_data_version]["seed"+str(SEED)+"_"+label_to_test+"_preds_"+version_to_test+"_scores"]=preds_scores

In [None]:
# Write the training log and one prediction file per test data version to disk.
train_log.to_csv("seed_lstm_train_accs_sampled.csv")
for test_data_version, result_frame in test_csv_new.items():
    result_frame.to_csv(f"seed_lstm_test{test_data_version}_sampled.csv")