In [None]:
!pip install madgrad

In [None]:
# nltk is otherwise only imported in a later cell; import it here so this cell
# also works on a fresh kernel (Restart & Run All).
import nltk

# Download the Punkt tokenizer data used by nltk.word_tokenize.
nltk.download('punkt')


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
import pandas as pd
import gzip
import json
import numpy as np
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import re
import os
import sys
import math
import time
import nltk
import torch
import random
import string
import collections

import numpy as np
import pandas as pd
import torch.nn as nn

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec
from nltk.corpus import brown
from sklearn.manifold import TSNE
from torch.autograd import Variable
from torchtext import data, datasets
from torchtext.vocab import Vectors

from IPython.display import Image, YouTubeVideo
from torch.nn import functional as F

from sklearn.model_selection import train_test_split

from madgrad import MADGRAD

from torchtext.legacy import data

import time

In [None]:
# Mount Google Drive at /content/drive; the dataset and result paths used
# in later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Number of examples per mini-batch handed to the iterators.
batch_size = 32

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Upper bound on the number of tokens kept when the vocabulary is built.
max_vocab = 40_000

def load_dataset(train_path = '/content/drive/Shareddrives/519 Project/Data/preprocessed/Final Data/neutrality_binary_train.json',
                 test_path = '/content/drive/Shareddrives/519 Project/Data/preprocessed/Final Data/neutrality_binary_test.json',
                 split_ratio = 0.75,
                 seed = 42):
    """Load the neutrality JSON data and build vocab plus train/valid/test iterators.

    Args:
        train_path: JSON file holding the training examples.
        test_path: JSON file holding the test examples.
        split_ratio: fraction of the training examples kept for training;
            the remainder becomes the validation set.
        seed: seed for the `random` module, used to make the train/validation
            split reproducible.

    Returns:
        (TEXT, vocab_size, train_batches, valid_batches, test_batches) where TEXT
        is the text field carrying the vocabulary, vocab_size is len(TEXT.vocab)
        and the three *_batches objects are BucketIterators.

    Reads the module-level `batch_size` and `max_vocab` settings.
    """
    # include_lengths=True makes batch.text a (tokens, lengths) pair;
    # batch_first=True puts the batch dimension first in the token tensor.
    TEXT = data.Field(tokenize = nltk.word_tokenize,
                      include_lengths = True, batch_first = True)
    LABEL = data.LabelField(dtype = torch.float)

    # JSON key -> (attribute name on each example, field that processes it)
    fields = {'title_plus_review': ('text', TEXT), 'neutrality': ('label', LABEL)}
    train_data, test_data = data.TabularDataset.splits(
        path = '',
        train = train_path,
        test = test_path,
        format = 'json',
        fields = fields
    )

    # We use "glove.6B.100d" for 6 billion 100-dimensional glove embeddings and set the
    # non-glove words via Gaussian distribution.
    # NOTE(review): the vocabulary is built before the validation split below, so it also
    # covers tokens that only occur in validation examples.
    TEXT.build_vocab(train_data, max_size = max_vocab, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    # random.seed() returns None, so its result must not be passed as random_state.
    # Seed first, then hand the resulting generator state to split() explicitly.
    random.seed(seed)
    train_data, valid_data = train_data.split(split_ratio = split_ratio, random_state = random.getstate())

    # Use BucketIterator with sort_within_batch = True, sort_key=lambda x: len(x.text), and shuffle=True
    # to split batches into reviews of similar length and pad each batch accordingly.
    # This greatly reduces the number of non-useful pad tokens that have to be processed.
    train_batches, valid_batches, test_batches = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = batch_size, sort_key = lambda x: len(x.text),
        repeat = False, shuffle = True, sort_within_batch = True)
    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, train_batches, valid_batches, test_batches

In [None]:
%%time
# Build the text field, vocabulary size and the train/validation/test iterators
# that the training and evaluation cells below consume.
TEXT, vocab_size, train_batches, valid_batches, test_batches = load_dataset()

In [None]:
# Display the vocabulary size returned by load_dataset().
vocab_size

In [None]:
import pdb

Train, Test, Helper functions

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def _run_epoch(model, device, batches, criterion, optimizer = None):
    """Run one pass over `batches` and return (mean batch loss, accuracy).

    Weights are updated only when an `optimizer` is supplied; without one the
    pass is a pure evaluation (the caller is responsible for model.eval() and
    torch.no_grad()).
    """
    is_training = optimizer is not None
    running_loss = 0.
    correct, total = 0, 0

    for batch in batches:
        # batch.text is indexed with [0] to get the token tensor
        # (the text field is built with include_lengths=True).
        text = batch.text[0].to(device)
        # The criterion is applied to class indices, so cast the labels to long.
        target = batch.label.long().to(device)

        if is_training:
            optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, target)

        if is_training:
            loss.backward()
            # Gradient clipping bounds the gradient norm, guarding against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

        running_loss += loss.item()

        # Predicted class = index of the largest output per example.
        _, predicted = torch.max(output, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()

    return running_loss/len(batches), correct/total


def train(model, device, train_batches, valid_batches, epochs, learning_rate, weight_decay = 0, criterion = nn.CrossEntropyLoss(), optim = 'Adam', checkpoint_path = '/content/sample_data/RNN-train.pt'):
    """Train `model`, validating after every epoch and checkpointing the best epoch.

    Args:
        model: module mapping a token tensor to per-class scores.
        device: torch device the batches are moved to.
        train_batches, valid_batches: sized iterables of batches exposing
            `.text` (indexed with [0]) and `.label`.
        epochs: number of passes over `train_batches`.
        learning_rate, weight_decay: optimizer settings.
        criterion: loss applied to (output, target).
        optim: 'Adam' or 'Madgrad'.
        checkpoint_path: file the state dict of the best epoch (lowest
            validation loss) is saved to.

    Returns:
        (train_loss, train_acc, validation_loss, validation_acc,
         best_validation_loss, chosen_train_loss, chosen_validation_acc,
         chosen_train_acc): the first four are per-epoch lists, the last four
        describe the checkpointed epoch. The three `chosen_*` values are None
        when no epoch improved on the initial infinite loss (e.g. epochs == 0).

    Raises:
        ValueError: if `optim` is not a supported optimizer name.
    """
    if optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
    elif optim == 'Madgrad':
        optimizer = MADGRAD(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
    else:
        raise ValueError(f"Unsupported optimizer {optim!r}; expected 'Adam' or 'Madgrad'")

    # Start from an infinite best validation loss so the first finite epoch is always saved.
    best_validation_loss = float('inf')
    chosen_train_loss = chosen_validation_acc = chosen_train_acc = None
    train_loss, validation_loss = [], []
    train_acc, validation_acc = [], []

    for epoch in range(epochs):
        # train
        model.train()
        t_loss, t_acc = _run_epoch(model, device, train_batches, criterion, optimizer)
        train_loss.append(t_loss)
        train_acc.append(t_acc)

        print(f'Epoch: {epoch + 1},  Training Loss: {t_loss: .4f}, Training Accuracy: {100*t_acc: .2f}%')

        # validate (no optimizer is passed, so no weights change)
        model.eval()
        with torch.no_grad():
            v_loss, v_acc = _run_epoch(model, device, valid_batches, criterion)
        validation_loss.append(v_loss)
        validation_acc.append(v_acc)

        print (f'Validation Loss: {v_loss:.4f}, Validation Accuracy: {100*v_acc: .2f}%')

        # If the current epoch has the lowest validation loss, save the model state
        # so that state can be loaded again for testing.
        if v_loss < best_validation_loss:
            best_validation_loss = v_loss
            chosen_train_loss = t_loss
            chosen_validation_acc = v_acc
            chosen_train_acc = t_acc
            torch.save(model.state_dict(), checkpoint_path)

    return train_loss, train_acc, validation_loss, validation_acc, best_validation_loss, chosen_train_loss, chosen_validation_acc, chosen_train_acc

def get_metrics(true_labels, pred_labels):
    """Score predictions against the true labels.

    Returns the tuple (f1, accuracy, recall, precision), in that order.
    F1, recall and precision are macro-averaged over the classes.
    """
    macro = {'average': 'macro'}
    scores = (
        f1_score(true_labels, pred_labels, **macro),
        accuracy_score(true_labels, pred_labels),
        recall_score(true_labels, pred_labels, **macro),
        precision_score(true_labels, pred_labels, **macro),
    )
    return scores

def metric_test(model,  device, test_batches, choose_best_epoch = True, checkpoint_path = '/content/sample_data/RNN-train.pt'):
    """Evaluate `model` on `test_batches` and report classification metrics.

    Args:
        model: module mapping a token tensor to per-class scores.
        device: torch device the batches (and the loaded checkpoint) are placed on.
        test_batches: iterable of batches exposing `.text` (indexed with [0])
            and `.label`.
        choose_best_epoch: when True, first load the state dict stored at
            `checkpoint_path` into `model`.
        checkpoint_path: file holding the state dict to load.

    Returns:
        (f1, acc, rec, prec, model) as produced by get_metrics, plus the
        (possibly reloaded) model in eval mode.
    """
    # Load from best epoch. map_location lets a checkpoint saved on one device
    # (e.g. GPU) be restored when running on another (e.g. CPU).
    if choose_best_epoch:
        model.load_state_dict(torch.load(checkpoint_path, map_location = device))
    model.eval()

    predictions = []
    labels = []
    with torch.no_grad():
        for batch in test_batches:
            # batch.text is indexed with [0] to get the token tensor.
            text = batch.text[0].to(device)

            # Labels are only compared on the host, so collect them as integers on the CPU.
            labels.extend(batch.label.long().cpu().numpy())

            outputs = model(text)
            # Predicted class = index of the largest output per example.
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())

    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    f1, acc, rec, prec = get_metrics(labels, predictions)
    return f1, acc, rec, prec, model


Notes on the RNN models: these are the final models, though I experimented with many configurations and settings before landing here.

GRU RNN

In [None]:
class GRURNN(nn.Module):
    """Unidirectional multi-layer GRU classifier.

    Token ids are embedded, run through a GRU, and the final hidden state of
    every layer is concatenated and projected to `output_size` scores.

    Args:
        output_size: number of output classes.
        hidden_size: GRU hidden units per layer.
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        num_layers: number of stacked GRU layers; the classifier input width
            follows it automatically.
        dropout_prob: dropout applied between GRU layers.
    """
    def __init__(self, output_size, hidden_size, vocab_size, embed_size,  num_layers = 2, dropout_prob= 0):
        super(GRURNN, self).__init__()

        self.hidden_size = hidden_size
        # forward() needs the layer count to size the initial hidden state.
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_size)
        #Unidirectional model seemed to perform better than bidirectional
        self.gru = nn.GRU(embed_size, hidden_size, num_layers=num_layers, dropout=dropout_prob)
        self.dropout = nn.Dropout(dropout_prob)
        # One final hidden state per layer is concatenated in forward().
        self.fc = nn.Linear(num_layers*hidden_size, output_size)


    def forward(self, inputs):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_size) scores."""
        # (batch, seq, embed) -> (seq, batch, embed), the layout nn.GRU expects by default.
        embedded = self.embedding(inputs).permute(1, 0, 2)
        n_batch = embedded.size(1)

        # Zero initial hidden state, created on the same device/dtype as the input
        # so the module does not depend on a global `device`.
        h0 = torch.zeros(self.num_layers, n_batch, self.hidden_size,
                         device=embedded.device, dtype=embedded.dtype)
        _, hidden = self.gru(embedded, h0)

        # (layers, batch, hidden) -> (batch, layers*hidden)
        hidden = hidden.permute(1, 0, 2).contiguous().view(n_batch, -1)
        #Model actually seems to perform better without an additional dropout layer called
        #hidden = self.dropout(hidden)
        outs = self.fc(hidden)
        return outs
    


RNN with Bidirectional LSTM

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier.

    Token ids are embedded and run through a bidirectional LSTM; the outputs at
    the first and last time step are concatenated and projected to
    `output_size` scores.

    Args:
        output_size: number of output classes.
        hidden_size: LSTM hidden units per direction.
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        num_layers: number of stacked LSTM layers.
        dropout_prob: dropout between LSTM layers and before the classifier.
    """
    def __init__(self, output_size, hidden_size, vocab_size, embed_size,  num_layers = 2, dropout_prob = 0):
        # No extra keyword arguments are accepted, so none are forwarded to nn.Module.
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                               bidirectional=True, dropout = dropout_prob)

        self.dropout = nn.Dropout(dropout_prob)
        # Two time steps x two directions x hidden_size features.
        self.fc = nn.Linear(4 * hidden_size, output_size)


    def forward(self, inputs):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_size) scores."""
        # Transpose to (seq_len, batch) so the embedded tensor is sequence-first.
        embedded = self.embedding(inputs.T)

        # (no. of words, batch size, 2 * no. of hidden units).
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(embedded)

        # (batch size, 4 * no. of hidden units)
        # NOTE(review): for padded batches the last time step may be a pad position — confirm this is intended.
        x = torch.cat((outputs[0], outputs[-1]), dim=1)
        x = self.dropout(x)
        outs = self.fc(x)

        return outs

The same procedure was used for hyperparameter tuning on both the small samples and the full samples, as well as for final training/testing (with a single value in every array except the optimizer, which was a planned comparison).

Test accuracy was only used for the final test; maximum validation accuracy was used for hyperparameter tuning.

In [None]:
%%time

'''
lr = [.0002, .0005, .001, .005]
optimizers = ['Adam', 'Madgrad']

output_size = 5
hidden_size = [200, 300] 
embedding_length = 200
epochs = 10
dropout = [0, .3, .5]

num_layers = 2
'''

# Grid actually run in this cell: every combination of the lists below is trained once.
lr = [.001]
optimizers = ['Madgrad']

output_size = 2
hidden_size = [300] 
embedding_length = 200
epochs = 20
num_layers = 2
dropout = [.5]

# Column order of the results table / CSV.
gru_result_columns = ['loss_rate', 'dropout', 'hidden_size', 'optimizer',  'test_acc', 'test_f1', 'test_recall', 'test_precision', 'train_acc', 'train_loss', 'validation_acc', 'validation_loss', 'training_time']

# One dict per trained configuration. The DataFrame is built once after the loop
# (DataFrame.append was removed in pandas 2.0).
gru_records = []

for rate in lr:
  for d in dropout:
    for hid in hidden_size:
      for optim_name in optimizers:
        start_time = time.time()
        rnn_model = GRURNN(output_size, hid, vocab_size, embedding_length, num_layers, d)
        rnn_model.to(device)
        train_loss, train_acc, validation_loss, validation_acc, best_validation_loss, chosen_train_loss, chosen_validation_acc, chosen_train_acc = train(rnn_model, device, train_batches, valid_batches, epochs, learning_rate = rate, optim = optim_name)
        train_time = time.time() - start_time
        f1, test_acc, rec, prec, model = metric_test(rnn_model, device, test_batches)

        # NOTE(review): the file name depends only on the optimizer, so configurations
        # sharing an optimizer overwrite each other's saved model.
        model_file = '/content/drive/Shareddrives/519 Project/Data/Final Results/' + optim_name + '_All_GRU_neut_model.pt'
        torch.save(model.state_dict(), model_file)
        
        print('\n---------------------------------------------')
        print('%s, LR OF %.4f, DROPOUT OF %.2f RESULTS' % (optim_name, rate, d))
        print("--- Time taken to train = %s seconds ---" % (train_time))
        print('TRAIN ACC: %.4f, TRAIN LOSS: %.4f' % (chosen_train_acc, chosen_train_loss))
        print('VALIDATION ACC: %.4f, VALIDATION LOSS: %.4f' % (chosen_validation_acc, best_validation_loss))
        print('TEST ACC: %.4f, F1: %.4f, RECALL: %.4f, PRECISION: %.4f' % (test_acc, f1, rec, prec))
        print('---------------------------------------------\n')
        
        gru_records.append({'loss_rate': rate,
                            'dropout': d,
                            'hidden_size': hid,
                            'optimizer': optim_name,
                            'train_acc' : chosen_train_acc,
                            'train_loss': chosen_train_loss,
                            'validation_acc': chosen_validation_acc,
                            'validation_loss': best_validation_loss,
                            'test_acc': test_acc,
                            'test_f1': f1,
                            'test_recall': rec,
                            'test_precision': prec,
                            'training_time': train_time})

All_Final_GRU_neut_Results = pd.DataFrame(gru_records, columns = gru_result_columns)

# Report the configuration with the highest validation accuracy and persist the full table.
print('\n-----Best Model-----\n')
display(All_Final_GRU_neut_Results.sort_values('validation_acc', ascending=False).iloc[0])
All_Final_GRU_neut_Results.to_csv('/content/drive/Shareddrives/519 Project/Data/Final Results/All_Final_GRU_neut_Results.csv')

In [None]:
%%time


# Grid run in this cell: every combination of the lists below is trained once.
lr = [.0002]
optimizers = ['Adam']
num_layers = 2
output_size = 2
hidden_size = [300] 
embedding_length = 200
epochs = 20
dropout = [0]

# Column order of the results table / CSV.
bilstm_result_columns = ['loss_rate', 'dropout', 'hidden_size', 'optimizer',  'test_acc', 'test_f1', 'test_recall', 'test_precision', 'train_acc', 'train_loss', 'validation_acc', 'validation_loss', 'training_time']

# One dict per trained configuration. The DataFrame is built once after the loop
# (DataFrame.append was removed in pandas 2.0).
bilstm_records = []

for rate in lr:
  for d in dropout:
    for hid in hidden_size:
      for optim_name in optimizers:
        start_time = time.time()
        rnn_model = BiLSTM(output_size, hid, vocab_size, embedding_length, num_layers, d)
        rnn_model.to(device)
        train_loss, train_acc, validation_loss, validation_acc, best_validation_loss, chosen_train_loss, chosen_validation_acc, chosen_train_acc = train(rnn_model, device, train_batches, valid_batches, epochs, learning_rate = rate, optim = optim_name)
        train_time = time.time() - start_time
        f1, test_acc, rec, prec, model = metric_test(rnn_model, device, test_batches)

        # NOTE(review): the file name depends only on the optimizer, so configurations
        # sharing an optimizer overwrite each other's saved model.
        model_file = '/content/drive/Shareddrives/519 Project/Data/Final Results/' + optim_name + '_All_BiLSTM_neut_model.pt'
        torch.save(model.state_dict(), model_file)
        
        print('\n---------------------------------------------')
        print('%s, LR OF %.4f, DROPOUT OF %.2f RESULTS' % (optim_name, rate, d))
        print("--- Time taken to train = %s seconds ---" % (train_time))
        print('TRAIN ACC: %.4f, TRAIN LOSS: %.4f' % (chosen_train_acc, chosen_train_loss))
        print('VALIDATION ACC: %.4f, VALIDATION LOSS: %.4f' % (chosen_validation_acc, best_validation_loss))
        print('TEST ACC: %.4f, F1: %.4f, RECALL: %.4f, PRECISION: %.4f' % (test_acc, f1, rec, prec))
        print('---------------------------------------------\n')
        
        bilstm_records.append({'loss_rate': rate,
                               'dropout': d,
                               'hidden_size': hid,
                               'optimizer': optim_name,
                               'train_acc' : chosen_train_acc,
                               'train_loss': chosen_train_loss,
                               'validation_acc': chosen_validation_acc,
                               'validation_loss': best_validation_loss,
                               'test_acc': test_acc,
                               'test_f1': f1,
                               'test_recall': rec,
                               'test_precision': prec,
                               'training_time': train_time})

All_Final_BiLSTM_neut_Results = pd.DataFrame(bilstm_records, columns = bilstm_result_columns)

# Report the configuration with the highest validation accuracy and persist the full table.
print('\n-----Best Model-----\n')
display(All_Final_BiLSTM_neut_Results.sort_values('validation_acc', ascending=False).iloc[0])
All_Final_BiLSTM_neut_Results.to_csv('/content/drive/Shareddrives/519 Project/Data/Final Results/All_Final_BiLSTM_neut_Results.csv')

Load/display downsampled hyperparam training results

In [None]:
# Load the result tables saved under "Subsample Results" for the GRU and BiLSTM models.
All_Final_GRU_neut_Results_small = pd.read_csv('/content/drive/Shareddrives/519 Project/Data/Subsample Results/all_small_GRU_neutrality_results.csv')
All_Final_BiLSTM_neut_Results_small = pd.read_csv('/content/drive/Shareddrives/519 Project/Data/Subsample Results/all_small_BiLSTM_neutrality_results.csv')

In [None]:
# For each optimizer, show the sub-sampled BiLSTM run with the highest validation accuracy.
for _optimizer_name in ('Madgrad', 'Adam'):
    _runs = All_Final_BiLSTM_neut_Results_small[All_Final_BiLSTM_neut_Results_small['optimizer'] == _optimizer_name]
    display(_runs.sort_values('validation_acc', ascending=False).iloc[0])