In [None]:
import collections
import gensim
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import re
import string
import torch
import torch.nn as nn
import torch.optim as optim
from google.colab import drive
from sklearn.model_selection import ParameterGrid

In [None]:
# Mount Google Drive so outputs survive the Colab session
drive.mount('/content/drive')
# Folder prefix for everything this notebook writes or reads back: later cells build
# file names as f"{path}..." (results CSV, saved models, plots), so the trailing
# slash is required.
path = "/content/drive/My Drive/UCL/Modules/DL/assignment2/"

# Load Word2Vec Embedding

In [None]:
# Download the GoogleNews word2vec archive into the current working directory
# (the next statement loads it by its bare file name).
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

# Load Google's pre-trained Word2Vec model
# binary=True selects the binary word2vec file format; limit=1000000 caps how many
# vectors are read from the file.
w2v = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True, limit=1000000)

# Normalise the vectors
# NOTE(review): init_sims (and the w2v.vocab attribute used by the token lookup
# further down) look like gensim 3.x APIs - confirm the gensim version this
# notebook is run with.
w2v.init_sims(replace=True)

In [None]:
# Lookup table: vocabulary entry -> position of that entry when iterating w2v.vocab
token2index_map = {
    vocab_entry: position for position, vocab_entry in enumerate(w2v.vocab)
}

def token2index(token):
  """
  Look up the word2vec embedding index of a token.
  Input:
  - token = a string
  Output:
  - The index stored for the token in token2index_map. Tokens that are not in
    the map, and tokens containing an underscore, get the index stored for "UNK".
  """
  # Fallback index: whatever the map holds for the literal token "UNK"
  unk_index = token2index_map.get("UNK")

  # Underscored (multi-word) tokens are always treated as unknown,
  # to match embeddings in Julia starter code
  if "_" not in token:
    return token2index_map.get(token, unk_index)

  return unk_index

# Load Stanford sentiment treebank dataset

In [None]:
# Download and unpack the Stanford Sentiment Treebank (train/dev/test trees in PTB format)
# Site for more information on data set: https://nlp.stanford.edu/sentiment/
# NOTE(review): the loading cell below reads /content/trees/{train,dev,test}.txt, so
# this is presumably meant to run with /content as the working directory - confirm.

!wget https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
!unzip trainDevTestTrees_PTB.zip

In [None]:
# Parse the tree into sentences and classifications

def parse_tree(sentence_list_tree):
    """
    Strip the tree structure from sentiment-tree lines and normalise the text.

    Input:
    - sentence_list_tree: list of strings, one bracketed tree per entry, e.g.
      "(3 (2 A) (4 (3 good) (2 movie)))". The character at index 1 of each
      (stripped) entry is read as the sentence-level sentiment class.
      Empty and whitespace-only entries are skipped. The list passed in is
      NOT modified.
    Output:
    - x_list: list of cleaned, lower-cased sentences
    - y_list: list of int sentiment classes, aligned with x_list
    """
    # Compile the loop-invariant patterns once instead of once per sentence.
    digits_and_parens = re.compile(r'[0-9]|\(|\)')
    space_before_punctuation = re.compile(r"\s([?.!;\,](?:\s|$))")
    space_before_apostrophe = re.compile(r" '\b")
    space_before_nt = re.compile(r" n't\b")
    space_before_ellipsis = re.compile(r" \.\.\.")
    # Raw string with both brackets escaped: the same character set as before,
    # without relying on the invalid "\]" escape of a non-raw string literal.
    # Note that "." , "'" and "-" are deliberately absent from this set, exactly
    # as in the original pattern.
    other_punctuation = re.compile(r'[!"#$%&()*+,/:;<=>?@\[\]^_`{|}~]')

    # Fold accented letters onto plain english letters in a single pass
    accent_table = str.maketrans({
        "à": "a", "á": "a", "â": "a", "ã": "a",
        "æ": "ae",
        "ç": "c",
        "è": "e", "é": "e",
        "í": "i", "ï": "i",
        "ñ": "n",
        "ó": "o", "ô": "o", "ö": "o",
        "û": "u", "ü": "u",
    })

    # initialise lists to store x (the sentence), y (the sentiment)
    x_list = []
    y_list = []
    for raw_line in sentence_list_tree:
        sentence = raw_line.strip()
        # skip blank lines (e.g. the empty string left after a trailing newline)
        if not sentence:
            continue

        y_list.append(int(sentence[1])) # y = sentiment class of sentence

        # remove digits and parentheses
        sentence = digits_and_parens.sub('', sentence)
        # remove -RRB- and -LRB- (kept as two sequential passes, as before)
        sentence = sentence.replace("-RRB-", "")
        sentence = sentence.replace("-LRB-", "")
        # remove duplicate spaces
        sentence = " ".join(sentence.split())
        # remove spaces before some types of punctuation
        sentence = space_before_punctuation.sub(r'\1', sentence)
        # remove spaces before apostrophes
        sentence = space_before_apostrophe.sub("'", sentence)
        # remove spaces before n't
        sentence = space_before_nt.sub("n't", sentence)
        # remove space before ...
        sentence = space_before_ellipsis.sub("...", sentence)
        # make all lower case
        sentence = sentence.lower()
        # remove slashes (replace with space)
        sentence = sentence.replace("\\", " ")

        # remove dashes (replace with space)
        sentence = sentence.replace("-", " ")
        # remove all other punctuation
        sentence = other_punctuation.sub("", sentence)
        # remove double quotation marks
        sentence = sentence.replace("''", "")
        # remove single quotation marks (space beforehand so as to not replace apostrophes)
        sentence = sentence.replace(" '", " ")
        # remove duplicate spaces
        sentence = " ".join(sentence.split())

        # replace letters with accents, with standard non-accented english letters
        sentence = sentence.translate(accent_table)

        x_list.append(sentence)

    return x_list, y_list

In [None]:
# Load and parse string data
# Each split is read inside a `with` block so the file handle is closed as soon
# as its text is in memory, and the encoding is stated explicitly instead of
# depending on the runtime's locale.

with open("/content/trees/train.txt", "r", encoding="utf-8") as train_file:
    train_tree = train_file.read()
train_tree_sentence_list = train_tree.split("\n")
x_train_list, y_train_list = parse_tree(train_tree_sentence_list)
train_length = len(x_train_list)

with open("/content/trees/dev.txt", "r", encoding="utf-8") as dev_file:
    dev_tree = dev_file.read()
dev_tree_sentence_list = dev_tree.split("\n")
x_dev_list, y_dev_list = parse_tree(dev_tree_sentence_list)
dev_length = len(x_dev_list)

with open("/content/trees/test.txt", "r", encoding="utf-8") as test_file:
    test_tree = test_file.read()
test_tree_sentence_list = test_tree.split("\n")
x_test_list, y_test_list = parse_tree(test_tree_sentence_list)
test_length = len(x_test_list)

# Data processing

In [None]:
class SentenceBatcher:
    """
    Iterable of mini-batches in which every sentence has the same token count.

    Sentences are split on whitespace, mapped to embedding indices with
    token2index, and grouped by length. Each length group gets its own shuffled
    DataLoader, so a batch never needs padding.

    Input:
    - inputs: iterable of sentence strings
    - labels: iterable aligned with inputs (one label per sentence)
    - batch_size: maximum number of sentences per batch
    - drop_last: passed through to every DataLoader

    Iterating yields whatever the DataLoaders yield for [indices, label] samples
    (unpacked by callers as `inputs, labels`). Batches from the different length
    groups are interleaved at random until every group is exhausted.
    """
    def __init__(self, inputs, labels, batch_size=128, drop_last=False):
        # Tokenise the input sentences.
        # Kept as plain Python lists: sentences differ in length, and building a
        # NumPy array from ragged nested sequences raises on NumPy >= 1.24.
        tokenised_inputs = [sentence.split() for sentence in inputs]

        # Map the tokens to word2vec indices (explicit integer dtype so that an
        # empty sentence does not silently become a float tensor)
        indices_mapping = [
            torch.tensor([token2index(token) for token in tokens], dtype=torch.long)
            for tokens in tokenised_inputs
        ]

        # Store sentences by length
        self.sentences_by_length = {}
        for indices, label in zip(indices_mapping, labels):
            length = indices.shape[0]
            self.sentences_by_length.setdefault(length, []).append([indices, label])

        #  Create a DataLoader for each set of sentences of the same length
        self.loaders = {
            length: torch.utils.data.DataLoader(
                sentences,
                batch_size=batch_size,
                shuffle=True,
                drop_last=drop_last)
            for length, sentences in self.sentences_by_length.items()
        }

    def __iter__(self):
        # Create an iterator for each sentence length
        iters = [iter(loader) for loader in self.loaders.values()]
        while iters:
            # Get a random iterator; drop it once it has no batches left
            chosen = random.choice(iters)
            try:
                yield next(chosen)
            except StopIteration:
                iters.remove(chosen)

# Data exploration

In [None]:
# Display example data for training set

start_index = 0
end_index = 30

for sentiment, sentence in zip(y_train_list[start_index:end_index], x_train_list[start_index:end_index]):
  print(f"y = {sentiment} | x = ({sentence})")

# Number of examples in each dataset

print(f"Number of sentences training set: {len(x_train_list)}")
print(f"Number of sentences validation set: {len(x_dev_list)}")
print(f"Number of sentences test set: {len(x_test_list)}")

from collections import Counter


def plot_count_distribution(counts, title, xlabel, filename):
  """
  Draw a bar chart of a Counter, save it to `filename` and show it.
  Input:
  - counts = Counter (or dict) mapping a category to its number of occurrences
  - title = figure title
  - xlabel = label of the category axis
  - filename = where the figure is written (relative to the working directory)
  """
  plt.bar(counts.keys(), counts.values())
  plt.title(title)
  plt.ylabel('Count')
  plt.xlabel(xlabel)
  plt.savefig(filename, dpi=300)
  plt.show()


# Plot distribution of sentiment classifications by dataset
# Every figure gets its own file name; previously all three were written to
# "sentence_lengths.png", so only the last plot survived on disk.

train_counter = Counter(y_train_list)
plot_count_distribution(train_counter,
                        "Number of examples per classification (training set)",
                        'Classification',
                        "class_distribution_train.png")

dev_counter = Counter(y_dev_list)
plot_count_distribution(dev_counter,
                        "Number of examples per classification (validation set)",
                        'Classification',
                        "class_distribution_dev.png")

# Plot distribution of sentence lengths (training set)

train_sentence_lengths = Counter(len(sentence.split()) for sentence in x_train_list)
plot_count_distribution(train_sentence_lengths,
                        "Number of tokens per sentence",
                        'Number of tokens',
                        "sentence_lengths.png")

# Models

In [None]:
class RNN(nn.Module):
  """
  Sentence classifier: pre-trained word2vec embeddings -> recurrent layers ->
  one or two dense layers producing one score per class.

  Input to the constructor:
  - embedding_size = input size of the recurrent layers (must match the width of w2v.vectors)
  - hidden_size = size of the recurrent hidden state
  - num_layers = number of stacked recurrent layers
  - num_classes = number of output scores
  - cell_type = 'VanillaRNN', 'GRU' or 'LSTM'
  - num_dense_layers = 1 or 2
  - hidden_to_dense_size_ratio = with 2 dense layers, the first one has
    hidden_size // hidden_to_dense_size_ratio units
  - dropout = dropout rate, passed to the recurrent layers and (with 2 dense
    layers) applied between the dense layers
  - fixed_embeddings = if True the embedding weights are not trained

  Raises ValueError for an unknown cell_type or an unsupported num_dense_layers.
  Previously a bad cell_type only failed inside forward(), and a bad
  num_dense_layers silently returned the raw hidden state instead of class scores.
  """

  CELL_TYPES = ('VanillaRNN', 'GRU', 'LSTM')

  def __init__(self, embedding_size, hidden_size, num_layers, num_classes, cell_type, num_dense_layers, hidden_to_dense_size_ratio=2, dropout=0, fixed_embeddings=True):
    super(RNN, self).__init__()

    if cell_type not in self.CELL_TYPES:
      raise ValueError(f"cell_type must be one of {self.CELL_TYPES}, got {cell_type!r}")
    if num_dense_layers not in (1, 2):
      raise ValueError(f"num_dense_layers must be 1 or 2, got {num_dense_layers!r}")

    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.cell_type = cell_type
    self.num_dense_layers = num_dense_layers

    # freeze=True is equivalent to setting weight.requires_grad = False
    self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(w2v.vectors), freeze=fixed_embeddings)

    # Only one of these will be used in forward statement (as specified by hyperparam cell_type for current run).
    # All three are still created so that the module's attributes and state_dict keys stay as they were.
    self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first = True)
    self.gru = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first = True)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first = True)

    # Number of these linear layers that gets used depends on hyperparam num_dense_layers
    if num_dense_layers == 1:
      self.linear = nn.Linear(hidden_size, num_classes)
    else:
      self.linear1 = nn.Linear(hidden_size, hidden_size//hidden_to_dense_size_ratio)
      self.linear2 = nn.Linear(hidden_size//hidden_to_dense_size_ratio, num_classes)
      self.dropout = nn.Dropout(dropout)

  def forward(self, x_batch):
    """
    Input:
    - x_batch = integer tensor of embedding indices; the recurrent layers are
      batch_first, so it is read as (batch, sequence)
    Output:
    - tensor with one row of num_classes scores per sentence
    """
    # Get the word embeddings
    embedded = self.embedding(x_batch)

    # Pick the recurrent module selected by cell_type
    if self.cell_type == 'VanillaRNN':
      recurrent = self.rnn
    elif self.cell_type == 'GRU':
      recurrent = self.gru
    elif self.cell_type == 'LSTM':
      recurrent = self.lstm
    else:
      raise ValueError(f"cell_type must be one of {self.CELL_TYPES}, got {self.cell_type!r}")
    sequence_out, _ = recurrent(embedded)

    # Extract the output at the final time step and feed into the dense layer(s)
    out = sequence_out[:, -1, :]
    if self.num_dense_layers == 1:
      out = self.linear(out)
    elif self.num_dense_layers == 2:
      out = self.linear1(out)
      out = torch.relu(out)
      out = self.dropout(out)
      out = self.linear2(out)
    return out

# Model, loss, optimizer, and training loop

In [None]:
# Hyperparameters to grid search over
hyperparam_grid = {'cell_type': ['GRU'],
                   'num_layers': [3],
                   'num_dense_layers': [2],
                   'hidden_size': [128],
                   'hidden_to_dense_size_ratio': [2],
                   'learning_rate': [0.001],
                   'dropout': [0.25],
                   'fixed_embeddings': [True],
                   'batch_size': [128]
}

# Results of the grid search: one record per run, rebuilt into a dataframe after
# every run (DataFrame.append no longer exists in pandas >= 2.0)
results_records = []
results_df = pd.DataFrame()

# Range of epochs
MIN_EPOCHS = 20
MAX_EPOCHS = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def run_epoch(model, loader, criterion, device, optimizer=None):
  """
  Run one full pass over `loader`.
  Input:
  - model, criterion = network and loss function
  - loader = iterable yielding (inputs, labels) batches
  - device = device the batches are moved to
  - optimizer = if given, the model is put in train mode and updated after each
    batch; if None, the model is put in eval mode and gradients are disabled
  Output:
  - (mean loss per sentence, accuracy) over all sentences seen in this pass
  """
  training = optimizer is not None
  model.train(training)

  loss_sum = 0.0
  corrects = 0
  seen = 0

  with torch.set_grad_enabled(training):
    for inputs, labels in loader:
      # Send to GPU
      inputs = inputs.to(device)
      labels = labels.to(device)

      if training:
        optimizer.zero_grad()

      # Forward pass
      predicted = model(inputs)
      loss = criterion(predicted, labels)

      if training:
        # Backward pass and update
        loss.backward()
        optimizer.step()

      # criterion returns the batch mean; batches differ in size, so weight it
      # by the batch size to obtain a true per-sentence mean at the end
      batch_count = labels.size(0)
      loss_sum += loss.item() * batch_count
      corrects += (predicted.argmax(dim=1) == labels).sum().item()
      seen += batch_count

  seen = max(seen, 1)  # an empty loader gives 0 loss / 0 accuracy, not a ZeroDivisionError
  return loss_sum / seen, corrects / seen


def plot_history(metric_name, epochs, train_values, dev_values, legend_loc, title_details, file_name):
  """
  Plot a training curve (black) against a validation curve (red), save and show it.
  Input:
  - metric_name = 'Loss' or 'Accuracy' (used for the y label and the title)
  - epochs = x values
  - train_values, dev_values = one value per epoch
  - legend_loc = matplotlib legend location
  - title_details = text placed in the title after the metric name
  - file_name = where the figure is saved
  """
  plt.plot(epochs, train_values, color='k', linestyle='-')
  plt.plot(epochs, dev_values, color='r', linestyle='-')
  plt.legend(['Training', 'Validation'], loc=legend_loc)
  plt.ylabel(metric_name, color='k')
  plt.xlabel('Epoch', color='k')
  plt.title(f"{metric_name} plot {title_details}", color='k')
  plt.savefig(file_name, dpi=300, bbox_inches = "tight")
  plt.show()


for run_index, hyperparams in enumerate(ParameterGrid(hyperparam_grid)):

  # Get training and validation data loaders
  train_loader = SentenceBatcher(x_train_list, y_train_list, hyperparams['batch_size'], drop_last=False)
  dev_loader = SentenceBatcher(x_dev_list, y_dev_list, hyperparams['batch_size'], drop_last=False)

  # Initialise model
  model = RNN(embedding_size=300,
              hidden_size=hyperparams['hidden_size'],
              num_layers=hyperparams['num_layers'],
              num_classes=5,
              cell_type=hyperparams['cell_type'],
              num_dense_layers=hyperparams['num_dense_layers'],
              hidden_to_dense_size_ratio=hyperparams['hidden_to_dense_size_ratio'],
              dropout=hyperparams['dropout'],
              fixed_embeddings=hyperparams['fixed_embeddings'])
  model.to(device)

  # Loss & optimizer
  criterion = nn.CrossEntropyLoss() # note that this will apply the softmax for us
  optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams['learning_rate'])

  # Parameters that are not trained never change, so they are left out of the
  # best-epoch snapshot (this keeps the large embedding matrix from being copied)
  frozen_names = {name for name, param in model.named_parameters() if not param.requires_grad}

  # Training history
  training_accuracy_list = []
  training_loss_list = []
  dev_accuracy_list = []
  dev_loss_list = []

  best_epoch = 0
  best_train_accuracy = 0
  best_dev_accuracy = 0
  best_dev_loss = float('nan')
  best_loss = float('nan')
  best_state = None

  converged = False
  epoch = 0
  epochs_to_plot = []

  while (not converged) and (epoch <= MAX_EPOCHS):

    # One training pass, then one evaluation pass on the validation set
    training_loss, training_accuracy = run_epoch(model, train_loader, criterion, device, optimizer)
    dev_loss_avg, dev_accuracy = run_epoch(model, dev_loader, criterion, device)

    training_accuracy_list.append(training_accuracy)
    training_loss_list.append(training_loss)
    dev_accuracy_list.append(dev_accuracy)
    dev_loss_list.append(dev_loss_avg)

    # Epoch means are reported (not the loss of whichever batch happened to come last)
    print(f"Epoch: {epoch}, train_loss: {training_loss:.4f}, train_acc: {training_accuracy:.4f}, val_loss: {dev_loss_avg:.4f}, val_acc: {dev_accuracy:.4f}")

    epochs_to_plot.append(epoch)

    # Store the current results if it is the highest validation accuracy so far
    if dev_accuracy > best_dev_accuracy:
      best_dev_accuracy = dev_accuracy
      best_dev_loss = dev_loss_avg
      best_train_accuracy = training_accuracy
      best_loss = training_loss
      best_epoch = epoch
      # Snapshot the weights: `best_model = model` would only alias the model,
      # which keeps training, so the "best" model would end up being the last one
      best_state = {name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                    if name not in frozen_names}

    # Early stopping criteria: mean validation loss of the last 15 epochs is
    # higher than that of the 15 epochs before them
    if epoch >= MIN_EPOCHS:
      if (np.mean(dev_loss_list[-15:]) - np.mean(dev_loss_list[-30:-15])) > 0:
        converged = True
        print("Model Converged")

    epoch += 1

  # Roll the model back to its best epoch (strict=False: frozen parameters were
  # not snapshotted and are still in place)
  if best_state is not None:
    model.load_state_dict(best_state, strict=False)
  model.eval()
  best_model = model

  hyperparams['model_index'] = run_index + 1
  hyperparams['best_epoch'] = best_epoch
  hyperparams['best_validation_accuracy'] = best_dev_accuracy
  hyperparams['associated_validation_loss'] = best_dev_loss
  hyperparams['associated_training_accuracy'] = best_train_accuracy
  hyperparams['associated_training_loss'] = best_loss

  results_records.append(dict(hyperparams))
  results_df = pd.DataFrame(results_records)
  results_df = results_df.sort_values("best_validation_accuracy", ascending=False).round(4)
  results_df.to_csv(f"{path}RESULTS_TABLE_{hyperparams['cell_type']}.csv")

  print(f"\n RUN_INDEX: {run_index + 1} \n")
  print(hyperparams)

  # Save the model
  run_stem = f"{path}{hyperparams['cell_type']}_{best_dev_accuracy:.4f}_devacc_{best_train_accuracy:.4f}_trainacc"
  torch.save(best_model, f"{run_stem}.pth")

  # Text shared by both plot titles
  title_details = (
      f"({hyperparams['cell_type']}, Val Accuracy {best_dev_accuracy:.4f})\n\n"
      f"Hidden size: {hyperparams['hidden_size']}, Recurrent layers: {hyperparams['num_layers']}, Dense layers: {hyperparams['num_dense_layers']},\n"
      f"Hidden to dense size ratio: {hyperparams['hidden_to_dense_size_ratio']}, LR: {hyperparams['learning_rate']}, Dropout: {hyperparams['dropout']},\n"
      f"Fixed embeddings: {hyperparams['fixed_embeddings']}, Batch size: {hyperparams['batch_size']}"
  )

  # Loss plot
  plot_history('Loss', epochs_to_plot, training_loss_list, dev_loss_list,
               'upper right', title_details, f"{run_stem}_LOSSPLOT.png")

  # Accuracy plot
  plot_history('Accuracy', epochs_to_plot, training_accuracy_list, dev_accuracy_list,
               'lower right', title_details, f"{run_stem}_ACCPLOT.png")

# Evaluate on test dataset

In [None]:
# Load best model (highest validation accuracy) for further evaluation

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location lets a model that was saved from a GPU session load on a CPU-only runtime.
# NOTE(review): torch.load unpickles a whole model object here - only load files
# you created yourself. Recent PyTorch releases may additionally require
# weights_only=False for full-model files; confirm against the installed version.
best_model = torch.load(f"{path}GRU_0.4505_devacc_0.4896_trainacc.pth", map_location=device)
best_model.to(device)
best_model.eval()

# Get test data loader
# NOTE: `hyperparams` is the dict left over from the last run of the training cell
test_loader = SentenceBatcher(x_test_list, y_test_list, hyperparams['batch_size'], drop_last=False)

# Loss
criterion = nn.CrossEntropyLoss() # note that this will apply the softmax for us

# Final performance of best model on test set

test_running_loss = 0.0
test_running_corrects = 0

# Evaluation only: no gradients are needed
with torch.no_grad():
    for test_inputs, test_labels in test_loader:

        # Send to GPU
        test_inputs = test_inputs.to(device)
        test_labels = test_labels.to(device)

        # Forward pass
        test_predicted = best_model(test_inputs)

        # Test loss: criterion returns the batch mean, so weight it by the batch
        # size; dividing the total by test_length then gives the mean loss per sentence
        loss_test = criterion(test_predicted, test_labels)
        test_running_loss += loss_test.item() * test_labels.size(0)

        # Test accuracy
        _, test_pred = torch.max(test_predicted, 1)
        test_running_corrects += (test_pred == test_labels).sum().item()

test_loss_avg = test_running_loss/test_length
test_accuracy_avg = test_running_corrects/test_length

# print results
print(f"Final test accuracy: {test_accuracy_avg}")
print(f"Final test loss: {test_loss_avg}")

# Online review classifications

In [None]:
# Online reviews, pre-processed as in parse_tree
x_online_list = [
    "to this day this is still my favorite pixar film the animation is stellar its heartwarming funny and proves that pixar movies are always bound to be great except for cars but thats a different story this has a shot at the title best movie of the century", # https://www.imdb.com/review/rw5485122
    "this is just a wonderful telling of charles dickens great christmas story the story being so good you would have to try had to make a bad movie out of it", # https://www.imdb.com/review/rw0310420
    "honestly i really should be giving this film a lower score somehow i enjoyed it quite a bit even in the face of the many fundamental issues which is a testament to the strength of the best sequences", # https://www.imdb.com/review/rw4075393
    "but the worst thing of all with this film is the mangling of austen's dialogue and the atrocious modern dialogue austen's dialogue needs no assistance from a writer who thinks he she can write like austen", # https://www.imdb.com/review/rw1213354
    "hours of boredom half the audience fell asleep including most of the kiddies beautiful to look at but that does not make for a interesting film" # https://www.imdb.com/review/rw0717356
]
# SentenceBatcher will randomly order the reviews, so we keep track of their indices
# (the indices are passed in the `labels` slot and come back alongside each batch)
x_online_index = range(len(x_online_list))

# Get online data loader
online_loader = SentenceBatcher(x_online_list, x_online_index, batch_size=128, drop_last=False)

# Evaluate model on online reviews
# Inference only: make sure dropout is off and skip gradient tracking
best_model.eval()

with torch.no_grad():
  for online_inputs, online_index in online_loader:
    online_inputs = online_inputs.to(device)

    online_predicted = best_model(online_inputs)
    _, online_pred = torch.max(online_predicted, 1)

    # .tolist() turns the tensors into plain ints, so the predicted class prints
    # as a number rather than as a tensor repr
    for index, pred in zip(online_index.tolist(), online_pred.tolist()):
      print(f"Review: {x_online_list[index]}")
      print(f"Prediction: {pred}")