In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, confusion_matrix, ConfusionMatrixDisplay
from copy import deepcopy
import re
import string
import gensim.models.keyedvectors as word2vec 
import gensim.downloader as api
import gensim.models
import tempfile
from gensim.models import KeyedVectors
pd.set_option('display.max_colwidth', None)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Mount Google Drive (Colab only) so files under /content/drive can be read by later cells
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def decontracted(phrase):
    """
    Expand English contractions with an ordered list of regex substitutions.
    Source: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

    The rules are applied top to bottom. The two irregular forms are listed
    first so that the generic "n't" rule never sees them.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    for pattern, expansion in substitutions:
        phrase = re.sub(pattern, expansion, phrase)

    return phrase

In [5]:
def clean_text(review):
    """
    Normalise a raw review string: lower-case it, expand contractions, then
    blank out numbers and punctuation.
    In milestone 4, we found that this minimal preprocessing performed the best.
    """
    # Lower-case, then expand contractions
    text = decontracted(review.lower())

    # Each pattern is replaced by a single space, in this order:
    #   runs of digits, any character that is neither a word character nor
    #   whitespace, and finally underscores (which \w treats as word characters).
    for pattern in (r'\d+', r'[^\w\s]', r'_'):
        text = re.sub(pattern, ' ', text)

    return text

def tokenize_text(review, remove_stopword=False):
    """
    Tokenize review after text cleaning using NLTK's word tokenizer

    Args:
        review: the (already cleaned) review text.
        remove_stopword: when True, tokens that appear in NLTK's English
            stop-word list are dropped. The default (False) returns every
            token produced by the tokenizer.

    Returns:
        A list of token strings.
    """
    tokens = nltk.word_tokenize(review)

    if remove_stopword:
        # Build the set once per call so each membership test is O(1)
        stop_words = set(stopwords.words('english'))
        tokens = [tok for tok in tokens if tok.lower() not in stop_words]

    return tokens

In [6]:
def calculate_word_occurrences(df, min_count=2):
    """
    Count how often each token occurs in the 'review_tokens' column of df and
    drop the infrequent ones.

    Args:
        df: DataFrame with a 'review_tokens' column whose cells are iterables
            of tokens. The frame is only read, never modified.
        min_count: minimum number of occurrences a word needs in order to be
            kept (default 2, i.e. words seen only once are removed).

    Returns:
        A Counter mapping word -> count for the words that were kept. Words
        keep the order in which they were first seen.
    """
    word_counts = Counter()
    for tokens in df['review_tokens']:
        word_counts.update(tokens)

    print(f"Number of words before: {len(word_counts)}")

    # Keep only words that are frequent enough; a dict comprehension preserves
    # first-seen order, which downstream code may rely on when assigning indices.
    word_counts = Counter({word: count for word, count in word_counts.items() if count >= min_count})

    print(f"Number of words after: {len(word_counts)}")

    return word_counts

In [7]:
def encode_review(review_tokens, word_to_idx, max_len):
    """
    Map each token in the review to its ID (using the word_to_idx dictionary), up to max_len tokens.
    If a review has > max_len tokens, it is truncated to the first max_len.
    If a review has < max_len tokens, the remainder is padded with zeros.

    Args:
        review_tokens: iterable of tokens for one review.
        word_to_idx: dict mapping token -> integer ID; must contain 'UNK',
            whose ID is used for tokens missing from the dict.
        max_len: length of the returned array.

    Returns:
        (encoded, length): an integer numpy array of shape (max_len,) and the
        number of real (non-padding) tokens it holds.

    Raises:
        KeyError: if word_to_idx has no 'UNK' entry.
    """
    # Look up the fallback ID once instead of once per token
    unk_id = word_to_idx['UNK']

    # Truncate first so tokens beyond max_len are never looked up
    kept_tokens = list(review_tokens)[:max_len]
    length = len(kept_tokens)

    # Start from all zeros (the padding ID) and fill in the real tokens
    encoded = np.zeros(max_len, dtype=int)
    encoded[:length] = [word_to_idx.get(tok, unk_id) for tok in kept_tokens]

    return encoded, length

In [8]:
class ReviewsDataset(Dataset):
    """
    Custom PyTorch Dataset pairing encoded reviews with their labels.

    Each entry of X is indexed as (numpy array, length); Y holds the label at
    the same position.
    """
    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # One sample per label
        return len(self.y)

    def __getitem__(self, idx):
        sample = self.X[idx]
        encoded, length = sample[0], sample[1]
        # Returned order: (token-id tensor, label, length)
        return torch.from_numpy(encoded), self.y[idx], length

In [9]:
def load_vectors(fname):
    """
    Load pretrained word2vec embeddings previously saved as gensim KeyedVectors.
    For convenience, the pretrained embeddings are downloaded and can be found
    in the word2vec folder
    """
    # mmap='r' asks gensim to memory-map the stored arrays read-only
    return KeyedVectors.load(fname, mmap='r')

def create_embedding_matrix(word_vecs, word_counts, emb_size=50):
    """
    Build the embedding table for the vocabulary in word_counts.

    Returns a 3-tuple:
        1. embedding matrix (float32, one row per vocabulary entry)
        2. numpy array of vocabulary words
        3. word_to_idx dictionary

    Index 0 is the padding token "" and index 1 is "UNK"; real words start at
    index 2 in the iteration order of word_counts.
    """
    # Two special entries come first: padding, then the unknown-word token
    words = ["", "UNK"] + list(word_counts)
    word_to_idx = {word: position for position, word in enumerate(words)}

    # Every row starts as zeros, so the padding row (index 0) needs no further work
    weights_matrix = np.zeros((len(words), emb_size), dtype='float32')

    # Random vector for unknown words
    weights_matrix[1] = np.random.normal(scale=0.6, size=(emb_size, ))

    for row, word in enumerate(words[2:], start=2):
        if word in word_vecs:
            # Pretrained vector is available
            weights_matrix[row] = word_vecs[word]
        else:
            # No pretrained vector: fall back to a random initialisation
            weights_matrix[row] = np.random.normal(scale=0.6, size=(emb_size, ))

    return weights_matrix, np.array(words), word_to_idx

In [10]:
class RNN(nn.Module):
    """
    The recurrent classifier. This can be configured to use either an LSTM or a GRU,
    and to be bidirectional or unidirectional.
    Reference for creating model: https://github.com/bentrevett/pytorch-sentiment-analysis

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        embedding_matrix: numpy array of shape (vocab_size, embedding_dim) used
            to initialise the embedding weights (which remain trainable).
        hidden_dim: hidden size of the recurrent layer.
        num_layers: number of stacked recurrent layers.
        bidirectional: whether the recurrent layer runs in both directions.
        dropout: dropout probability applied to the embeddings and to the
            final hidden state.
        rnn_model: 'lstm' (default) or 'gru'.

    Raises:
        ValueError: if rnn_model is neither 'lstm' nor 'gru'.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_dim, num_layers, bidirectional, dropout, rnn_model='lstm'):
        super().__init__()

        rnn_classes = {'lstm': nn.LSTM, 'gru': nn.GRU}
        if rnn_model not in rnn_classes:
            raise ValueError(f"rnn_model must be 'lstm' or 'gru', got {rnn_model!r}")

        # Remember the configuration so forward() does not depend on notebook globals
        self.rnn_model = rnn_model
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = True

        self.rnn = rnn_classes[rnn_model](embedding_dim, hidden_dim, batch_first=True, bidirectional=bidirectional, num_layers=num_layers)

        # A bidirectional layer yields two final hidden states that get concatenated
        self.linear = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 5)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, l):
        """
        Args:
            x: batch of padded token-ID sequences (batch first).
            l: true (unpadded) length of each sequence in the batch.

        Returns:
            Tensor of shape (batch, 5) with unnormalised class scores.
        """
        x = self.embedding(x)
        x = self.dropout(x)

        # pack_padded_sequence expects the lengths on the CPU
        lengths = l.cpu() if torch.is_tensor(l) else l
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        # An LSTM returns (h_n, c_n); a GRU returns h_n only
        if self.rnn_model == 'lstm':
            rnn_out, (ht, ct) = self.rnn(x_pack)
        else:
            rnn_out, ht = self.rnn(x_pack)

        if self.bidirectional:
            # The last two entries of h_n belong to the two directions of the top layer
            dense_outputs = self.dropout(torch.cat([ht[-1], ht[-2]], dim=-1))
        else:
            dense_outputs = self.dropout(ht[-1])

        outputs = self.linear(dense_outputs)
        return outputs

In [11]:
# Static variables
# clip_len: passed as max_len when reviews are encoded (longer reviews are truncated)
clip_len = 200
# emb_size: width of each row of the embedding matrix
emb_size = 300
# batch_size: number of samples per DataLoader batch
batch_size = 60

# IMPORTANT: If you need to rerun this notebook, change your base path to where the data files are located
base_path = '/content/drive/MyDrive/530 Project/'

In [12]:
# Read in the three provided CSV files
train = pd.read_csv(base_path + "data/train_data.csv")
val = pd.read_csv(base_path + "data/dev_data.csv")
test = pd.read_csv(base_path + "data/test_data.csv")

# Stack train, dev and test into one frame so preprocessing runs once over everything
data = pd.concat([train, val, test], axis=0, ignore_index=True)
data = data.rename(columns={'Summary': 'review_title'})

# Shift the 1-5 star ratings to 0-4 class indices
zero_numbering = {star: star - 1 for star in range(1, 6)}
data['rating'] = data['star_rating'].apply(lambda star: zero_numbering[star])

# Text preprocessing: missing titles/bodies become empty strings before concatenation
for text_col in ('review_title', 'review_body'):
    data[text_col] = data[text_col].fillna('')
data['review'] = data['review_title'] + ' ' + data['review_body']
data['clean_review'] = data['review'].apply(clean_text)

# Tokenize the cleaned text
data['review_tokens'] = data['clean_review'].apply(tokenize_text)

# Number of tokens per review after cleaning
data['review_length'] = data['review_tokens'].apply(len)

In [13]:
# Get word embeddings
# The vectors are read from a file stored directly under base_path.
# NOTE(review): the file name suggests the 300-dimensional Google News word2vec
# model — confirm its vector size matches emb_size.
vector_file = base_path + "word2vec-google-news-300"
word_vectors = load_vectors(vector_file)

In [14]:
# Count how often each word occurs (infrequent words are removed by the helper)
word_counts = calculate_word_occurrences(data)
# Size of the counted vocabulary only; the embedding matrix adds two special rows on top
vocab_size = len(word_counts)

# Build the embedding matrix, the vocabulary array and the word -> index lookup
weights_matrix, vocabulary, word_to_idx = create_embedding_matrix(word_vectors, word_counts, emb_size)

# Encode every tokenized review as an (id array, length) pair
data['encoded'] = data['review_tokens'].apply(encode_review, args=(word_to_idx, clip_len))

# Discard reviews that ended up with no tokens at all
empty_rows = data.index[data['review_length'] <= 0]
data = data.drop(empty_rows)

Number of words before: 64144
Number of words before: 39215


In [15]:
# Inputs are the (encoded review, length) pairs; targets are the 0-4 ratings
X = list(data['encoded'])
y = list(data['rating'])

# First carve off 20% as a hold-out set, then halve it into test and validation
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=1)

In [16]:
# Wrap each split in the custom Dataset class
train_ds, val_ds, test_ds = (
    ReviewsDataset(X_train, y_train),
    ReviewsDataset(X_val, y_val),
    ReviewsDataset(X_test, y_test),
)

# One DataLoader per split; train and validation batches are shuffled, test batches are not
train_dl, val_dl, test_dl = (
    DataLoader(train_ds, batch_size=batch_size, shuffle=True),
    DataLoader(val_ds, batch_size=batch_size, shuffle=True),
    DataLoader(test_ds, batch_size=batch_size, shuffle=False),
)

In [17]:
def calculate_num_correct(output, y, return_preds=False):
    """
    Count how many rows of `output` have their highest score at the index given by `y`.

    When return_preds is True the predicted class indices are also written to
    'output.csv' (one integer per line). The return value is always the count.
    """
    # torch.max along dim 1 yields (values, indices); the indices are the predicted classes
    _, preds = torch.max(output, 1)

    if return_preds:
        np.savetxt('output.csv', preds.data.cpu().numpy(), fmt='%i')

    return (preds == y).sum().item()

In [18]:
def evaluate_model(model, dataloader, return_preds=False, output_file='y_pred.csv', ytrue_file='y_true.csv'):
    """
    Evaluate the model on the development/test dataset.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the device the model's parameters live on.

    Args:
        model: module called as model(encoded_batch, lengths) that returns class scores.
        dataloader: iterable yielding (encoded_batch, labels, lengths) triples.
        return_preds: when True, predictions and true labels are written to
            output_file and ytrue_file (one integer per line).
        output_file: path for the predicted labels.
        ytrue_file: path for the true labels.

    Returns:
        (loss, accuracy): sample-weighted mean cross-entropy loss and the
        fraction of correctly classified samples.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    criterion = nn.CrossEntropyLoss()
    predictions = []
    true_labels = []

    # Use the model's own device rather than a notebook-level global
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()

    # Evaluation accumulators
    running_loss = 0
    total = 0
    num_correct = 0

    # No gradients are needed for evaluation; skipping them saves memory and time
    with torch.no_grad():
        for encode, label, length in dataloader:

            # Send inputs and labels to the model's device
            if target_device is not None:
                encode = encode.to(target_device)
                label = label.to(target_device)

            # Predictions from the network
            output = model(encode, length)
            preds = torch.max(output, 1)[1]

            # Append true labels and predictions to their corresponding lists
            true_labels.extend(label.cpu().numpy())
            predictions.extend(preds.cpu().numpy())

            # Compute loss
            loss = criterion(output, label)

            # Weight the batch-mean loss by batch size so the final figure is a per-sample mean
            batch_count = output.shape[0]
            running_loss += loss.item() * batch_count
            total += batch_count
            num_correct += torch.sum(preds == label).item()

    if total == 0:
        raise ValueError("evaluate_model received a dataloader with no samples")

    # Calculate loss and accuracy over the whole dataloader
    epoch_loss = running_loss / total
    epoch_acc = num_correct / total

    if return_preds:
        np.savetxt(output_file, predictions, fmt='%i')
        np.savetxt(ytrue_file, true_labels, fmt='%i')

    return epoch_loss, epoch_acc

In [19]:
import time

def train_model(model, train_dl, val_dl, epochs=5, lr=0.001, print_every=3, save_path='best-model.pt'):
    """
    Train the model with Adam and cross-entropy loss, keeping the weights of
    the epoch with the best validation accuracy.

    Args:
        model: module called as model(encoded_batch, lengths).
        train_dl: iterable of (encoded_batch, labels, lengths) used for training.
        val_dl: iterable of the same triples, evaluated after every epoch.
        epochs: number of passes over train_dl.
        lr: Adam learning rate.
        print_every: print statistics every this many epochs (0/None disables).
        save_path: where the best epoch's state_dict is written.

    Returns:
        (train_loss, train_acc): per-epoch training loss and accuracy lists.
    """
    start = time.time()
    print("Begin training")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Move batches to wherever the model's weights live
    model_device = next(model.parameters()).device

    best_accuracy = 0
    best_epoch = 0
    # Snapshot of the best weights; None until the first epoch has been evaluated
    best_state = None

    train_loss = []
    train_acc = []

    val_loss_arr = []
    val_acc_arr = []

    for e in range(1, epochs + 1):
        model.train()

        # Per-epoch accumulators
        running_loss = 0
        total = 0
        num_correct = 0

        for encode, label, length in train_dl:

            # Send to the model's device
            encode = encode.to(model_device)
            label = label.to(model_device)

            # Clear gradients left over from the previous batch
            optimizer.zero_grad()

            # Predictions from the network
            output = model(encode, length)

            # Compute loss
            loss = criterion(output, label)

            # Compute number of correctly classified instances
            correct = calculate_num_correct(output, label)

            # Backpropagate the loss and compute the gradients
            loss.backward()

            # Update the weights
            optimizer.step()

            running_loss += loss.item() * output.shape[0]
            total += output.shape[0]
            num_correct += correct

        # End of epoch
        # Calculate loss and accuracy for one epoch
        epoch_loss = running_loss / total
        epoch_acc = num_correct / total

        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)

        val_loss, val_acc = evaluate_model(model, val_dl)

        val_loss_arr.append(val_loss)
        val_acc_arr.append(val_acc)

        # Keep the weights with the best dev accuracy over all epochs. The first
        # epoch is always recorded so a checkpoint exists even if accuracy is 0.
        if best_state is None or val_acc > best_accuracy:
            best_accuracy = val_acc
            # Copy only the weights; cheaper than deep-copying the whole module
            best_state = deepcopy(model.state_dict())
            best_epoch = e

        # Print training statistics
        if print_every and e % print_every == 0:
            print("----- Train statistics -----")
            print(f"Epoch {e}")
            print(f"Train accuracy: {epoch_acc}")
            print(f"Train loss: {epoch_loss}")
            print()

            print("----- Validation statistics -----")
            print(f"Val loss: {val_loss}")
            print(f"Val accuracy: {val_acc}")
            print()

    # End of training
    if best_state is None:
        # epochs <= 0: nothing was trained, so there is nothing to report or save
        print("No epochs were run; no model was saved.")
        return train_loss, train_acc

    print(f"Best epoch: {best_epoch}")
    print(f"Train accuracy: {train_acc[best_epoch-1]}")
    print(f"Train loss: {train_loss[best_epoch-1]}")
    print()
    print(f"Dev loss: {val_loss_arr[best_epoch-1]}")
    print(f"Dev accuracy: {best_accuracy}")

    end = time.time()
    print(f"Total train time (minutes): {(end - start)/60}")

    torch.save(best_state, save_path)

    return train_loss, train_acc

In [20]:
# Model hyperparameters
vocab_size = len(vocabulary)
embedding_dim = 300
hidden_dim = 300
num_layers = 1
bidirectional = True
dropout = 0.3
rnn_model='lstm'

# Build the classifier on the selected device and train it
model = RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_matrix=weights_matrix,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    rnn_model=rnn_model,
).to(device)
train_loss, train_acc = train_model(model, train_dl, val_dl, epochs=15, lr=0.003)

Begin training
----- Train statistics -----
Epoch 3
Train accuracy: 0.7711517857142857
Train loss: 0.582670745248241

----- Validation statistics -----
Val loss: 0.8855831099408014
Val accuracy: 0.6678571428571428

----- Train statistics -----
Epoch 6
Train accuracy: 0.8690446428571429
Train loss: 0.3453462529900883

----- Validation statistics -----
Val loss: 1.1076075327396393
Val accuracy: 0.6772857142857143

----- Train statistics -----
Epoch 9
Train accuracy: 0.8933482142857143
Train loss: 0.28319150510510166

----- Validation statistics -----
Val loss: 1.2962410075323922
Val accuracy: 0.6725714285714286

----- Train statistics -----
Epoch 12
Train accuracy: 0.8981517857142857
Train loss: 0.2718177162050935

----- Validation statistics -----
Val loss: 1.3895961623532431
Val accuracy: 0.6695

----- Train statistics -----
Epoch 15
Train accuracy: 0.8956875
Train loss: 0.2777675553963387

----- Validation statistics -----
Val loss: 1.4125259410483497
Val accuracy: 0.6663571428571429


In [21]:
# Load in model with best dev accuracy
best_model = RNN(vocab_size, embedding_dim, weights_matrix, hidden_dim, num_layers, bidirectional, dropout, rnn_model=rnn_model)
best_model = best_model.to(device)
# Read the checkpoint from the same relative path train_model saves to, and remap
# its tensors onto the current device so a GPU-trained checkpoint also loads on CPU.
best_model.load_state_dict(torch.load('best-model.pt', map_location=device))

<All keys matched successfully>

In [22]:
# PRINT VALIDATION STATISTICS

# Evaluate on the validation loader and dump predictions / true labels to CSV
val_loss, val_acc = evaluate_model(best_model, val_dl, return_preds=True, output_file='val_ypred.csv', ytrue_file='val_ytrue.csv')

# Read both files back so the sklearn metrics are computed on exactly what was written
val_output_csv = pd.read_csv('val_ypred.csv', header=None)
val_true_y = pd.read_csv('val_ytrue.csv', header=None)

# Sanity check: one prediction per true label
print(len(val_output_csv) == len(val_true_y))

val_report = (
    ('Validation accuracy: ', accuracy_score(val_true_y, val_output_csv)),
    ('Validation Average F1:', f1_score(val_true_y, val_output_csv, average="weighted")),
    ('Validation RMSE: ', np.sqrt(mean_squared_error(val_true_y, val_output_csv))),
)

# Each metric is preceded by a blank line
for metric_label, metric_value in val_report:
    print()
    print(metric_label, metric_value)

True

Validation accuracy:  0.6792142857142857

Validation Average F1: 0.6798049488054211

Validation RMSE:  0.7743661002025186


In [23]:
# PRINT TEST STATISTICS
# Evaluate on the test loader; predictions / true labels go to the default CSV paths
test_loss, test_acc = evaluate_model(best_model, test_dl, return_preds=True)

# Read both files back so the sklearn metrics are computed on exactly what was written
output_csv = pd.read_csv('y_pred.csv', header=None)
true_y = pd.read_csv('y_true.csv', header=None)

# Sanity check: one prediction per true label
print(len(output_csv) == len(true_y))

test_report = (
    ('Test accuracy: ', accuracy_score(true_y, output_csv)),
    ('Test Average F1:', f1_score(true_y, output_csv, average="weighted")),
    ('Test RMSE: ', np.sqrt(mean_squared_error(true_y, output_csv))),
)

# Metrics are separated by a blank line (none before the first one)
for position, (metric_label, metric_value) in enumerate(test_report):
    if position:
        print()
    print(metric_label, metric_value)

True
Test accuracy:  0.6761428571428572

Test Average F1: 0.6769670777196883

Test RMSE:  0.7832624081366346
