# Seentiment Analysis with Recurrent Neural Network vs Gated Recurrent Units vs Long and Short-Term Memory

In [185]:
import os
import re
import nltk
import pytreebank
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F


import plotly.express as px
import matplotlib.pyplot as plt

from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter
from torch.utils.data import TensorDataset, DataLoader
from torchtext.vocab import GloVe

from sklearn import metrics

# download all the nltk corpora only once
#nltk.download('all')

In [3]:
# set available device
is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [186]:
# load GLoVe embeddings downloaded at ../data/glove/glove.6B.200d.txt
glove = GloVe(name='6B', dim=200, cache='../data/glove')

In [82]:
# display sample of GLoVe embeddings
print(glove.vectors.shape)
print(glove.vectors[0])

torch.Size([400000, 200])
tensor([-7.1549e-02,  9.3459e-02,  2.3738e-02, -9.0339e-02,  5.6123e-02,
         3.2547e-01, -3.9796e-01, -9.2139e-02,  6.1181e-02, -1.8950e-01,
         1.3061e-01,  1.4349e-01,  1.1479e-02,  3.8158e-01,  5.4030e-01,
        -1.4088e-01,  2.4315e-01,  2.3036e-01, -5.5339e-01,  4.8154e-02,
         4.5662e-01,  3.2338e+00,  2.0199e-02,  4.9019e-02, -1.4132e-02,
         7.6017e-02, -1.1527e-01,  2.0060e-01, -7.7657e-02,  2.4328e-01,
         1.6368e-01, -3.4118e-01, -6.6070e-02,  1.0152e-01,  3.8232e-02,
        -1.7668e-01, -8.8153e-01, -3.3895e-01, -3.5481e-02, -5.5095e-01,
        -1.6899e-02, -4.3982e-01,  3.9004e-02,  4.0447e-01, -2.5880e-01,
         6.4594e-01,  2.6641e-01,  2.8009e-01, -2.4625e-02,  6.3302e-01,
        -3.1700e-01,  1.0271e-01,  3.0886e-01,  9.7792e-02, -3.8227e-01,
         8.6552e-02,  4.7075e-02,  2.3511e-01, -3.2127e-01, -2.8538e-01,
         1.6670e-01, -4.9707e-03, -6.2714e-01, -2.4904e-01,  2.9713e-01,
         1.4379e-01, -1.2

## Prepare Data for Analysis

In [187]:
# load the SST corpus in the parenthesis format
dataset = pytreebank.load_sst()

# add Javascript  and CSS to the Ipython Notebook
pytreebank.LabeledTree.inject_visualization_javascript()

# visualize an example
example = dataset["train"][1]
example.display()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [188]:
def extract_text_and_label(tree):
    """  Extract text and label text from a given tree"""
    text = tree.to_lines()[0]
    label = tree.label
    return text, label

def add_sentiment(df):
    """ Remove data with neutral label of 2 and add sentiment: 
        negative if label is 0 or 1, positive otherwise. """
    df = df[df["label"] != 2]
    df["positivity"] = df["label"].apply(lambda x: 1 if x > 2 else 0)
    
    return df

def extract_dataframe(dataset, category):
    """ Extract text and label from all trees into a dataframe. """
    data = []
    for tree in dataset[category]:
        text, label = extract_text_and_label(tree)
        data.append([text, label])
    
    df = pd.DataFrame(data, columns=["text", "label"])
    df = add_sentiment(df)

    return df


In [189]:
# extract text and label dataframes for train,test and validate dataset
df_train = extract_dataframe(dataset, "train")
df_test = extract_dataframe(dataset, "test")
df_validate = extract_dataframe(dataset, "dev")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [191]:
df_train.head()

Unnamed: 0,text,label,positivity
0,The Rock is destined to be the 21st Century 's...,3,1
1,The gorgeously elaborate continuation of `` Th...,4,1
2,Singer/composer Bryan Adams contributes a slew...,3,1
4,Yet the act is still charming here .,3,1
5,Whether or not you 're enlightened by any of D...,4,1


In [192]:
x_train = df_train["text"]; y_train = df_train["positivity"]
x_test = df_test["text"]; y_test = df_test["positivity"]
x_validate = df_validate["text"]; y_validate = df_validate["positivity"]

print(f'shape of train  data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')
print(f'shape of validate data is {x_validate.shape}')

shape of train  data is (6920,)
shape of test data is (1821,)
shape of validate data is (872,)


##  Data statistics

In [11]:
# concatenate train, test and validate data frames alonng axis=0 with additional column "data_type"
df_train["data_type"] = "train"
df_test["data_type"] = "test"
df_validate["data_type"] = "validate"

df = pd.concat([df_train, df_test, df_validate], axis=0)
df

Unnamed: 0,text,label,positivity,data_type
0,The Rock is destined to be the 21st Century 's...,3,1,train
1,The gorgeously elaborate continuation of `` Th...,4,1,train
2,Singer/composer Bryan Adams contributes a slew...,3,1,train
4,Yet the act is still charming here .,3,1,train
5,Whether or not you 're enlightened by any of D...,4,1,train
...,...,...,...,...
1095,... Designed to provide a mix of smiles and te...,1,0,validate
1096,it seems to me the film is about the art of ri...,1,0,validate
1097,It 's just disappointingly superficial -- a mo...,1,0,validate
1098,The title not only describes its main characte...,1,0,validate


In [12]:
# use plotly.express to plot df label count distribution as bar plot
fig = px.histogram(df, color="positivity", x="data_type", barmode="group", title="Label count distribution")
# order the x-axis by sentiment
fig.update_xaxes(categoryorder="array", categoryarray=["negative", "positive"])
fig.show()    

## Pre-processing text

Tokenization, Removing Stop Words and Lemmatization:
Word tokanization splits a sentence into tokens (words and punctuation)
Lemmatization gets the base form of the word;  NLTK WordNetLemmatizer treats everythins as a noun

In [101]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elliemcintosh/nltk_data...


True

In [203]:
def preprocess_text(text):
    """ Preprocess string by removing all non-alphanumeric characters, 
    replacing all runs of whitespace with a no space, and digits with no space."""

    # remove all non-alphanumeric characters
    text = re.sub(r"[^\w\s]", "", text)

    # replace all runs of whitespace with a single no space
    text = re.sub(r"\s+", "", text)

    # replace all digits with no space
    text = re.sub(r"\d", "", text)

    return text

def tockenize_text(x_train, x_test):
    """ Create a one-hot dictionaty of tokens for train and test text:
        One-hot-dictionary maps each word in the corpus to an unique integer and convert 
        the words in each sentence into their corresponding integer tokens."""
    word_list = []
    stop_words = set(stopwords.words("english"))
    
    for sentance in x_train:
        for word in sentance.lower().split():
            word = preprocess_text(word)
            if word not in stop_words and word != "":
                word_list.append(word)

    corpus = Counter(word_list)
    # sort the corpus by frequency
    corpus = sorted(corpus, key=corpus.get, reverse=True)
    # create a one-hot dictionary, which maps each word to a unique integer
    one_hot_dict = {word: i for i, word in enumerate(corpus, 1)}

    # tokenize train and test data
    x_train_tokens = [[one_hot_dict[preprocess_text(word)] for word in sentance.lower().split() if preprocess_text(word) in one_hot_dict.keys()] for sentance in x_train]
    x_test_tokens = [[one_hot_dict[preprocess_text(word)] for word in sentance.lower().split() if preprocess_text(word) in one_hot_dict.keys()] for sentance in x_test]

    return x_train_tokens, x_test_tokens, one_hot_dict


### Get vocabulary and emebding matrix for all vocaulary words

In [204]:
# tockenize train and test input data and get a vocabulary to use for embedding
x_train_tock, x_test_tock, vocab = tockenize_text(x_train, x_test)

### Embedding Matrix

In [207]:
vocab_size = len(vocab) + 1
embedding_dim = glove.dim

# create an emedding matrix with shape (vocab_size, embedding_dim)
embedding_matrix = torch.zeros((vocab_size, embedding_dim))
for word, idx in vocab.items():
    try:
        embedding_matrix[idx] = glove[word]
    except KeyError:
        # word not in Glove vocabulary, skip
        continue

In [208]:
print(type(embedding_matrix))
print(embedding_matrix.shape)

<class 'torch.Tensor'>
torch.Size([14417, 200])


In [17]:
print(f"Length of vocabulary is {len(vocab)}")
# print out the first (sorted of basis of most common) 10 key value paris of vocab, where each word is mapped to a unique integer index
[(k, v) for i,(k,v) in enumerate(vocab.items()) if i < 10]

Length of vocabulary is 14416


[('film', 1),
 ('movie', 2),
 ('nt', 3),
 ('one', 4),
 ('like', 5),
 ('story', 6),
 ('even', 7),
 ('good', 8),
 ('comedy', 9),
 ('much', 10)]

In [18]:
# each sentence of the training/test text is tokenized into a list of integers
x_train_tock[0]

[440,
 2737,
 2250,
 1116,
 31,
 4645,
 141,
 17,
 3440,
 7,
 3441,
 1892,
 2738,
 6947,
 3442,
 6948,
 661,
 6949]

### Analysis of sentence length

Pytorch TensorDatasett and DataLoader classes used for batching and loading data  expect all tensors in a batch to have the same shape.

In [22]:
sentence_lengths = [len(sentence) for sentence in x_train]
# plot the distribution of sentence lengths in train data with plotly.express
fig = px.histogram(x=sentence_lengths, title="Distribution of sentence lengths in train data")
fig.show()

pd.Series(sentence_lengths).describe()

count    6920.000000
mean      103.413295
std        51.374877
min         5.000000
25%        63.750000
50%        99.000000
75%       138.000000
max       267.000000
dtype: float64

### Padding

In [209]:
sen_len = max(len(sentence) for sentence in x_train)
print(f'maximum sentence length is {sen_len}')

maximum sentence length is 267


In [210]:
def padding(sentences, sen_len, padding="pre"):
    """ Pad with zeros  or truncate each sentence of a sentences to a chosen sentence length. """
    features = np.zeros((len(sentences), sen_len), dtype=int)
    for i, sentence in enumerate(sentences):
        if len(sentence) != 0:
            if len(sentence) >= sen_len:
                features[i, :] = np.array(sentence)[:sen_len]
            else:
                if padding == "pre":
                    features[i, -len(sentence):] = np.array(sentence) # pre-padding
                else:
                    features[i, 0:len(sentence)] = np.array(sentence)  # post-padding
    return features

In [211]:
# pad the input sentences in train and test data and turn them into numpy arrays
x_train_tock_pad = padding(x_train_tock, sen_len)
x_test_tock_pad = padding(x_test_tock, sen_len)

# turn labels into numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

In [212]:
type(x_train_tock_pad)

numpy.ndarray

## Batching and Loading

In [213]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train_tock_pad), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(x_test_tock_pad), torch.from_numpy(y_test))

# define dataloaders, shuffling the data at each epoch
batch_size = 32

# load batched and shuffled data in parallel with DataLoader object
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=2)  
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size, num_workers=2)

In [214]:
# x needs to be (batch_size, seq_length, input_size) and y (batch_size, 1)
print(f"train data size is {x_train_tock_pad.shape}")
print(f"test data size is {x_test_tock_pad.shape}")

train data size is (6920, 267)
test data size is (1821, 267)


In [215]:
# the number of samples in the last traing and testing batches
print(f"number of training samples in last batch is {x_train_tock_pad.shape[0] %  batch_size}")
print(f"number of test samples in last batch is {x_test_tock_pad.shape[0] % batch_size}")

number of training samples in last batch is 8
number of test samples in last batch is 29


In [216]:
# obtain one batch of training data
data_iter = iter(train_loader)
# the batch is a list of of 50 sentences, 267 tokens each, and their sentiments
one_batch = next(data_iter)

x_train_batch, y_train_batch = one_batch

print(f"Batch of input sentences: \n{x_train_batch}")
print(f"Batch of sentiment for input sentences: \n {y_train_batch}")

Batch of input sentences: 
tensor([[    0,     0,     0,  ...,  3568,   443,    37],
        [    0,     0,     0,  ...,   377, 13792, 13793],
        [    0,     0,     0,  ...,    64,    13,   337],
        ...,
        [    0,     0,     0,  ...,  2692,   921,    22],
        [    0,     0,     0,  ...,     8,   267,   261],
        [    0,     0,     0,  ...,    33,    36,  2160]])
Batch of sentiment for input sentences: 
 tensor([1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
        1, 0, 1, 0, 1, 0, 1, 1])


In [217]:
print(x_train_batch.size())
print(x_train_batch.size(0))

torch.Size([32, 267])
32


## Sentiment Analysis with RNN

RNNs can process inputs of any length, the model size is not proportional to the sizez of the input, coputation takes into  account previous information.

RNN has difficulty leearnig long-term dependencies because of  the diminishing influence of the input data over time. Common issue is when all batch inputs start converging to the same RNN output over time, **Vanishing  gradients** problem. In an RNN, the same weights are used at each time step. If the weigths are samll, the gradients can become exponentially small as they propagate back  though time, leading  to vanishing effect. 

As a result, the RNN might start to produce similar outputs for all inputs in a batch, especially for later time steps, because it's "forgetting" the earlier inputs.

In [242]:
class RNN(nn.Module):
    """ An RNN model with single hidden layer and fully connected linear layer. """

    def __init__(self, input_size, embedding_dim, hidden_dim, num_layers, num_classes):
        """ Initialize the model by setting up the layers. """
        super(RNN, self).__init__()
        
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # embedding input size = vocab size + 1 for padding
        self.embedding = nn.Embedding(input_size, embedding_dim)
        # RNN layer
        self.rnn = nn.RNN(embedding_dim,  # the input size is the output size of the embedding layer
                           hidden_dim, 
                           num_layers, 
                           batch_first=True) # Batch as the first dimension of the tensor input and output
                                             # so the shape of the input should be (batch_size, seq_length, input_size)


        # Linear layer with output size = number of classes
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """ Perform a forward pass, setting initial hidden states """
        batch_size = x.size(0)
        embeddings = self.embedding(x)
        # initialize the hidden state
        hidden_state = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        # forward pass of the embeddings of shape (batch_size, seq_len, embedding_dim)through RNN
        # the rnn output is a tensor of shape (batch_size, seq_len, hidden_dim) and the hidden state for the last time step
        out, _ = self.rnn(embeddings, hidden_state)
        # since the rnn output contains the features from all the time steps, we take the last time step output by slicing the tensor
        out  = out[:, -1, :]
        out = self.fc(out)
        out = torch.sigmoid(out)

        return out    
        

In [249]:
class GRU(nn.Module):
    """ A GRU model with a single hidden layer and fully connected linear layer."""

    def __init__(self, input_size, embedding_dim, hidden_dim, num_layers, num_classes):
        """ Initialize the model by setting up the layers. """
        super(GRU, self).__init__()
 
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # embedding input size = vocab size + 1 for padding
        self.embedding = nn.Embedding(input_size, embedding_dim)
        # GRU layer
        self.gru = nn.GRU(embedding_dim,  # the input size is the output size of the embedding layer
                           hidden_dim, 
                           num_layers, 
                           batch_first=True) # Batch as the first dimension of the tensor input and output
                                             # so the shape of the input should be (batch_size, seq_length, input_size)


        # Linear layer with output size = number of classes
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """ Perform a forward pass, setting initial hidden states """
        batch_size = x.size(0)
        embeddings = self.embedding(x)
        # initialize the hidden state
        hidden_state = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        # forward pass of the embeddings of shape (batch_size, seq_len, embedding_dim)through RNN
        # the rnn output is a tensor of shape (batch_size, seq_len, hidden_dim) and the hidden state for the last time step
        out, _ = self.gru(embeddings, hidden_state)

        # take the last time step output by slicing the tensor
        out = self.fc(out[:, -1, :])
        out = torch.sigmoid(out)

        return out

In [237]:
class LSTM(nn.Module):
    """ LSTM model with a hidden and cell state and fully connected linear layer."""

    def __init__(self, embedding_matrix, hidden_dim, num_layers, num_classes):
        """ Initialize the model by setting up the layers. """
        # call constructor from parent class
        super(LSTM, self).__init__()
        
        self.embedding_matrix = embedding_matrix
        num_words = self.embedding_matrix.size()[0]
        embedding_dim  = self.embedding_matrix.size()[1]

        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers
 
        # nn.Embedding() takes the word tocken ids and converts them into the embedding vectors;
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim = embedding_dim)
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim,  # the input size is the output size of the embedding layer
                            hidden_dim, 
                            num_layers, 
                            batch_first=True) # batch as the first dimension of the tensor input and output


        # Linear layer with output size = number of classes
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """ Perform a forward pass, setting initial hidden and cell states """
        batch_size = x.size(0)

        # initialize the hidden and cell states
        hidden_state = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell_state = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        embeddings = self.embedding(x)
        # the lstm layer automatically updates the hidden state after processing each sequence element and returns the output features from each step
        out, _ = self.lstm(embeddings, (hidden_state, cell_state))
        # the final state of each sequence encodes the representation of the entire sentence
        out = self.fc(out[:, -1, :])
        out = torch.sigmoid(out)
        
        return out


In [250]:
def train(data_loader, model, optimizer, device, criterion, clip):
    """ Train the model on the training set. """

    # set model to training mode
    model.train()
    # go through the batches of data in data_loader:
    for data in data_loader:
        
        sentences = data[0]
        sentiment = data[1]
        # move the data to the device that we want to use
        sentences = sentences.to(device,dtype=torch.long)
        sentiment = sentiment.to(device,dtype=torch.float)
        
        # clear the gradients
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(sentences)

        # Backword pass
        loss = criterion(predictions, sentiment.view(-1,1))
        loss.backward()

        # gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()

def evaluate(data_loader, model, device):

    final_predictions = []
    final_targets = []
    
    model.eval()

    for data in data_loader:
        sentences = data[0]
        sentiment = data[1]
        # move the data to the device that we want to use
        sentences = sentences.to(device,dtype=torch.long)
        sentiment = sentiment.to(device,dtype=torch.float)

        predictions = model(sentences)
        # apply sigmoid function
        predictions = torch.sigmoid(predictions)
        #convert to binary values
        predictions = (predictions >= 0.5).float()
        
        # use detatch() to separatet the tensor from the computational graph
        predictions = predictions.detach().cpu().numpy().tolist()
        sentiment = sentiment.detach().cpu().numpy().tolist()

        final_predictions.extend(predictions)
        final_targets.extend(sentiment)     

        return  final_predictions, final_targets   

In [255]:
# hyperparameters
input_size = vocab_size     # embedding_matrix.size()[0] or  len(vocab)+ 1, number of unique words in dataset plus 1 for padding
sequence_length = sen_len   # maximum sentence length
embedding_dim = glove.dim   # transform each word from vocab index into a dense vector representation of size embedding_dim
hidden_dim = 128
num_layers = 2              # 2 is staking two RNNs in sequence, the second RNN taking as input the hidden state output of the first RNN
num_classes = 1             # probability for positivity sentiment

#model = RNN(input_size, embedding_dim, hidden_dim, num_layers, num_classes).to(device)
#model = GRU(input_size, embedding_dim, hidden_dim, num_layers, num_classes).to(device)
model = LSTM(embedding_matrix, hidden_dim, num_layers, num_classes).to(device)

print(model)

lr = 0.001
#criterion = nn.BCEWithLogitsLoss()
criterion = nn.BCELoss()                     
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
clip = 5
epochs = 3

LSTM(
  (embedding): Embedding(14417, 200)
  (lstm): LSTM(200, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)


In [256]:
for epoch in range(1, epochs+1):
        train(train_loader, model, optimizer, device, criterion, clip)
        outputs, targets = evaluate(test_loader, model, device)
        accuracy = metrics.accuracy_score(targets, outputs)
        
        print(f"Epoch:{epoch}, Accuracy Score: {accuracy}")

Epoch:1, Accuracy Score: 0.9375
Epoch:2, Accuracy Score: 0.9375
Epoch:3, Accuracy Score: 0.9375
