# **NLP - Sentiment Analysis of Tweets using biLSTM**
A deep learning model built using PyTorch and TorchText to classify sentiments of tweets using a subset of the <a href="https://www.kaggle.com/kazanova/sentiment140">sentiment140 dataset</a>.

1. [Dataset Preparation](#section1)
2. [Preprocessing](#section2)
3. [Model](#section3)
4. [Training](#section4)
5. [Prediction](#section5)

In [1]:
import csv
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [32]:
# Rebuild the tokenized subset only when it is not already on disk.
# Assign True by hand instead to force regeneration from the raw CSV.
create_subset = not Path("sentiment140-small-tokenized.csv").exists()

In [2]:
# Seed the Python, NumPy and PyTorch random number generators so that a
# "Restart & Run All" reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Report the GPU name and current memory use (in GiB) when running on CUDA
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# Request deterministic cuDNN algorithms
torch.backends.cudnn.deterministic = True

Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<a id='section1'></a>
# **1. Dataset Preparation**
The first column contains the sentiments and the last column contains the tweets.

In [None]:
if create_subset:
    # Read the raw CSV into a dataframe; header=None means the file has no
    # header row, so columns are addressed by integer position
    df = pd.read_csv("training.1600000.processed.noemoticon.csv", engine="python", header=None)

    # Jupyter only auto-renders a cell's last top-level expression; inside an
    # `if` block the preview must be displayed explicitly
    display(df.head(5))

The dataset consists of two sentiments (0 - negative, 4 - positive)

In [None]:
if create_subset:
    # Count the number of tweets per sentiment (column 0).
    # display() is needed because an expression nested in `if` is not auto-rendered.
    display(df[0].value_counts())

In [None]:
if create_subset:
    # Model the sentiments as binary (0 - negative, 1 - positive)
    df[0]=df[0].replace(to_replace=4,value=1)
    # Show the counts after remapping; display() is needed because an
    # expression nested in `if` is not auto-rendered
    display(df[0].value_counts())

In [None]:
from utils.sentiment_util import tokenize_csv

if create_subset:
    # Number of tweets drawn from the full dataset
    SUBSET_SIZE = 500000
    # Fixed seed so the same subset is drawn on every run
    SUBSET_SEED = 42

    # Save a subset as a smaller dataset from training
    df.sample(SUBSET_SIZE, random_state=SUBSET_SEED).to_csv("sentiment140-small.csv", header=None, index=None)
    # Per the original note, tokenize_csv removes stopwords and punctuation and
    # lowercases the text (helper lives in utils.sentiment_util; not shown here)
    tokenize_csv('sentiment140-small.csv', 'sentiment140-small-tokenized.csv')

<a id='section2'></a>
# **2. Preprocessing**

In [3]:
# Declare fields for tweets and labels
# include_lengths tells the RNN how long the actual sequences are
TEXT = torchtext.legacy.data.Field(tokenize='spacy', lower=True, include_lengths= True)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

# Map data to fields: CSV column 0 is the label, column 1 is the tweet text
fields = [('label', LABEL), ('text', TEXT)]

# Apply field definition to create torch dataset
dataset = torchtext.legacy.data.TabularDataset(
        path="sentiment140-small-tokenized.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

# Split data into train, validation, test sets.
# NOTE: Dataset.split takes its ratios as [train, test, valid] but RETURNS the
# datasets as (train, valid, test), so the result is unpacked in that order.
(train_data, valid_data, test_data) = dataset.split(split_ratio=[0.8,0.1,0.1])

print("Number of train data: {}".format(len(train_data)))
print("Number of test data: {}".format(len(test_data)))
print("Number of validation data: {}".format(len(valid_data)))



Number of train data: 399335
Number of test data: 49917
Number of validation data: 49917


In [4]:
# Peek at one training example: print its attributes (label and token list)
train_example = train_data.examples[10]
print(vars(train_example))

{'label': '0', 'text': ['aw', 'hour', 'left', 'fun', 'weekend', 'fun', 'far', 'currently', 'reading', 'body', 'script']}


### **Build Vocabulary**
Build the vocabulary for the training set using pre-trained GloVe embeddings.
GloVe embeddings were trained on 6 billion tokens and the embeddings are 100-dimensional.

In [6]:
# Upper bound on the number of tokens kept in the text vocabulary
MAX_VOCAB_SIZE = 287799

# Label vocabulary, built from the training split only
LABEL.build_vocab(train_data)

# Text vocabulary, also from the training split only, with 100-d GloVe vectors
# attached. unk_init (a Gaussian here) supplies the vectors for vocabulary
# words that have no pre-trained embedding.
vocab_options = dict(
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
TEXT.build_vocab(train_data, **vocab_options)

# Ten most frequent tokens with their counts
TEXT.vocab.freqs.most_common(10)

[('i', 138768),
 ('day', 25038),
 ('good', 22465),
 ('get', 21319),
 ('like', 19745),
 ('go', 19442),
 ('got', 17400),
 ('love', 16768),
 ('work', 16663),
 ('today', 16156)]

### **Iterator**
Tweets within a batch are padded to the same length so that they can be processed together.
The BucketIterator groups tweets of similar lengths into the same batch, which minimizes the amount of padding needed.


In [8]:
BATCH_SIZE = 128


def _tweet_length(example):
    """Return the number of tokens in an example's text (the bucketing key)."""
    return len(example.text)


# One iterator per split, in the same order as the datasets passed in.
# sort_within_batch sorts all the tensors within a batch by their lengths.
iterators = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_tweet_length,
    sort_within_batch=True,
    device=device,
)
train_iterator, valid_iterator, test_iterator = iterators

<a id='section3'></a>
# **3. Model**

### **Create Model**

In [9]:
from models.sentiment_model import SentimentLSTM

# --- Sizes taken from the vocabulary ---
INPUT_DIM = len(TEXT.vocab)                 # one embedding row per vocabulary entry
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]   # index of the padding token

# --- Architecture hyperparameters ---
EMBEDDING_DIM = 100     # must equal the dimension of the pre-trained GloVe vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2            # 2 layers of biLSTM
BIDIRECTIONAL = True
DROPOUT = 0.5           # dropout probability

# Build the network; arguments are passed positionally in the order
# SentimentLSTM's constructor expects.
model = SentimentLSTM(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX,
)

print(model)

[nltk_data] Downloading package wordnet to /home/emilyjin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/emilyjin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


SentimentLSTM(
  (embedding): Embedding(200867, 100, padding_idx=1)
  (encoder): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (predictor): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [10]:
# Look up one example through the training iterator's underlying dataset
iterator_sample = train_iterator.dataset[10]
print(vars(iterator_sample))

{'label': '0', 'text': ['aw', 'hour', 'left', 'fun', 'weekend', 'fun', 'far', 'currently', 'reading', 'body', 'script']}


In [11]:
# Copy the pre-trained word embeddings into the embedding layer
pretrained_embeddings = TEXT.vocab.vectors

# [vocab size, embedding dim]
print(pretrained_embeddings.shape)

torch.Size([200867, 100])


In [12]:
# Overwrite the embedding layer's initial weights, in place, with the
# pre-trained vectors; copy_ returns the updated tensor, which the cell displays
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

tensor([[-0.3073, -0.1130, -1.0766,  ...,  0.4044, -0.9157, -0.1943],
        [-1.4156, -0.0118, -1.0315,  ..., -0.7681,  0.6292,  1.8166],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [ 0.1778, -0.7691, -0.0896,  ..., -1.8917, -0.4900,  0.6874],
        [ 0.1090,  2.0768, -0.7294,  ...,  0.1431, -0.9128,  1.1328],
        [-2.2512,  1.1518, -0.2669,  ...,  0.0503, -0.6633, -0.8909]])

In [13]:
# <unk> and <pad> are both set to all-zero vectors (treated as carrying no
# sentiment information)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Zero the matching rows of the embedding matrix, one special token at a time
embedding_matrix = model.embedding.weight.data
for special_idx in (UNK_IDX, PAD_IDX):
    embedding_matrix[special_idx] = torch.zeros(EMBEDDING_DIM)

print(embedding_matrix)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [ 0.1778, -0.7691, -0.0896,  ..., -1.8917, -0.4900,  0.6874],
        [ 0.1090,  2.0768, -0.7294,  ...,  0.1431, -0.9128,  1.1328],
        [-2.2512,  1.1518, -0.2669,  ...,  0.0503, -0.6633, -0.8909]])


<a id='section4'></a>
# **4. Training**

In [14]:
# The Adam optimizer is created inside the training loop, once per learning rate.

# Loss function: binary cross entropy with logits.
# BCEWithLogitsLoss applies a sigmoid to the raw model output, squashing it to
# a number between 0 and 1, and then computes binary cross entropy on that
# bounded scalar.
# The loss module and the model are both moved to the selected device (GPU when available).
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

### **Train the model**

In [15]:
from utils.sentiment_util import batch_accuracy, timer, evaluate, train

In [16]:
# Number of epochs trained for each candidate learning rate
NUM_EPOCHS = 10

# Lowest validation loss seen so far, across every learning rate and epoch
best_valid_loss = float('inf')
# Learning rate that produced best_valid_loss (None until the first improvement)
best_lr = None

learning_rates = [1e-3, 5e-3, 1e-2, 5e-2]

# Snapshot of the model's weights before any training in this cell. The
# tensors are cloned because state_dict() hands back references that the
# optimizer later updates in place.
# NOTE: if this cell is re-run on an already trained model, the snapshot
# captures those trained weights; re-create the model first for a clean sweep.
initial_state = {name: tensor.detach().clone()
                 for name, tensor in model.state_dict().items()}

for lr in learning_rates:
    print(f'learning rate is {lr}')

    # Restart from the same initial weights so that every learning rate is
    # evaluated independently instead of continuing from the previous run
    model.load_state_dict(initial_state)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(NUM_EPOCHS):

        start_time = time.time()

        # Run one training epoch; returns training loss and accuracy
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        # Evaluate validation loss and accuracy
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        end_time = time.time()

        mins, secs = timer(start_time, end_time)

        # Checkpoint whenever this epoch beats the best validation loss so far
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_lr = lr
            # Save the parameters of the model
            torch.save(model.state_dict(), 'model-small.pt')

        print("Epoch {}:".format(epoch+1))
        print("\t Total Time: {}m {}s".format(mins, secs))
        print("\t Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 2), round(train_acc*100, 2)))
        print("\t Validation Loss {} | Validation Accuracy: {}%".format(round(valid_loss, 2), round(valid_acc*100, 2)))

print("Best validation loss {} obtained with learning rate {}".format(round(best_valid_loss, 2), best_lr))

learning rate is 0.001
Epoch 1:
	 Total Time: 1m 45s
	 Train Loss 0.52 | Train Accuracy: 74.12%
	 Validation Loss 0.47 | Validation Accuracy: 77.65%
Epoch 2:
	 Total Time: 1m 44s
	 Train Loss 0.47 | Train Accuracy: 77.76%
	 Validation Loss 0.46 | Validation Accuracy: 78.37%
Epoch 3:
	 Total Time: 1m 44s
	 Train Loss 0.45 | Train Accuracy: 79.08%
	 Validation Loss 0.45 | Validation Accuracy: 78.68%
Epoch 4:
	 Total Time: 1m 44s
	 Train Loss 0.43 | Train Accuracy: 80.21%
	 Validation Loss 0.46 | Validation Accuracy: 78.22%
Epoch 5:
	 Total Time: 1m 44s
	 Train Loss 0.41 | Train Accuracy: 81.07%
	 Validation Loss 0.46 | Validation Accuracy: 78.45%
Epoch 6:
	 Total Time: 1m 44s
	 Train Loss 0.4 | Train Accuracy: 81.92%
	 Validation Loss 0.47 | Validation Accuracy: 77.51%
Epoch 7:
	 Total Time: 1m 44s
	 Train Loss 0.38 | Train Accuracy: 82.75%
	 Validation Loss 0.47 | Validation Accuracy: 77.89%
Epoch 8:
	 Total Time: 1m 44s
	 Train Loss 0.37 | Train Accuracy: 83.48%
	 Validation Loss 0.49 

<a id='section5'></a>
# **5. Prediction**

In [17]:
from utils.sentiment_util import predict

In [18]:
# Load the model with the best validation loss.
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written during a GPU run also loads on a CPU-only machine.
model.load_state_dict(torch.load('model-small.pt', map_location=device))

# Evaluate test loss and accuracy
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print("Test Loss: {} | Test Acc: {}%".format(round(test_loss, 2), round(test_acc*100, 2)))

Test Loss: 0.45 | Test Acc: 78.91%


In [None]:
# Single example prediction from the test set.
# One index is used for the tweet, the prediction and the true label so that
# all three printed values describe the same example.
EXAMPLE_IDX = 100
test_example = test_data[EXAMPLE_IDX]

print("Tweet: {}".format(TreebankWordDetokenizer().detokenize(test_example.text)))

print("Prediction: {}".format(round(predict(model, test_example.text), 2)))

print("True Label: {}".format(test_example.label))

In [None]:
# Example predictions for the first ten tweets of the test set

# A single detokenizer is reused to turn each token list back into a sentence
detokenizer = TreebankWordDetokenizer()

# One record per tweet: detokenized text, model prediction, and true label
prediction_rows = [
    {
        'Tweet': detokenizer.detokenize(example.text),
        'Prediction': predict(model, example.text),
        'True Label': example.label,
    }
    for example in (test_data[idx] for idx in range(10))
]

# Render the records as a dataframe
pd.DataFrame(prediction_rows)