In [1]:
import csv
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [2]:
# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

Using device: cuda
GeForce GTX 1080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


# Preprocessing

In [3]:
# Declare fields for tweets and labels
# include_lengths tells the RNN how long the actual sequences are
# spacy.load('en_core_web_sm')

spacy.load('en', disable=['ner', 'parser', 'tagger'])

def tokenize(s):
    return s.split(' ')

TEXT = data.Field(tokenize=tokenize, lower=True, include_lengths=True)
UPVOTE = data.LabelField(sequential=False, use_vocab=False, dtype=torch.int64)
CHANGE = data.LabelField(sequential=False, use_vocab=False, dtype=torch.float)
SENT_LABEL = data.LabelField(sequential=False, dtype=torch.int64)
CHAN_LABEL = data.LabelField(sequential=False, dtype=torch.int64)

# Map data to fields
fields = [('text', TEXT), ('upvote', None), ('change', None), ('sent_label', SENT_LABEL), ('chan_label', None)]

# Apply field definition to create torch dataset
train_data = torchtext.legacy.data.TabularDataset(
        path="train_data.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

valid_data = torchtext.legacy.data.TabularDataset(
        path="valid_data.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

test_data = torchtext.legacy.data.TabularDataset(
        path="test_data.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

print("Number of train data: {}".format(len(train_data)))
print("Number of valid data: {}".format(len(valid_data)))
print("Number of test data: {}".format(len(test_data)))

Number of train data: 145027
Number of valid data: 18128
Number of test data: 163155


 # Build vocab and iterator

In [4]:
MAX_VOCAB_SIZE = 10000000

# unk_init initializes words in the vocab using the Gaussian distribution
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

# build vocab for training set - convert words into integers
SENT_LABEL.build_vocab(train_data)

# Most frequent tokens
TEXT.vocab.freqs.most_common(10)

[('', 145027),
 ('gme', 23666),
 ('spy', 19902),
 ('call', 18742),
 ('aapl', 14643),
 ('nio', 13227),
 ('tesla', 11658),
 ('today', 10423),
 ('buy', 10418),
 ('go', 9792)]

In [5]:
BATCH_SIZE = 128

# sort_within_batch sorts all the tensors within a batch by their lengths
(train_iterator, valid_iterator, test_iterator) = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device = device,
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True)

# Model


In [6]:
from models.sentiment_model import SentimentLSTM

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
# 2 layers of biLSTM
N_LAYERS = 2
BIDIRECTIONAL = True
# Dropout probability
DROPOUT = 0.5
# Get pad token index from vocab
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Create an instance of LSTM class
model = SentimentLSTM(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

print(model)

[nltk_data] Downloading package wordnet to /home/rshuai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rshuai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/rshuai/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


SentimentLSTM(
  (embedding): Embedding(29210, 100, padding_idx=1)
  (encoder): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (predictor): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [7]:
# Copy the pre-trained word embeddings into the embedding layer
pretrained_embeddings = TEXT.vocab.vectors

# [vocab size, embedding dim]
print(pretrained_embeddings.shape)

torch.Size([29210, 100])


In [8]:
# Replace the initial weights of the embedding layer with the pre-trained embeddings
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.5816, -0.1002,  0.6567,  ..., -1.0982, -2.0876, -0.4422],
        [-0.0334,  1.9419, -0.1282,  ..., -1.0294, -0.5595, -0.6538],
        [ 0.1734, -0.5768,  0.8373,  ...,  0.6636,  0.2077,  0.0641],
        ...,
        [-0.4172, -0.4581,  1.2460,  ...,  0.1602, -0.0563,  1.3825],
        [ 0.4593,  0.0251,  0.0319,  ..., -0.0136, -0.5006,  0.5798],
        [-0.0310,  0.5543, -0.1443,  ..., -1.2080,  1.9179, -0.2627]])

In [9]:
# Initialize <unk> and <pad> both to all zeros - irrelevant for sentiment analysis
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Setting row in the embedding weights matrix to zero using the token index
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1734, -0.5768,  0.8373,  ...,  0.6636,  0.2077,  0.0641],
        ...,
        [-0.4172, -0.4581,  1.2460,  ...,  0.1602, -0.0563,  1.3825],
        [ 0.4593,  0.0251,  0.0319,  ..., -0.0136, -0.5006,  0.5798],
        [-0.0310,  0.5543, -0.1443,  ..., -1.2080,  1.9179, -0.2627]])


# Training

In [10]:
# Adam optimizer used to update the weights
# optimizer = optim.Adam(model.parameters(), lr=2e-3)

# Loss function: binary cross entropy with logits
# It restricts the predictions to a number between 0 and 1 using the logit function
# then use the bound scarlar to calculate the loss using binary cross entropy
criterion = nn.BCEWithLogitsLoss()

# Use GPU
model = model.to(device)
# criterion = criterion.to(device) 

In [11]:
def batch_accuracy(predictions, label):
    """
    Returns accuracy per batch.

    predictions - float
    label - 0 or 1
    """

    # Round predictions to the closest integer using the sigmoid function
    preds = torch.round(torch.sigmoid(predictions))
    # If prediction is equal to label
    correct = (preds == label).float()
    # Average correct predictions
    accuracy = correct.sum() / len(correct)

    return accuracy

def timer(start_time, end_time):
    """
    Returns the minutes and seconds.
    """

    time = end_time - start_time
    mins = int(time / 60)
    secs = int(time - (mins * 60))

    return mins, secs

In [12]:
def train(model, iterator, optimizer, criterion):
    """
    Function to evaluate training loss and accuracy.

    iterator - train iterator
    """
    
    # Cumulated Training loss
    training_loss = 0.0
    # Cumulated Training accuracy
    training_acc = 0.0
    
    # Set model to training mode
    model.train()
    
    # For each batch in the training iterator
    for batch in iterator:
        
        # 1. Zero the gradients
        optimizer.zero_grad()
        
        # batch.text is a tuple (tensor, len of seq)
        text, text_lengths = batch.text
        
        # 2. Compute the predictions
        predictions = model(text, text_lengths).squeeze(1)
        
        # 3. Compute loss
        loss = criterion(predictions, batch.sent_label.float())
        
        # Compute accuracy
        accuracy = batch_accuracy(predictions, batch.sent_label.float())
        
        # 4. Use loss to compute gradients
        loss.backward()
        
        # 5. Use optimizer to take gradient step
        optimizer.step()
        
        training_loss += loss.item()
        training_acc += accuracy.item()
        
    # Return the loss and accuracy, averaged across each epoch
    # len of iterator = num of batches in the iterator
    return training_loss / len(iterator), training_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Function to evaluate the loss and accuracy of validation and test sets.

    iterator - validation or test iterator
    """
    
    # Cumulated Training loss
    eval_loss = 0.0
    # Cumulated Training accuracy
    eval_acc = 0
    
    # Set model to evaluation mode
    model.eval()
    
    # Don't calculate the gradients
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.sent_label.float())
            
            accuracy = batch_accuracy(predictions, batch.sent_label.float())

            eval_loss += loss.item()
            eval_acc += accuracy.item()
        
    return eval_loss / len(iterator), eval_acc / len(iterator)        

In [13]:
# Number of epochs
NUM_EPOCHS = 10

# Lowest validation lost
best_valid_loss = float('inf')

learning_rates = [1e-3, 5e-3, 1e-2, 5e-2]

for lr in learning_rates:
    print(f'learning rate is {lr}')
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(NUM_EPOCHS):

        start_time = time.time()

        # Evaluate training loss and accuracy
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        # Evaluate validation loss and accuracy
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        end_time = time.time()

        mins, secs = timer(start_time, end_time)

        # At each epoch, if the validation loss is the best
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # Save the parameters of the model
            torch.save(model.state_dict(), 'trained_sentiment.pt')

        print("Epoch {}:".format(epoch+1))
        print("\t Total Time: {}m {}s".format(mins, secs))
        print("\t Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 2), round(train_acc*100, 2)))
        print("\t Validation Loss {} | Validation Accuracy: {}%".format(round(valid_loss, 2), round(valid_acc*100, 2)))

learning rate is 0.001
Epoch 1:
	 Total Time: 0m 13s
	 Train Loss 0.31 | Train Accuracy: 86.72%
	 Validation Loss 0.19 | Validation Accuracy: 91.22%
Epoch 2:
	 Total Time: 0m 13s
	 Train Loss 0.15 | Train Accuracy: 94.15%
	 Validation Loss 0.12 | Validation Accuracy: 95.52%
Epoch 3:
	 Total Time: 0m 13s
	 Train Loss 0.11 | Train Accuracy: 95.93%
	 Validation Loss 0.07 | Validation Accuracy: 97.46%
Epoch 4:
	 Total Time: 0m 13s
	 Train Loss 0.08 | Train Accuracy: 97.01%
	 Validation Loss 0.06 | Validation Accuracy: 97.97%
Epoch 5:
	 Total Time: 0m 12s
	 Train Loss 0.06 | Train Accuracy: 97.63%
	 Validation Loss 0.06 | Validation Accuracy: 97.8%
Epoch 6:
	 Total Time: 0m 12s
	 Train Loss 0.05 | Train Accuracy: 98.04%
	 Validation Loss 0.05 | Validation Accuracy: 98.27%
Epoch 7:
	 Total Time: 0m 13s
	 Train Loss 0.05 | Train Accuracy: 98.28%
	 Validation Loss 0.08 | Validation Accuracy: 97.3%
Epoch 8:
	 Total Time: 0m 13s
	 Train Loss 0.04 | Train Accuracy: 98.52%
	 Validation Loss 0.05 |

# Prediction

In [14]:
# Load the model with the best validation loss
model.load_state_dict(torch.load('trained_sentiment.pt'))

# Evaluate test loss and accuracy
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print("Test Loss: {} | Test Acc: {}%".format(round(test_loss, 2), round(test_acc*100, 2)))

Test Loss: 0.02 | Test Acc: 99.5%
