In [1]:
import random
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

In [2]:
# Single seed reused for every source of randomness in this notebook.
SEED = 1234

torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)

# TEXT: the review text field, tokenized with spaCy.
# LABEL: the label field, stored as a FloatTensor because the loss used later
# (nn.BCEWithLogitsLoss) is applied directly to batch.label.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [3]:
%%time
# Load the IMDB train/test splits, parsed through the TEXT and LABEL fields.
# This is the slow step of the notebook (about five minutes in the recorded run).
train, test = datasets.IMDB.splits(TEXT, LABEL)

Wall time: 4min 58s


In [4]:
# Carve a validation set out of the training data, reproducibly.
# random.seed() returns None, so passing its return value as random_state only
# worked through the seeding side effect. Seed explicitly, then hand over the
# resulting generator state, which is what split() documents for random_state.
# NOTE: this cell rebinds `train`; re-running it without reloading the data
# splits the already-reduced training set again.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [None]:
# FIXME: This doesn't work. For info on fixing, see https://github.com/pytorch/text/issues/201
"""
This time we'll initialize the word embeddings with pre-trained word vectors, from Google News word2vec vectors.

GloVe and other standard embeddings are available via torchtext, so we can just specify it as a string, but 
to demonstrate a more difficult use case, we'll load the file we've downloaded as a vocab.Vectors object.
"""
import spacy
import gensim
nlp = spacy.blank('en')

google_news_filepath = os.path.join('input', 'word2vec', 'GoogleNews-vectors-negative300.bin.gz')

# Load google news vecs in gensim
# NOTE: `model` is rebound to the RNN classifier in a later cell, so the gensim
# vectors are only reachable through `nlp.vocab.vectors` after that point.
model = gensim.models.KeyedVectors.load_word2vec_format(google_news_filepath, binary=True)

# index2word is indexed by embedding row, so copying it whole gives the keys in
# the same order as the rows of the embedding matrix, without hard-coding the
# vocabulary size (previously a literal 3000000).
keys = list(model.index2word)

nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)

In [5]:
# Intended call using the custom word2vec vectors; disabled because the loading
# cell above is marked as not working:
# TEXT.build_vocab(train, max_size=25000, vectors=nlp.vocab.vectors)

# Fallback: torchtext only natively resolves its own set of pretrained-vector
# aliases, so the GloVe embeddings are fetched through the "glove.6B.100d" alias.
# max_size caps the vocabulary built from the training split; the embedding
# matrix printed later has 25002 rows.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

In [6]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# One iterator per split, returned in the same order as the datasets tuple.
# sort_key is the length of each example's text; presumably BucketIterator uses
# it to batch reviews of similar length together (confirm against torchtext docs).
# repeat=False is assumed to stop each iterator after one pass over its data,
# which the per-epoch loops below rely on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

For detailed information on how the RNN architecture is different in this notebook compared to pytorch-simple, please refer to the [original notebook by bentrevett](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb). 

In summary, we will use an LSTM to extend the persistence of the hidden state, and build a multi-layer RNN in which the inputs of each layer are the outputs of the previous layer. 

Each layer will also be a bi-directional RNN, which returns the hidden state as the concatenation of the hidden state from processing the corpus forward, and the hidden state from processing the corpus backward.

We also regularize the model with dropout, which randomly zeroes a fraction of the activations on each forward pass during training.

In [7]:
class RNN(nn.Module):
    """Multi-layer, optionally bidirectional LSTM classifier over token indices.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether each LSTM layer runs in both directions.
        dropout: dropout probability, used between LSTM layers, on the
            embeddings and on the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        # The classifier sees one final hidden state per direction, so its input
        # width must follow `bidirectional` instead of always being doubled.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the linear-layer output for a batch of token-index sequences.

        Args:
            x: tensor of shape [sentence_len, batch size].

        Returns:
            Tensor of shape [batch size, output_dim]. The batch axis is kept
            even for a batch of one, so callers can always `.squeeze(1)`.
        """

        #x = [sentence_len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sentence_len, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded)

        #output = [sentence_len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            # Unidirectional: only the top layer's state. hidden[-2] would be
            # the layer below, not a second direction.
            final_hidden = hidden[-1,:,:]

        #final_hidden = [batch size, hid. dim * num directions]

        # No squeeze(0) here: it would drop the batch axis when batch size is 1.
        return self.fc(self.dropout(final_hidden))

In [8]:
# Hyperparameters, passed positionally to RNN(...) in the order of its signature.
INPUT_DIM = len(TEXT.vocab)  # number of rows in the embedding table
EMBEDDING_DIM = 100  # must equal the width of the "glove.6B.100d" vectors copied into the embedding layer
HIDDEN_DIM = 256  # LSTM hidden size
OUTPUT_DIM = 1  # one output value per review, fed to BCEWithLogitsLoss
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [9]:
# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: the shape should be [len(TEXT.vocab), EMBEDDING_DIM] so that it
# can be copied into model.embedding.weight.
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [10]:
# Replace the initial weights of the embedding layer with the pre-trained embeddings.
# copy_ writes in place and returns the weight tensor, which is why this cell
# displays the tensor as its output.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to its device BEFORE building the optimizer: the torch.optim
# documentation asks for this order so the optimizer is constructed from the
# parameters that will actually be trained.
model = model.to(device)

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [12]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw (pre-sigmoid) model outputs, one per example.
        y: target labels (0. or 1.) with the same shape as `preds`.

    Returns:
        A scalar tensor holding the fraction of examples predicted correctly.
    """

    # Squash logits to probabilities and round to the nearest class (0 or 1).
    # torch.sigmoid replaces the deprecated F.sigmoid; the values are identical.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [13]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    # Training mode: dropout is active.
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        # Start each step with clean gradients.
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [14]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    # Evaluation mode: dropout is disabled.
    model.eval()

    running_loss, running_acc = 0, 0

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            running_loss += criterion(logits, targets).item()
            running_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
N_EPOCHS = 5
modeldir = os.path.join('output', 'models')
trainset = 'IMDB'
modeltype = 'LSTM'

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first epoch finishes.
statedir = os.path.join(modeldir, 'full_state')
os.makedirs(statedir, exist_ok=True)

for epoch in range(N_EPOCHS):

    # Checkpoint file names for the previous and the current epoch.
    prev_netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch - 1) + '-fullstate.pth'
    this_netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'

    # From the second epoch on, restore the model and optimizer from the
    # checkpoint written at the end of the previous epoch.
    if epoch >= 1:
        state = torch.load(os.path.join(statedir, prev_netstatename))
        model.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Full training state: enough to restore both the weights and the optimizer.
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, os.path.join(statedir, this_netstatename))

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# Final check on the held-out test split, using the model as left by the training loop.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')