In [1]:
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data

from torchtext import datasets

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

In [2]:
SEED = 1234

# Seed every RNG this notebook relies on, not just torch: Python's `random`
# drives the train/validation split further down, and the cuDNN flag asks
# for deterministic GPU kernels so repeated runs produce the same numbers.
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT describes how review text is processed (tokenised with spaCy);
# LABEL describes the sentiment label, stored as a float tensor.
# NOTE(review): some torchtext releases spell this argument `dtype=torch.float`
# instead of `tensor_type=...` -- confirm against the installed version.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [None]:
%%time
# Load the IMDB train/test splits, using TEXT for the review text and LABEL
# for the sentiment label. The %%time cell magic reports how long this takes.
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Show which Field object is attached to each attribute of the dataset.
print(f'train.fields: {train.fields}')

In [None]:
# Peek at the attributes stored on the first training example.
print(f'vars(train[0]): {vars(train[0])}')

In [None]:
# Carve a validation set out of the training data, reproducibly.
# random.seed() returns None, so the previous `random_state=random.seed(SEED)`
# only worked through its side effect. Seed first, then pass the RNG state
# explicitly so the intent is visible in the call.
# NOTE: this rebinds `train` to the smaller split; re-running this cell
# without reloading the data splits the already-reduced set again.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [None]:
# Number of examples in each split after carving out the validation set.
print(f'len(train): {len(train)}')
print(f'len(test): {len(test)}')
print(f'len(valid): {len(valid)}')

In [None]:
# Build both vocabularies from the training split only, so nothing from the
# validation or test data leaks into the token/label mappings.
MAX_VOCAB_SIZE = 25000  # cap passed to TEXT.build_vocab as max_size

TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train)
print(f'len(TEXT.vocab): {len(TEXT.vocab)}')
print(f'len(LABEL.vocab): {len(LABEL.vocab)}')

In [None]:
# Show the 20 highest-count entries of the vocabulary's frequency table
# (TEXT.vocab.freqs), which was built from the training split.
print(TEXT.vocab.freqs.most_common(20))

In [None]:
# The vocabulary can also be inspected directly through its two lookup
# attributes: itos (int to string) and stoi (string to int).
# First ten entries of the text vocabulary, then the label -> index mapping.
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

In [None]:
"""
BucketIterator first sorts the examples using the sort_key (here, the length of each sentence)
and then partitions them into buckets. When the iterator is called it returns a batch of examples from the same bucket,
so every example in a batch has a similar length, which minimizes the amount of padding.
"""

# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 64

# One iterator per split, returned in the same order as the dataset tuple.
# sort_key orders examples by the length of their `text` attribute.
# NOTE(review): repeat=False presumably makes each iterator stop after one
# pass over its data -- confirm against the installed torchtext version.
# NOTE(review): no `device` argument is passed here, while the model is later
# moved to `device`; confirm batches end up on the same device as the model.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

In [None]:
class RNN(nn.Module):
    """Recurrent sentiment classifier: embedding -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of values produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Attribute names and creation order are kept as-is: they determine
        # the state_dict keys and the order in which parameters are initialised.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map token indices x = [sent len, batch size] to [batch size, output dim]."""
        # token_vectors = [sent len, batch size, emb dim]
        token_vectors = self.embedding(x)

        # step_outputs = [sent len, batch size, hid dim]
        # final_hidden = [1, batch size, hid dim]
        step_outputs, final_hidden = self.rnn(token_vectors)

        # Drop the leading dimension -> [batch size, hid dim]
        last_state = final_hidden.squeeze(0)

        # Sanity check: the output at the last time step is the final hidden state.
        assert torch.equal(step_outputs[-1], last_state)

        return self.fc(last_state)

In [None]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)  # vocabulary size: one embedding row per token
EMBEDDING_DIM = 100          # length of each word-embedding vector
HIDDEN_DIM = 256             # length of the RNN hidden state
OUTPUT_DIM = 1               # one value per review (a single sentiment logit)

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [None]:
# Parameter updates use plain stochastic gradient descent.
LEARNING_RATE = 1e-3  # step size for each SGD update
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Loss: binary cross entropy computed directly on the model's raw logits
# (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model and the loss module to the chosen device.
model, criterion = model.to(device), criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw model outputs (logits), one per example.
        y: target labels (0. or 1.), same shape as `preds`.

    Returns:
        A 0-dim float tensor holding the fraction of correct predictions.
    """

    # Squash logits to (0, 1) and round to the nearest class (0. or 1.).
    # torch.sigmoid is used because F.sigmoid is deprecated.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean batch loss, mean batch accuracy).

    Each batch from `iterator` must expose `.text` and `.label`. Both returned
    values are averaged over the number of batches, len(iterator).

    NOTE: defining this function rebinds the notebook-level name `train`,
    which earlier cells use for the training dataset.
    """
    # Training mode: layers such as dropout / batch norm use their training behaviour.
    model.train()

    loss_sum = 0
    acc_sum = 0

    for batch in iterator:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        # Model output is [batch size, 1]; drop the trailing dim -> [batch size].
        logits = model(batch.text).squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        # Backpropagate, then apply the optimizer's update rule.
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    num_batches = len(iterator)
    return loss_sum / num_batches, acc_sum / num_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating it.

    Returns (mean batch loss, mean batch accuracy), each averaged over the
    number of batches, len(iterator). Each batch must expose `.text` and `.label`.
    """
    # Evaluation mode: layers such as dropout / batch norm use their inference behaviour.
    model.eval()

    loss_sum = 0
    acc_sum = 0

    # No gradients are needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        for batch in iterator:
            # [batch size, 1] -> [batch size]
            logits = model(batch.text).squeeze(1)

            loss_sum += criterion(logits, batch.label).item()
            acc_sum += binary_accuracy(logits, batch.label).item()

    num_batches = len(iterator)
    return loss_sum / num_batches, acc_sum / num_batches

In [None]:
# Train for several epochs; one epoch is a full pass over every example in a split.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # One pass of parameter updates, then a gradient-free pass over validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Adjacent f-strings are concatenated, so this prints a single line.
    print(
        f'Epoch: {epoch+1:02}, '
        f'Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, '
        f'Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%'
    )

In [None]:
# Final check: score the trained model on the test iterator, which is not
# used in the training loop above.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')