In [0]:
# !curl -0 https://s3-us-west-1.amazonaws.com/pytorch-course-datasets/sentiment-analysis-on-movie-reviews.zip > reviews.zip
# !unzip reviews.zip
# !mkdir models

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data

import random
import time
import os

In [0]:
def tlog(msg):
    """Print ``msg`` on one line, prefixed by the current ``time.asctime()`` stamp."""
    timestamp = time.asctime()
    line = '{}   {}'.format(timestamp, msg)
    print(line)

def count_model_params(model):
    """Return the total element count of ``model``'s parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def count_correct(guesses, labels):
    """Count positions where ``guesses`` equals ``labels``.

    The result is the sum of the element-wise equality mask cast to float,
    i.e. a 0-dim float tensor rather than a Python number.
    """
    matches = guesses == labels
    return torch.sum(matches.float())

def save_model(model, epoch):
    """Write ``model``'s ``state_dict`` to ``models/`` and return the file name.

    The file is named ``pytorch-sentiment-e<epoch>-<unix seconds>.pt``.

    Args:
        model: module whose ``state_dict()`` is serialized.
        epoch: value embedded in the file name after ``-e``.

    Returns:
        The bare file name (without the ``models`` directory prefix).
    """
    savefile = "{}-e{}-{}.pt".format('pytorch-sentiment', epoch, int(time.time()))
    tlog('Saving model {}'.format(savefile))
    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists instead of relying on a manual `mkdir models`.
    os.makedirs('models', exist_ok=True)
    path = os.path.join('models', savefile)
    # recommended way from https://pytorch.org/docs/stable/notes/serialization.html
    torch.save(model.state_dict(), path)
    return savefile

In [0]:
# Global device handle shared by the data iterators and the model:
# CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('GPU ready to go!')
else:
    print('*** GPU not available - running on CPU. ***')

In [0]:
# dataset constants
BATCH_SIZE = 64                  # examples per mini-batch
VOCAB_SIZE = 15000               # upper bound on vocabulary entries (passed as max_size to build_vocab)
VOCAB_VECTORS = "glove.6B.100d"  # pretrained Stanford NLP GloVe (global vectors) word representations



def get_data():
    """Load ``train.tsv`` and return batch iterators plus vocabulary sizes.

    Only the phrase and label columns get a field; the two id columns are
    mapped to ``None``. The examples are split into a training and an
    evaluation part (torchtext's default ``split()`` arguments), both
    vocabularies are built from the training part only, and the two parts
    are wrapped in shuffled ``BucketIterator`` objects placed on ``device``.

    Returns:
        ``(train_iter, eval_iter, vocab_size, output_size)`` where
        ``vocab_size`` is the length of the phrase vocabulary and
        ``output_size`` is the length of the label vocabulary.
    """
    tlog('Preparing data...')

    # include_lengths=True makes batch.phrases a (tokens, lengths) pair.
    text_field = data.Field(tokenize='spacy', include_lengths=True)
    label_field = data.LabelField(sequential=False, dtype=torch.int64)

    # One (name, field) entry per TSV column, in file order.
    columns = [
        ('SKIP_phrase_id', None),
        ('SKIP_sentence_id', None),
        ('phrases', text_field),
        ('labels', label_field),
    ]
    # The file starts with a header row, hence skip_header.
    full_set = data.TabularDataset('train.tsv', 'TSV', columns, skip_header=True)

    train_set, eval_set = full_set.split()

    # Vocabularies come from the training split only.
    text_field.build_vocab(train_set, max_size=VOCAB_SIZE, vectors=VOCAB_VECTORS)
    label_field.build_vocab(train_set)

    iterators = data.BucketIterator.splits(
        (train_set, eval_set),
        batch_size=BATCH_SIZE,
        device=device,
        sort=False,
        shuffle=True,
    )
    train_iter, eval_iter = iterators

    tlog('Data prepared')
    return train_iter, eval_iter, len(text_field.vocab), len(label_field.vocab)

In [0]:
# model constants
NUM_LAYERS = 2        # stacked LSTM layers
HIDDEN_SIZE = 100     # width of the LSTM hidden state
EMBEDDING_SIZE = 100  # must match dimensions in vocab vectors above
OUTPUT_SIZE = 5       # 0 to 4



class SentimentAnalyzer(nn.Module):
    """LSTM classifier on top of frozen, pretrained word embeddings.

    Token ids are embedded, fed through a ``NUM_LAYERS``-deep LSTM, and the
    final hidden state of the last layer is projected to ``output_size``
    logits by a linear layer.
    """

    def __init__(self, vocab_size, embedding_size, pretrained_embedding, hidden_size, output_size):
        """Build the embedding, LSTM and output layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_size: width of one embedding vector (and LSTM input size).
            pretrained_embedding: tensor copied into the embedding weights.
            hidden_size: width of the LSTM hidden state.
            output_size: number of logits produced per example.
        """
        super(SentimentAnalyzer, self).__init__()
        self.hidden_size = hidden_size

        # Load the pretrained vectors and keep them fixed during training.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.embedding.weight.data.copy_(pretrained_embedding)
        self.embedding.weight.requires_grad = False

        self.rnn = nn.LSTM(
            input_size=embedding_size,
            num_layers=NUM_LAYERS,
            hidden_size=hidden_size,
            dropout=0.5
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, phrases, hidden=None):
        """Classify a batch of token-id sequences.

        Args:
            phrases: integer tensor laid out ``(seq_len, batch)`` (the LSTM is
                built without ``batch_first``). Every position, including any
                padding, is fed through the LSTM; sequence lengths are not
                used here.
            hidden: optional ``(h_0, c_0)`` tuple. When ``None`` the LSTM
                starts from zeros sized for the actual batch, on the input's
                device, so no global batch size or device is needed.

        Returns:
            ``(logits, (h_n, c_n))`` with ``logits`` of shape
            ``(batch, output_size)``. The batch axis is kept even for a
            single-example batch so that ``argmax(..., 1)`` and
            ``CrossEntropyLoss`` keep working.
        """
        x = self.embedding(phrases)
        # nn.LSTM treats a missing initial state as all zeros.
        x, hidden = self.rnn(x, hidden)
        # hidden is (h_n, c_n) for an LSTM; [0] selects h_n, [-1] the last layer.
        logits = self.fc(hidden[0][-1])
        return logits, hidden

In [0]:
def get_model(vocab_size, output_size, vectors):
    """Create a ``SentimentAnalyzer``, log its trainable-parameter count and layout, and return it.

    ``vectors`` is handed to the model as its pretrained embedding; the
    embedding and hidden widths come from ``EMBEDDING_SIZE`` and ``HIDDEN_SIZE``.
    """
    tlog('Creating model...')
    model = SentimentAnalyzer(
        vocab_size=vocab_size,
        embedding_size=EMBEDDING_SIZE,
        pretrained_embedding=vectors,
        hidden_size=HIDDEN_SIZE,
        output_size=output_size,
    )
    n_trainable = count_model_params(model)
    tlog('The model has {} trainable parameters'.format(n_trainable))
    tlog(model)
    return model

In [0]:
def train(model, iterator, loss_fn, optimizer): # one epoch
    """Run one training epoch and return ``(mean_loss, accuracy)``.

    Args:
        model: module called as ``model(phrases, hidden)`` that returns
            ``(guesses, hidden)``.
        iterator: iterable of batches exposing ``batch.phrases`` (a
            ``(tokens, lengths)`` pair) and ``batch.labels``.
        loss_fn: callable ``loss_fn(guesses, labels)`` returning a scalar loss.
        optimizer: optimizer stepping the model's trainable parameters.

    Returns:
        Two Python floats: the mean per-batch loss and the fraction of
        correctly classified examples. Both are computed over the batches
        that were actually processed; ``(0.0, 0.0)`` if there were none.
    """
    total_loss = 0.
    total_correct = 0.
    batches_used = 0
    model.train() # makes sure that training-only fns, like dropout, are active

    for batch in iterator:
        phrases, _ = batch.phrases

        # Skip batches whose width differs from BATCH_SIZE (typically the
        # final, smaller one). They are left out of the averages below too,
        # so skipped batches no longer drag the reported metrics down.
        if phrases.shape[1] != BATCH_SIZE:
            continue

        optimizer.zero_grad()
        # Batches hold unrelated, shuffled phrases, so each one starts from a
        # fresh recurrent state instead of the previous batch's final state.
        guesses, _ = model(phrases, None)
        loss = loss_fn(guesses, batch.labels)
        loss.backward()
        optimizer.step()

        # measure (as plain floats, so no tensors are accumulated)
        total_loss += loss.item()
        total_correct += float(count_correct(torch.argmax(guesses, 1), batch.labels))
        batches_used += 1

    if batches_used == 0:
        return 0., 0.
    return total_loss / batches_used, total_correct / (batches_used * BATCH_SIZE)

In [0]:
def evaluate(model, iterator, loss_fn):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: module called as ``model(phrases, hidden)`` that returns
            ``(guesses, hidden)``.
        iterator: iterable of batches exposing ``batch.phrases`` (a
            ``(tokens, lengths)`` pair) and ``batch.labels``.
        loss_fn: callable ``loss_fn(guesses, labels)`` returning a scalar loss.

    Returns:
        Two Python floats: the mean per-batch loss and the fraction of
        correctly classified examples. Both are computed over the batches
        that were actually processed; ``(0.0, 0.0)`` if there were none.
    """
    total_loss = 0.
    total_correct = 0.
    batches_used = 0
    model.eval() # makes sure that training-only fns, like dropout, are inactive

    with torch.no_grad(): # not training
        for batch in iterator:
            phrases, _ = batch.phrases

            # Skip batches whose width differs from BATCH_SIZE (typically the
            # final, smaller one). They are left out of the averages below
            # too, so skipped batches no longer drag the metrics down.
            if phrases.shape[1] != BATCH_SIZE:
                continue

            # Each batch is scored from a fresh recurrent state, so a
            # prediction does not depend on which batch came before it.
            guesses, _ = model(phrases, None)
            loss = loss_fn(guesses, batch.labels)

            # measure (as plain floats, so no tensors are accumulated)
            total_loss += loss.item()
            total_correct += float(count_correct(torch.argmax(guesses, 1), batch.labels))
            batches_used += 1

    if batches_used == 0:
        return 0., 0.
    return total_loss / batches_used, total_correct / (batches_used * BATCH_SIZE)

In [0]:
# training loop constants
LR = 2e-3    # learning rate handed to the Adam optimizer
EPOCHS = 50  # number of train + validate passes over the data


def learn(model, train_iter, eval_iter):
    """Train ``model`` for ``EPOCHS`` epochs, validating after each one.

    The model is moved to ``device`` and optimized with Adam (learning rate
    ``LR``) over its trainable parameters using cross-entropy loss. Whenever
    the validation accuracy exceeds the best seen so far, the model is saved
    through ``save_model``.

    Returns:
        ``(eval_losses, eval_accs)``: two lists with one entry per epoch,
        holding the validation loss and validation accuracy.
    """
    history_loss, history_acc = [], []
    top_acc = 0

    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss()

    # Frozen parameters (requires_grad False) are not handed to the optimizer.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=LR)

    for epoch_idx in range(EPOCHS):
        epoch_no = epoch_idx + 1
        tlog('EPOCH {} of {}'.format(epoch_no, EPOCHS))

        train_loss, train_acc = train(model, train_iter, criterion, optimizer)
        tlog('  Training loss {}   acc {}'.format(train_loss, train_acc))

        eval_loss, eval_acc = evaluate(model, eval_iter, criterion)
        tlog('  Validation loss {}   acc {}'.format(eval_loss, eval_acc))
        history_loss.append(eval_loss)
        history_acc.append(eval_acc)

        # Checkpoint only on a strict improvement of validation accuracy.
        if not (eval_acc > top_acc):
            continue
        tlog('  *** New accuracy peak, saving model')
        top_acc = eval_acc
        save_model(model, epoch_no)

    tlog('DONE')
    return history_loss, history_acc

In [0]:
# Build the iterators, then pull the pretrained vectors off the phrase vocabulary.
t_iter, e_iter, vocab_size, output_size = get_data()
phrases_field = t_iter.dataset.fields['phrases']
embedding_vectors = phrases_field.vocab.vectors

In [0]:
# Build the classifier and run the training loop, keeping the per-epoch validation history.
sa = get_model(vocab_size=vocab_size, output_size=output_size, vectors=embedding_vectors)
losses, accs = learn(model=sa, train_iter=t_iter, eval_iter=e_iter)

In [0]:
# One line per epoch: validation loss followed by validation accuracy.
for i, epoch_loss in enumerate(losses):
    epoch_acc = accs[i]
    print('{} {}'.format(epoch_loss, epoch_acc))