In [1]:
import os
import random
import collections
import operator
import itertools
import pickle
import tqdm
import pprint

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import simpleclock
import sklearn.metrics
import sklearn.decomposition

In [3]:
import torch
import torchtext

In [4]:
DEVICE = torch.device("cuda")

## data

In [5]:
TEXT = torchtext.data.Field(tokenize = "spacy",
                            include_lengths=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)

In [6]:
data_train_, data_test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

In [7]:
data_train, data_valid = data_train_.split()

In [8]:
print(f"""training data: {len(data_train)} examples.
validation data: {len(data_valid)} examples.
test data: {len(data_test)} examples.""")

training data: 17500 examples.
validation data: 7500 examples.
test data: 25000 examples.


In [9]:
VOCAB_MAX_SIZE = 50000
TEXT.build_vocab(data_train, max_size=VOCAB_MAX_SIZE, vectors="glove.6B.100d")

In [10]:
LABEL.build_vocab(data_train)
LABEL.vocab.stoi

defaultdict(None, {'neg': 0, 'pos': 1})

In [11]:
BATCH_SIZE = 256

In [12]:
iter_train, iter_valid, iter_test = \
    torchtext.data.BucketIterator.splits((data_train, data_valid, data_test),
                                  batch_size=BATCH_SIZE,
                                  device=DEVICE,
                                  sort_within_batch=True,
                                  sort_key=lambda ex: len(ex.text),
                                  sort=False)

## model

In [13]:
class RNN(torch.nn.Module):
    def __init__(self, n_vocab, embedding_dim, hidden_dim, output_dim, dropout, bidirectional,
                 n_layers, pad_idx):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bidirectional = bidirectional
        num_dir = 2 if bidirectional else 1
        self.embedding = torch.nn.Embedding(n_vocab, embedding_dim, padding_idx=pad_idx)
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim,
                                 bidirectional=bidirectional,
                                 num_layers=n_layers,
                                 dropout=dropout)
        self.fc = torch.nn.Linear(hidden_dim * num_dir, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, input_lengths):
        input, lengths = input_lengths
        torch.nn.utils.rnn.pack_padded_sequence(input, lengths)
        embedded = self.embedding(input)  # ((sent_len, batch), emb_dim)
        packed_output, (hidden, cell) = self.rnn(embedded)  # hidden: (num_layers * num_directions,
                                                            #          batch, hidden_size * num_directions)
        hidden = (torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1)
                  if self.bidirectional else hidden).squeeze(0)  # (batch, hidden_size * num_directions)
        return self.sigmoid(self.fc(self.dropout(hidden)))  # (batch, 1)

In [14]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
N_VOCAB = len(TEXT.vocab)

In [15]:
DEFAULT_PARAMS = {
    "n_vocab": N_VOCAB,
    "embedding_dim": 100,
    "hidden_dim": 128,
    "output_dim": 1,
    "dropout": 0.5,
    "bidirectional": True,
    "n_layers": 2,
    "pad_idx": PAD_IDX,
}

def default_model(**kwargs):
    _d = {}
    _d.update(DEFAULT_PARAMS)
    _d.update(kwargs)
    return RNN(**_d)

In [16]:
def pseudo_init(model, criterion, device=DEVICE, learn_embedding_param=True):
    model.embedding.weight.data.copy_(TEXT.vocab.vectors)
    model.embedding.weight.data[UNK_IDX] = torch.zeros(model.embedding_dim)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(model.embedding_dim)
    
    for name, param in model.named_parameters():
        if name == "embedding.weight":
            param.requires_grad = learn_embedding_param
    
    print("The model has {:,} trainable parameters"
         .format(sum(p.numel() for p in model.parameters() if p.requires_grad)))
    
    model = model.to(device)
    criterion = criterion.to(device)
    
    return model, criterion

## train

In [17]:
def output_to_pred(output):
    return torch.round(output)

In [18]:
def accuracy(preds, y):
    correct = (preds == y).float()  # convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [19]:
def accuracy_from_output(output, target):
    return accuracy(output_to_pred(output), target)

## action

In [20]:
from nntraining.pytorch.supervised import AccLossMeasurer, do_training, evaluate, is_max_of
from nntraining.pytorch.text import Predictor

In [21]:
model = default_model()

In [22]:
model, criterion = pseudo_init(model, torch.nn.MSELoss())

The model has 5,631,241 trainable parameters


In [23]:
optimizer = torch.optim.Adam(model.parameters())

In [24]:
training_info = do_training(model,
                            "imdbtest", 
                            iter_train, 
                            iter_valid, 
                            optimizer, 
                            criterion,
                            n_epochs=20,
                            get_input=operator.attrgetter("text"),
                            get_target=operator.attrgetter("label"),
                            fun_is_best=is_max_of("acc"),
                            measurer_getter=AccLossMeasurer.getter(accuracy_from_output))

Not saving best model.
Starting training: 20 epochs.
Epoch: 1.. T, V acc: 58.3%, 59.2%. Took 12.45s. (+)
Epoch: 2.. T, V acc: 67.2%, 52.6%. Took 12.34s.
Epoch: 3.. T, V acc: 77.3%, 71.1%. Took 12.33s. (+)
Epoch: 4.. T, V acc: 82.8%, 68.4%. Took 12.35s.
Epoch: 5.. T, V acc: 89.7%, 81.6%. Took 12.41s. (+)
Epoch: 6.. T, V acc: 79.1%, 47.4%. Took 12.51s.
Epoch: 7.. T, V acc: 65.3%, 52.6%. Took 12.68s.
Epoch: 8.. T, V acc: 76.2%, 86.8%. Took 12.39s. (+)
Epoch: 9.. T, V acc: 90.6%, 81.6%. Took 12.36s.
Epoch: 10. T, V acc: 94.6%, 85.5%. Took 12.34s.
Epoch: 11. T, V acc: 85.8%, 81.6%. Took 12.29s.
Epoch: 12. T, V acc: 94.9%, 89.5%. Took 12.35s. (+)
Epoch: 13. T, V acc: 94.7%, 52.6%. Took 12.32s.
Epoch: 14. T, V acc: 91.5%, 75.0%. Took 12.25s.
Epoch: 15. T, V acc: 96.9%, 77.6%. Took 12.33s.
Epoch: 16. T, V acc: 98.0%, 80.3%. Took 12.40s.
Epoch: 17. T, V acc: 98.6%, 78.9%. Took 12.38s.
Epoch: 18. T, V acc: 98.8%, 76.3%. Took 12.36s.
Epoch: 19. T, V acc: 99.0%, 77.6%. Took 12.40s.
Epoch: 20. T, V

In [25]:
import spacy
nlp = spacy.load("en")

In [26]:
predictor = Predictor(model, TEXT.vocab.stoi, nlp.tokenizer)

In [27]:
predictor("This film is terrible.")

0.0021064248867332935

In [28]:
predictor("This is a great movie.")

0.99488765001297

In [29]:
evaluate(model, iter_test, criterion, operator.attrgetter("text"), operator.attrgetter("label"),
         AccLossMeasurer(accuracy_from_output))

{'acc': 0.7321428656578064, 'loss': 0.209058940410614}