In [1]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision

!pip install tqdm

Collecting tqdm
  Downloading https://files.pythonhosted.org/packages/78/bc/de067ab2d700b91717dc5459d86a1877e2df31abfb90ab01a5a5a5ce30b4/tqdm-4.23.0-py2.py3-none-any.whl (42kB)
[K    100% |████████████████████████████████| 51kB 3.2MB/s 
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.23.0
[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# download the Large IMDB Movie Review Dataset
# http://ai.stanford.edu/%7Eamaas/data/sentiment/index.html
! wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
! tar -xzf aclImdb_v1.tar.gz

--2018-04-25 05:11:07--  http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2018-04-25 05:11:08 (75.2 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
import torch
import math
import tqdm
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
from torch import optim
import os
from collections import namedtuple
import random

In [0]:
# let's set some parameters
train_path = "aclImdb/train/" 
test_path = "aclImdb/test/"

batch_size = 100
embedding_size = 300
min_count = 2
cuda = True

## Load the dataset
- 25000 train and test sentences

In [0]:
Sentence = namedtuple('Sentence', ['index', 'string', 'label'])

def read_imdb_movie_dataset(dataset_path):

    indices = []
    text = []
    rating = []

    i = 0

    for filename in os.listdir(os.path.join(dataset_path, "pos")):
        file_path = os.path.join(dataset_path, "pos", filename)
        data = open(file_path, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(1)
        i = i + 1

    for filename in os.listdir(os.path.join(dataset_path, "neg")):
        file_path = os.path.join(dataset_path, "neg", filename)
        data = open(file_path, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(0)
        i = i + 1

    sentences = [ Sentence(index, text, rating)
                  for index, text, rating in zip(indices, text, rating)]

    return sentences

In [6]:
train_sentences = read_imdb_movie_dataset(train_path)
test_sentences = read_imdb_movie_dataset(test_path)

random.shuffle(train_sentences)
random.shuffle(test_sentences)

print(len(train_sentences))
print(len(test_sentences))

25000
25000


## Mapping our words to unique identifiers: the Vocabulary object
- We will create an object to manage a mapping between words (or more generally tokens) and unique indices. 
- There are a few special symbols that we will be adding to handle special cases.
  - The first key special case is the `UNK` token, wich will represent all tokens that we do not have in our vocabulary. This is needed as we will build our vocabulary only using the training examples, and during validation or testing (or if we deploy our model in production) we may encounter new words that also need to be represented somehow.
  - The `PAD` token, which we will use to create even-sized batches of sentences of different length (more on this below). 
  - The beginning-of-sentence or `BOS` token, which we may use to denote the beginning of a sentence in some special cases
  - The end-of-sentence or `EOS` token, which as in the previous case is useful for certain tasks.
  

In [0]:
# Define the string of special tokens we will need 
UNK = '<UNK>'
PAD = '<PAD>'
BOS = '<BOS>'
EOS = '<EOS>'


class VocabItem:

    def __init__(self, string, hash=None):
        """
        Our token object, representing a term in our vocabulary.
        """
        self.string = string
        self.count = 0
        self.hash = hash

    def __str__(self):
        """
        For pretty-printing of our object
        """
        return 'VocabItem({})'.format(self.string)

    def __repr__(self):
        """
        For pretty-printing of our object
        """
        return self.__str__()


def tokenizer(x):
    return x.split()

def token_function(x):
    return x.lower()

class Vocab:

    def __init__(self, sentences, tokenizer=tokenizer,
                 token_function=token_function, min_count=0,
                 add_padding=False, add_bos=False, add_eos=False, unk=None):
        """
        :param sentences: A list of strings.

        :param tokenizer: A function to tokenize strings into tokens.

        :param token_function: A function to process every token string,
                               useful for normalizing case, and handling
                               numbers, dates and so.  

        :param add_padding: if we should add the special `PAD` token.

        :param add_bos: If we should add the special `BOS` token.

        :param add_eos: If we should add the special `EOS` token.

        :param unk: A string with the unknown token, in case our 
                    sentences have already been processed for this,
                    or `None` to use our default `UNK` token. 

        :param min_count: The minimum frequency count threshold for a token
                          to be added to our mapping. Only useful if 
                          the unk parameter is None.

        """
        vocab_items = []
        vocab_hash = {}
        word_count = 0

        self.token_function = token_function
        self.tokenizer = tokenizer
        self.special_tokens = []

        self.UNK = None
        self.PAD = None
        self.BOS = None
        self.EOS = None

        index2token = []
        token2index = {}

        # we tokenize or sentences, process our tokens
        # and add them to a list of VocabItem objects
        for sentence in sentences:
            for token in tokenizer(sentence.string):
                real_token = token_function(token)
                if real_token not in vocab_hash:
                    vocab_hash[real_token] = len(vocab_items)
                    vocab_items.append(VocabItem(real_token))

                vocab_items[vocab_hash[real_token]].count += 1
                word_count += 1

        tmp = []

        # we add/handle the special `UNK` token
        # and set it to have index 0 in our mapping 
        if unk:
            self.UNK = VocabItem(unk, hash=0)
            self.UNK.count = vocab_items[vocab_hash[unk]].count
            index2token.append(self.UNK)
            self.special_tokens.append(self.UNK)

            for token in vocab_items:
                if token.string != unk:
                    tmp.append(token)

        else:
            self.UNK = VocabItem(UNK, hash=0)
            index2token.append(self.UNK)
            self.special_tokens.append(self.UNK)

            for token in vocab_items:
                if token.count <= min_count:
                    self.UNK.count += token.count
                else:
                    tmp.append(token)

        # we sort our vocab. items by frequency
        # so for the same corpus, the indices of our words
        # are always the same
        tmp.sort(key=lambda token: token.count, reverse=True)

        # we always add our additional special tokens
        # at the end of our mapping
        if add_bos:
            self.BOS = VocabItem(BOS)
            tmp.append(self.BOS)
            self.special_tokens.append(self.BOS)

        if add_eos:
            self.EOS = VocabItem(EOS)
            tmp.append(self.EOS)
            self.special_tokens.append(self.EOS)

        if add_padding:
            self.PAD = VocabItem(PAD)
            tmp.append(self.PAD)
            self.special_tokens.append(self.PAD)

        index2token += tmp

        # we update the vocab_hash for each 
        # VocabItem object in our list 
        # based on their frequency 
        for i, token in enumerate(index2token):
            token2index[token.string] = i
            token.hash = i

        self.index2token = index2token
        self.token2index = token2index

        print('Unknown vocab size:', self.UNK.count)
        print('Vocab size: %d' % len(self))


    def __getitem__(self, i):
        return self.index2token[i]

    def __len__(self):
        return len(self.index2token)

    def __iter__(self):
        return iter(self.index2token)

    def __contains__(self, key):
        return key in self.token2index

    def string2indices(self, string, add_bos=False, add_eos=False):
        """
        Returns a list of mapping indices by processing the given string
        with our `tokenizer` and `token_function`, and defaulting to our
        special `UNK` token whenever we found an unseen term.
        
        :param string: A sentence string we wish to map into our vocabulary.
        
        :param add_bos: If we should add the `BOS` at the beginning.
        
        :param add_eos: If we should add the `EOS` at the end.
        
        :return: A list of ints, with the indices of each token in the
                 given string.
        
        """
        string_seq = []
        if add_bos:
            string_seq.append(self.BOS.hash)
        for item in self.tokenizer(string):
            processed_token = self.token_function(item)
            string_seq.append(self.token2index.get(processed_token, self.UNK.hash))
        if add_eos:
            string_seq.append(self.EOS.hash)
        return string_seq


    def indices2tokens(self, indices, ignore_ids=()):
        """
        Retuns a list of strings by mapping back every index to our
        vocabulary.
        
        :param indices: A list of ints. 
        
        :param ignore_ids: An itereable with indices to ignore, meaning
                           that we will not look for them in our mapping.
        
        :return: A list of strings.
        
        Will raise an Exception whenever we pass an index that we
        do not have in our mapping, except when provided with `ignore_ids`.
        
        """
        tokens = []

        for idx in indices:
            if idx in ignore_ids:
                continue
            tokens.append(self.index2token[idx])

        return tokens

In [9]:
vocab = Vocab(train_sentences,
              min_count=min_count,
              add_padding=True)

Unknown vocab size: 213668
Vocab size: 69448


In [10]:
vocab.string2indices('the movie was bad')

[1, 19, 13, 96]

## Representing words using dense vectors: Word Embeddings
- One of the major breakthroughs in NLP with deep models came after the conception of word embeddings, which changed the way in which we represent each word in our machine learning models.
- We start by simply assigning an initially random vector to each word in our vocabulary.These vectors are stacked together into a big matrix, usually referred to as the *embedding* matrix. After we have built our vocabulary, all we have to do is to create a big tensor of shape (`vocab_size`, `embedding_size`).
- In theory, whenever we need to obtain the vector for a given word, we could build a one-hot vector of our word and multply this vector by our *embedding* matrix. All but one value in this one-hot vector are zeroes, the result of this product will correspond exactly to the vector that represents our word.
- Our *embeddings* will be treated as parameters of our models and are trained with it. This is possible because the *embedding* mechanism as has a well-defined derivative, so we are  allowed to use backpropagation to train these vectors.
- Note that in practice, however, the one-hot-based behavior can be achieved by simply selecting row vectors from our *embedding* matrix, given our indices.

In [0]:
embeddings = nn.Embedding(len(vocab.index2token),
                           embedding_size,
                           padding_idx=vocab.PAD.hash)

In [14]:
print(embeddings.weight.data.size())

torch.Size([69448, 300])


## The BatchIterator object
- We will create an object to help us transform our text data into tensors with information that can be fed into our neural network. This object will do all the heavy-lifting, turning our string examples into batches of sequences of word indices that PyTorch can handle.
### The padding function
- Let's suppose we have these two sentences to build a batch:
  - the dog barks $\rightarrow [1, 2 ,3]$
  - the cat likes to sleep $\rightarrow [1, 4, 5, 6, 7]$
  
  In order to put these two examples in a batch Tensor, we will need to *pad* the shortest sentence to have the same length of the longest one. 
  - the dog barks $\rightarrow [1, 2 ,3, 0 , 0]$
  - the cat likes to sleep $\rightarrow [1, 4, 5, 6, 7]$
  
  Finally, our batch Tensor will look like this: 
  - $\begin{bmatrix}1 & 2 & 3 & 0 & 0 \\ 1 & 4 & 5 & 6 & 7\end{bmatrix}$
  
  where its first dimension represents the size of the batch, and its second dimension has the length of the longest sentence in our batch.

In [0]:
def pad_list(raw_input_list, dim0_pad=None, dim1_pad=None,
             align_right=False, pad_value=0):
    """
    Receive a list of lists and return a padded 2d torch tensor,
    a list of lengths and a padded mask
    input_list: a list of lists. len(input_list) = M, and N is the max
    length of any of the lists contained in input_list.
        e.g.: [[2,45,3,23,54], [12,4,2,2], [4], [45, 12]]
    
    Return a torch tensor of dimension (M, N) corresponding to the padded
    sequence, a list of the original lengths, and a mask
    
    Returns:
         out: a torch tensor of dimension (M, N)
         lengths: a list of ints containing the lengths of each input_list
                  element

     """
    input_list = [torch.LongTensor(sublist) for sublist in raw_input_list]
    
    if not dim0_pad:
        dim0_pad = len(input_list)

    if not dim1_pad:
        dim1_pad = max(x.size(0) for x in input_list)

    out = input_list[0].new(dim0_pad, dim1_pad).fill_(pad_value)

    lengths = []
    for i in range(len(input_list)):
        data_length = input_list[i].size(0)
        data_length = data_length if data_length < dim1_pad else dim1_pad
        lengths.append(data_length)
        offset = dim1_pad - data_length if align_right else 0
        out[i].narrow(0, offset, data_length).copy_(input_list[i])

    return out, lengths

In [0]:
class BatchIterator(object):

    def __init__(self, sentences, vocab, batch_size,
                 shuffle=False, cuda=False, ids=None):

        self.vocab = vocab
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.pad_id = self.vocab.PAD.hash

        self.id_examples = []
        self.examples = []
        self.y_examples = []

        # we create a list with our examples as Tensor objects
        # we keep a list with the ids for each example, which
        # is special for ().
        for i, sentence in enumerate(sentences):

            example = vocab.string2indices(sentence.string)

            self.examples.append(torch.LongTensor(example))

            if sentence.index is not None:
                self.id_examples.append(int(sentence.index))
            else:
                self.id_examples.append(i)

            y_example = int(sentence.label)
            self.y_examples.append(torch.LongTensor([y_example]))

        assert len(self.examples) == len(self.y_examples)


        self.cuda = self.is_cuda = cuda

        self.num_batches = (len(self.examples) + batch_size - 1) // batch_size

    def __len__(self):
        """
        Overload the `len()` Python syntax.
        """
        return self.num_batches

    def __getitem__(self, index):
        """
        By implementing this function, we allow our BatchIterator
        to be an iterated over. Every time we reach the end of
        our examples, if the `shuffle` parameter was provided
        we shuffle our examples. 
        To tell Python that we have reached the end of our set
        to iterate on, we must raise the IndexError, which the Python
        interpreter takes to stop the iteration process.

        """
        if index >= self.num_batches:

            if self.shuffle:
                c = list(zip(self.id_examples,
                             self.examples,
                             self.y_examples))

                random.shuffle(c)

                (self.id_examples, self.examples, self.y_examples) = zip(*c)

            raise IndexError("Index is greater "
                             "than the number of batches")

        # first we obtain the batch slice indices
        start = index * self.batch_size
        end = (index + 1) * self.batch_size
        
        # we get id, x and y items corresonding to the current iteration
        id_slice = self.id_examples[start:end]
        x_slice = self.examples[start:end]
        y_slice = self.y_examples[start:end]

        # we need to pad our examples (explanation below)
        padded_x_slice, x_slice_lengths = pad_list(x_slice,
                                                   pad_value=self.pad_id)
        y_slice = torch.cat(y_slice, 0)

        padded_x_slice = Variable(padded_x_slice)
        y_slice = Variable(y_slice)
        
        # we move our Tensors to the GPU if needed
        if self.cuda:
            padded_x_slice = padded_x_slice.cuda()
            y_slice = y_slice.cuda()

        return id_slice, padded_x_slice, x_slice_lengths, y_slice

Let's instance our `batch_iterator` objects for the training and test examples, and inspect a single batch of examples.

In [0]:
train_batches = BatchIterator(train_sentences,
                              vocab,
                              batch_size,
                              cuda=cuda)


test_batches = BatchIterator(test_sentences,
                             vocab,
                             batch_size,
                             cuda=cuda)

In [22]:
ids_batch, x_batch, lengths_batch, y_batch = train_batches[0]
print(x_batch.size())
print(lengths_batch)
print(y_batch.size())

torch.Size([100, 908])
[252, 183, 161, 224, 307, 106, 405, 147, 48, 148, 259, 75, 137, 233, 185, 203, 172, 237, 355, 360, 293, 140, 315, 234, 136, 215, 209, 449, 136, 385, 908, 246, 229, 362, 218, 120, 155, 289, 129, 143, 168, 142, 296, 485, 142, 162, 312, 124, 142, 144, 242, 190, 289, 467, 226, 100, 88, 129, 41, 145, 320, 162, 206, 178, 139, 108, 157, 149, 109, 294, 129, 212, 394, 387, 268, 255, 73, 670, 450, 145, 191, 163, 125, 56, 167, 196, 66, 307, 113, 155, 149, 189, 435, 43, 114, 206, 138, 240, 200, 645]
torch.Size([100])


## The Pytorch Model
### The LSTM
![An unrolled RNN.](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/RNN-unrolled.png)
- The LSTM is a special kind of Recurrent Neural Network that will process sequence data and return a vector for each input in our sequence. In the example, given a sequence of inputs $X=x_1, \ldots , x_t$, the LSTM will give us a sequence of $t$ vectors also called hidden states $H= h_1, \ldots, h_t$.
- The LSTM is a complex beast, in this turorial we will be skipping details on how exactly it works. For more details, visit http://pytorch.org/docs/master/nn.html#lstm
- If we think of our input sequence as our word vectors for a given sentence, we can think of the output as a kind of enriched or contextualized version of the input, which will contain not only information about the word each vector represents, but also about its previous words.
- In PyTorch, LSTMs will return both the set of output vectors $H$ but also some additional output that we will not pay attention to.
- Because we need a fixed-size vector to classify our sentences, we will have to use some kind of pooling function over our hidden states to achieve this. 

In [0]:
def mean_pooling(batch_hidden_states, batch_lengths):
    '''
    :param batch_hidden_states: torch.Tensor(batch_size, seq_len, hidden_size)
    :param batch_lengths: list(batch_size)
    :return:
    '''
    batch_lengths = torch.FloatTensor(batch_lengths)
    batch_lengths = batch_lengths.unsqueeze(1)
    batch_lengths = Variable(batch_lengths)
    if batch_hidden_states.is_cuda:
        batch_lengths = batch_lengths.cuda()

    pooled_batch = torch.sum(batch_hidden_states, 1)
    pooled_batch = pooled_batch / batch_lengths.expand_as(pooled_batch)

    return pooled_batch

  
def max_pooling(batch_hidden_states):
    '''
    :param batch_hidden_states: torch.Tensor(batch_size, seq_len, hidden_size)
    :return:
    '''
    # the `torch.max()` function will return both the maximum
    # value and the argmax over a given, dimension
    # as we do not need the argmas, we discard it
    pooled_batch, _ = torch.max(batch_hidden_states, 1)
    return pooled_batch

- The next key util functions are related to the fact that we are using batches of sentences to train.
- To make the training efficient, Pytorch asks us to sort the examples in our batch by sequence length and build a special object.
- We will use the function `pack_padded_sequence()` to build this special `PackedSequence` object given our sorted padded batch and the lengths of each sentence on it
- Conversely, we will use the `pad_packed_sequence()` function to turn the output of the `nn.LSTM`, a `PackedSequence` object, into a regular Pytorch tensor. This tensor will have zeroes in all padding positions, so we can later directy use our pooling functions.

In [0]:
def pack_rnn_input(embedded_sequence_batch, sequence_lengths):
    """
    Prepares the special `PackedSequence` object that can be
    efficiently processed by the `nn.LSTM`.
    
    :param embedded_sequence_batch: torch.Tensor(seq_len, batch_size)
    
    :param sequence_lengths: list(batch_size)
    
    :return:
      - `PackedSequence` object containing our padded batch
      - indices to sort back our sentences to their original order 
    """
    sequence_lengths = np.array(sequence_lengths)
    sorted_sequence_lengths = np.sort(sequence_lengths)[::-1]

    idx_sort = np.argsort(-sequence_lengths)
    idx_unsort = np.argsort(idx_sort)

    idx_sort = Variable(torch.from_numpy(idx_sort))
    idx_unsort = Variable(torch.from_numpy(idx_unsort))

    if embedded_sequence_batch.is_cuda:
        idx_sort = idx_sort.cuda()
        idx_unsort = idx_unsort.cuda()

    embedded_sequence_batch = embedded_sequence_batch.index_select(0, idx_sort)

    # go back to ints as requested by torch (will change in torch 0.4)
    int_sequence_lengths = [int(elem) for elem in sorted_sequence_lengths.tolist()]

    # Handling padding in Recurrent Networks
    packed_rnn_input = \
        nn.utils.rnn.pack_padded_sequence(embedded_sequence_batch,
                                          int_sequence_lengths,
                                          batch_first=True)

    return packed_rnn_input, idx_unsort

  
def unpack_rnn_output(packed_rnn_output, indices):
    """
     Recover a regular tensor given a `PackedSequence` as returned
     by  `nn.LSTM`

    :param packed_rnn_output: torch object
    
    :param indices: Variable(LongTensor) of indices to sort output
    
    :return:
      - Padded tensor
      
    """
    encoded_sequence_batch, _ = \
        nn.utils.rnn.pad_packed_sequence(packed_rnn_output,
                                         batch_first=True)

    encoded_sequence_batch = \
        encoded_sequence_batch.index_select(0, indices)

    return encoded_sequence_batch

- To build the model, we extend the `nn.Module`

In [0]:
class BiLSTM(nn.Module):

    def __init__(self,
                 embeddings,
                 hidden_size,
                 num_labels,
                 input_dropout=0,
                 output_dropout=0,
                 bidirectional=True,
                 num_layers=2,
                 pooling='mean'):

        super(BiLSTM, self).__init__()

        self.embeddings = embeddings
        self.pooling = pooling

        self.input_dropout = nn.Dropout(input_dropout)
        self.output_dropout = nn.Dropout(output_dropout)

        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.num_labels = num_labels

        self.hidden_size = hidden_size

        self.input_size = self.embeddings.embedding_dim

        self.lstm = nn.LSTM(self.input_size,
                            hidden_size,
                            bidirectional=bidirectional,
                            num_layers=num_layers,
                            batch_first=True)

        self.total_hidden_size = \
            self.hidden_size * 2 if self.bidirectional else self.hidden_size

        self.encoder_zero_total_hidden = \
            self.num_layers*2 if self.bidirectional else self.num_layers

        self.output_layer = nn.Linear(self.total_hidden_size, self.num_labels)

        self.loss_function = nn.CrossEntropyLoss()

        self.is_cuda = False

    def cuda(self, *args, **kwargs):
        super(BiLSTM, self).cuda(*args, **kwargs)
        self.is_cuda = True

    def cpu(self):
        super(BiLSTM, self).cpu()
        self.is_cuda = False


    def forward(self, sequence_batch, sequence_lengths,
                targets=None, train_embeddings=False):

        batch_size, seq_len = sequence_batch.size()

        embedded_sequence_batch = self.embeddings(sequence_batch)
        embedded_sequence_batch = self.input_dropout(embedded_sequence_batch )

        packed_rnn_input, indices = pack_rnn_input(embedded_sequence_batch,
                                                   sequence_lengths)

        rnn_packed_output, _ = self.lstm(packed_rnn_input)
        encoded_sequence_batch = unpack_rnn_output(rnn_packed_output, indices)

      
        if self.pooling == "mean":
            # batch_size, hidden_x_dirs
            pooled_batch = mean_pooling(encoded_sequence_batch,
                                        sequence_lengths)

        elif self.pooling == "max":
            # batch_size, hidden_x_dirs
            pooled_batch = max_pooling(encoded_sequence_batch)
        else:
            raise NotImplementedError
       
        logits = self.output_layer(pooled_batch)
        _, predictions = logits.max(1)

        if targets is not None:
            loss = self.loss_function(logits, targets)
        else:
            loss = None

        return loss, predictions, logits

### Instancing our model
- Let's define the hyperparameters of our model

In [0]:
epochs = 10
hidden_size = 300
log_interval = 10
num_labels = 2
input_dropout = 0.5
output_dropout = 0.5
bidirectional = True
num_layers = 2
pooling = 'mean'
lr = 0.001
gradient_clipping = 0.25

In [27]:
model = BiLSTM(embeddings=embedddings,
               hidden_size=hidden_size,
               num_labels=num_labels,
               input_dropout=input_dropout,
               output_dropout=output_dropout,
               bidirectional=bidirectional,
               num_layers=num_layers,
               pooling=pooling)

if cuda:
    model.cuda()
    
print(model)

BiLSTM(
  (embeddings): Embedding(69448, 300, padding_idx=69447)
  (input_dropout): Dropout(p=0.5)
  (output_dropout): Dropout(p=0.5)
  (lstm): LSTM(300, 300, num_layers=2, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=600, out_features=2)
  (loss_function): CrossEntropyLoss(
  )
)


In [0]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [29]:
pbar = tqdm.trange(epochs, desc='Training...')

for epoch in pbar:
    epoch_correct = 0
    epoch_total = 0
    epoch_loss = 0
    for i, batch in enumerate(train_batches):
        (id_sice, padded_x_slice, x_slice_lengths, y_slice) = batch
        loss, predictions, logits = model.forward(padded_x_slice,
                                                  x_slice_lengths,
                                                  y_slice)

        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), gradient_clipping)

        optimizer.step()
        correct = (predictions == y_slice).long().sum()
        total = y_slice.size(0)
        epoch_correct += correct.data[0]
        epoch_total += total
        epoch_loss += loss.data[0]

        if i % log_interval == 0 and i > 0:
            accuracy = 100 * epoch_correct/ epoch_total

            pbar.write('Train Loss: {}'.format(epoch_loss/log_interval))
            pbar.write('Train Accuracy: {}'.format(accuracy))
            epoch_correct = 0
            epoch_total = 0
            epoch_loss = 0

    test_epoch_correct = 0
    test_epoch_total = 0
    test_epoch_loss = 0

    for i, batch in enumerate(test_batches):
        (id_sice, padded_x_slice, x_slice_lengths, y_slice) = batch
        loss, predictions, logits = model.forward(padded_x_slice,
                                                  x_slice_lengths,
                                                  y_slice)

        correct = (predictions == y_slice).long().sum()
        total = y_slice.size(0)
        test_epoch_correct += correct.data[0]
        test_epoch_total += total
        test_epoch_loss += loss.data[0]

    test_accuracy = 100 * test_epoch_correct / test_epoch_total

    pbar.write('\n---------------------')
    pbar.write('Test Loss: {}'.format(test_epoch_loss/len(test_batches)))
    pbar.write('Test Accuracy: {}'.format(test_accuracy))
    pbar.write('---------------------\n')

Training...:   0%|          | 0/10 [00:16<?, ?it/s]

Train Loss: 0.763588809967041
Train Accuracy: 52.0


Training...:   0%|          | 0/10 [00:32<?, ?it/s]

Train Loss: 0.6884946644306182
Train Accuracy: 51.8


Training...:   0%|          | 0/10 [00:49<?, ?it/s]

Train Loss: 0.6568180024623871
Train Accuracy: 60.3


Training...:   0%|          | 0/10 [01:06<?, ?it/s]

Train Loss: 0.6618802487850189
Train Accuracy: 58.4


Training...:   0%|          | 0/10 [01:24<?, ?it/s]

Train Loss: 0.6577332019805908
Train Accuracy: 60.5


Training...:   0%|          | 0/10 [01:40<?, ?it/s]

Train Loss: 0.6528779685497283
Train Accuracy: 58.0


Training...:   0%|          | 0/10 [01:57<?, ?it/s]

Train Loss: 0.6298307538032532
Train Accuracy: 64.7


Training...:   0%|          | 0/10 [02:13<?, ?it/s]

Train Loss: 0.5931613981723786
Train Accuracy: 66.5


Training...:   0%|          | 0/10 [02:30<?, ?it/s]

Train Loss: 0.5972204864025116
Train Accuracy: 68.3


Training...:   0%|          | 0/10 [02:48<?, ?it/s]

Train Loss: 0.5573752522468567
Train Accuracy: 70.0


Training...:   0%|          | 0/10 [03:04<?, ?it/s]

Train Loss: 0.5268430650234223
Train Accuracy: 73.2


Training...:   0%|          | 0/10 [03:20<?, ?it/s]

Train Loss: 0.5649274021387101
Train Accuracy: 70.5


Training...:   0%|          | 0/10 [03:37<?, ?it/s]

Train Loss: 0.5199377506971359
Train Accuracy: 74.4


Training...:   0%|          | 0/10 [03:52<?, ?it/s]

Train Loss: 0.5426427304744721
Train Accuracy: 71.3


Training...:   0%|          | 0/10 [04:10<?, ?it/s]

Train Loss: 0.5652843922376632
Train Accuracy: 72.0


Training...:   0%|          | 0/10 [04:27<?, ?it/s]

Train Loss: 0.5644978195428848
Train Accuracy: 70.0


Training...:   0%|          | 0/10 [04:43<?, ?it/s]

Train Loss: 0.5058971494436264
Train Accuracy: 74.6


Training...:   0%|          | 0/10 [04:59<?, ?it/s]

Train Loss: 0.5395213961601257
Train Accuracy: 72.6


Training...:   0%|          | 0/10 [05:15<?, ?it/s]

Train Loss: 0.49744054973125457
Train Accuracy: 74.7


Training...:   0%|          | 0/10 [05:32<?, ?it/s]

Train Loss: 0.4742052495479584
Train Accuracy: 76.9


Training...:   0%|          | 0/10 [05:47<?, ?it/s]

Train Loss: 0.4276879608631134
Train Accuracy: 79.1


Training...:   0%|          | 0/10 [06:04<?, ?it/s]

Train Loss: 0.46528694331645964
Train Accuracy: 77.1


Training...:   0%|          | 0/10 [06:20<?, ?it/s]

Train Loss: 0.4455619305372238
Train Accuracy: 78.0


Training...:   0%|          | 0/10 [06:37<?, ?it/s]

Train Loss: 0.47218385338783264
Train Accuracy: 77.8


Training...:  10%|█         | 1/10 [08:41<1:18:12, 521.44s/it]


---------------------
Test Loss: 0.48285325622558595
Test Accuracy: 75.932
---------------------



Training...:  10%|█         | 1/10 [08:57<1:20:40, 537.86s/it]

Train Loss: 0.48773019313812255
Train Accuracy: 77.81818181818181


Training...:  10%|█         | 1/10 [09:14<1:23:06, 554.03s/it]

Train Loss: 0.4085088938474655
Train Accuracy: 80.4


Training...:  10%|█         | 1/10 [09:30<1:25:34, 570.52s/it]

Train Loss: 0.3979483425617218
Train Accuracy: 82.3


Training...:  10%|█         | 1/10 [09:48<1:28:12, 588.11s/it]

Train Loss: 0.366290745139122
Train Accuracy: 83.1


Training...:  10%|█         | 1/10 [10:05<1:30:50, 605.56s/it]

Train Loss: 0.3607216000556946
Train Accuracy: 84.4


Training...:  10%|█         | 1/10 [10:22<1:33:19, 622.13s/it]

Train Loss: 0.3788736552000046
Train Accuracy: 83.4


Training...:  10%|█         | 1/10 [10:38<1:35:50, 638.93s/it]

Train Loss: 0.3451613008975983
Train Accuracy: 85.4


Training...:  10%|█         | 1/10 [10:55<1:38:17, 655.26s/it]

Train Loss: 0.34347525238990784
Train Accuracy: 86.2


Training...:  10%|█         | 1/10 [11:11<1:40:41, 671.31s/it]

Train Loss: 0.38172011375427245
Train Accuracy: 83.2


Training...:  10%|█         | 1/10 [11:29<1:43:28, 689.78s/it]

Train Loss: 0.3215642601251602
Train Accuracy: 86.8


Training...:  10%|█         | 1/10 [11:45<1:45:48, 705.44s/it]

Train Loss: 0.292501500248909
Train Accuracy: 88.1


Training...:  10%|█         | 1/10 [12:01<1:48:14, 721.64s/it]

Train Loss: 0.3556629687547684
Train Accuracy: 86.3


Training...:  10%|█         | 1/10 [12:18<1:50:42, 738.07s/it]

Train Loss: 0.34035356491804125
Train Accuracy: 85.7


Training...:  10%|█         | 1/10 [12:33<1:53:02, 753.64s/it]

Train Loss: 0.3744565635919571
Train Accuracy: 83.3


Training...:  10%|█         | 1/10 [12:51<1:55:45, 771.69s/it]

Train Loss: 0.40091287791728974
Train Accuracy: 81.8


Training...:  10%|█         | 1/10 [13:08<1:58:12, 788.07s/it]

Train Loss: 0.424839922785759
Train Accuracy: 81.0


Training...:  10%|█         | 1/10 [13:23<2:00:35, 803.97s/it]

Train Loss: 0.3881752222776413
Train Accuracy: 83.5


Training...:  10%|█         | 1/10 [13:40<2:03:03, 820.36s/it]

Train Loss: 0.3369844764471054
Train Accuracy: 84.5


Training...:  10%|█         | 1/10 [13:56<2:05:28, 836.55s/it]

Train Loss: 0.36284053772687913
Train Accuracy: 82.0


Training...:  10%|█         | 1/10 [14:12<2:07:54, 852.70s/it]

Train Loss: 0.31973257213830947
Train Accuracy: 85.9


Training...:  10%|█         | 1/10 [14:27<2:10:09, 867.75s/it]

Train Loss: 0.2848996058106422
Train Accuracy: 88.3


Training...:  10%|█         | 1/10 [14:44<2:12:44, 884.89s/it]

Train Loss: 0.33506195098161695
Train Accuracy: 86.4


Training...:  10%|█         | 1/10 [15:00<2:15:07, 900.88s/it]

Train Loss: 0.32450890690088274
Train Accuracy: 87.7


Training...:  10%|█         | 1/10 [15:18<2:17:47, 918.59s/it]

Train Loss: 0.2946134254336357
Train Accuracy: 88.3


Training...:  20%|██        | 2/10 [17:22<1:09:29, 521.20s/it] 


---------------------
Test Loss: 0.3349680826663971
Test Accuracy: 86.096
---------------------



Training...:  20%|██        | 2/10 [17:38<1:10:35, 529.41s/it]

Train Loss: 0.3343262568116188
Train Accuracy: 87.9090909090909


Training...:  20%|██        | 2/10 [17:55<1:11:40, 537.51s/it]

Train Loss: 0.26948550939559934
Train Accuracy: 89.2


Training...:  20%|██        | 2/10 [18:11<1:12:46, 545.76s/it]

Train Loss: 0.2691680908203125
Train Accuracy: 88.1


Training...:  20%|██        | 2/10 [18:29<1:13:56, 554.55s/it]

Train Loss: 0.25346323400735854
Train Accuracy: 90.5


Training...:  20%|██        | 2/10 [18:46<1:15:06, 563.32s/it]

Train Loss: 0.2357587218284607
Train Accuracy: 91.5


Training...:  20%|██        | 2/10 [19:03<1:16:12, 571.62s/it]

Train Loss: 0.2458162397146225
Train Accuracy: 90.2


Training...:  20%|██        | 2/10 [19:20<1:17:20, 580.01s/it]

Train Loss: 0.21496859788894654
Train Accuracy: 91.5


Training...:  20%|██        | 2/10 [19:36<1:18:25, 588.16s/it]

Train Loss: 0.2256816953420639
Train Accuracy: 91.9


Training...:  20%|██        | 2/10 [19:52<1:19:29, 596.19s/it]

Train Loss: 0.2530245453119278
Train Accuracy: 90.2


Training...:  20%|██        | 2/10 [20:10<1:20:43, 605.46s/it]

Train Loss: 0.21294230073690415
Train Accuracy: 92.6


Training...:  20%|██        | 2/10 [20:26<1:21:46, 613.37s/it]

Train Loss: 0.18570363894104958
Train Accuracy: 92.5


Training...:  20%|██        | 2/10 [20:42<1:22:51, 621.49s/it]

Train Loss: 0.2633150011301041
Train Accuracy: 89.7


Training...:  20%|██        | 2/10 [20:59<1:23:57, 629.72s/it]

Train Loss: 0.23298424705863
Train Accuracy: 90.5


Training...:  20%|██        | 2/10 [21:15<1:25:00, 637.52s/it]

Train Loss: 0.26107673943042753
Train Accuracy: 88.8


Training...:  20%|██        | 2/10 [21:33<1:26:12, 646.54s/it]

Train Loss: 0.28451064229011536
Train Accuracy: 88.0


Training...:  20%|██        | 2/10 [21:49<1:27:17, 654.72s/it]

Train Loss: 0.2503566339612007
Train Accuracy: 89.9


Training...:  20%|██        | 2/10 [22:05<1:28:21, 662.70s/it]

Train Loss: 0.2561042934656143
Train Accuracy: 89.4


Training...:  20%|██        | 2/10 [22:21<1:29:27, 670.90s/it]

Train Loss: 0.23490295112133025
Train Accuracy: 90.7


Training...:  20%|██        | 2/10 [22:38<1:30:32, 679.02s/it]

Train Loss: 0.2658605217933655
Train Accuracy: 89.9


Training...:  20%|██        | 2/10 [22:54<1:31:37, 687.14s/it]

Train Loss: 0.21524795591831208
Train Accuracy: 91.0


Training...:  20%|██        | 2/10 [23:09<1:32:37, 694.68s/it]

Train Loss: 0.2024135947227478
Train Accuracy: 92.2


Training...:  20%|██        | 2/10 [23:26<1:33:45, 703.23s/it]

Train Loss: 0.21830800622701646
Train Accuracy: 91.4


Training...:  20%|██        | 2/10 [23:42<1:34:49, 711.20s/it]

Train Loss: 0.20518997088074684
Train Accuracy: 91.6


Training...:  20%|██        | 2/10 [24:00<1:36:00, 720.05s/it]

Train Loss: 0.2105241060256958
Train Accuracy: 91.4


Training...:  30%|███       | 3/10 [26:04<1:00:49, 521.40s/it]


---------------------
Test Loss: 0.4250059585571289
Test Accuracy: 82.0
---------------------



Training...:  30%|███       | 3/10 [26:20<1:01:28, 526.88s/it]

Train Loss: 0.31855655908584596
Train Accuracy: 87.54545454545455


Training...:  30%|███       | 3/10 [26:36<1:02:05, 532.28s/it]

Train Loss: 0.2498329296708107
Train Accuracy: 89.0


Training...:  30%|███       | 3/10 [26:53<1:02:44, 537.80s/it]

Train Loss: 0.21977973729372025
Train Accuracy: 91.7


Training...:  30%|███       | 3/10 [27:10<1:03:25, 543.67s/it]

Train Loss: 0.1705862559378147
Train Accuracy: 93.4


Training...:  30%|███       | 3/10 [27:28<1:04:06, 549.51s/it]

Train Loss: 0.18244198113679885
Train Accuracy: 94.0


Training...:  30%|███       | 3/10 [27:45<1:04:45, 555.02s/it]

Train Loss: 0.14158847630023957
Train Accuracy: 94.1


Training...:  30%|███       | 3/10 [28:01<1:05:24, 560.62s/it]

Train Loss: 0.14569939598441123
Train Accuracy: 94.4


Training...:  30%|███       | 3/10 [28:18<1:06:02, 566.06s/it]

Train Loss: 0.18467241451144217
Train Accuracy: 92.6


Training...:  30%|███       | 3/10 [28:34<1:06:39, 571.42s/it]

Train Loss: 0.16815004944801332
Train Accuracy: 93.4


Training...:  30%|███       | 3/10 [28:52<1:07:23, 577.61s/it]

Train Loss: 0.15554785653948783
Train Accuracy: 94.4


Training...:  30%|███       | 3/10 [29:08<1:08:00, 582.87s/it]

Train Loss: 0.19332499839365483
Train Accuracy: 92.2


Training...:  30%|███       | 3/10 [29:24<1:08:37, 588.28s/it]

Train Loss: 0.23857715874910354
Train Accuracy: 90.1


Training...:  30%|███       | 3/10 [29:41<1:09:16, 593.75s/it]

Train Loss: 0.171990504860878
Train Accuracy: 93.0


Training...:  30%|███       | 3/10 [29:56<1:09:52, 598.94s/it]

Train Loss: 0.17010625824332237
Train Accuracy: 93.0


Training...:  30%|███       | 3/10 [30:14<1:10:34, 604.95s/it]

Train Loss: 0.16212233826518058
Train Accuracy: 93.1


Training...:  30%|███       | 3/10 [30:31<1:11:12, 610.41s/it]

Train Loss: 0.21549870669841767
Train Accuracy: 92.6


Training...:  30%|███       | 3/10 [30:47<1:11:50, 615.72s/it]

Train Loss: 0.2057577170431614
Train Accuracy: 92.0


Training...:  30%|███       | 3/10 [31:03<1:12:28, 621.18s/it]

Train Loss: 0.15492613911628722
Train Accuracy: 95.3


Training...:  30%|███       | 3/10 [31:19<1:13:06, 626.58s/it]

Train Loss: 0.1912270851433277
Train Accuracy: 92.6


Training...:  30%|███       | 3/10 [31:36<1:13:44, 632.01s/it]

Train Loss: 0.16652667969465257
Train Accuracy: 93.5


Training...:  30%|███       | 3/10 [31:51<1:14:19, 637.04s/it]

Train Loss: 0.1617109827697277
Train Accuracy: 94.2


Training...:  30%|███       | 3/10 [32:08<1:14:59, 642.74s/it]

Train Loss: 0.15191135480999945
Train Accuracy: 94.7


Training...:  30%|███       | 3/10 [32:24<1:15:36, 648.06s/it]

Train Loss: 0.2094196744263172
Train Accuracy: 92.3


Training...:  30%|███       | 3/10 [32:41<1:16:17, 653.98s/it]

Train Loss: 0.19741350561380386
Train Accuracy: 92.5


KeyboardInterrupt: ignored