In [1]:
import time, sys
#sys.path.append('/path/to/env/lib/python3.6/site-packages')
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

import import_ipynb
from MoveData import Options, json2datatools, num_batches, nopeak_mask, create_masks
from Encoder import Encoder
from Decoder import Decoder
from LearningDynamics import CosineWithRestarts

importing Jupyter notebook from MoveData.ipynb
importing Jupyter notebook from Encoder.ipynb
importing Jupyter notebook from Elements.ipynb
importing Jupyter notebook from Decoder.ipynb
importing Jupyter notebook from LearningDynamics.ipynb


Whether you are running this on a GPU ready environment or not, the below if else statement will set `device` appropriately to leverage the GPU or CPU. We are working on such a small dataset that for demonstration purposes, a CPU will do just as well and GPU is not usefull unless you have augmented the data to be significantly larger.

In [2]:
if torch.cuda.is_available():
    print("you have", torch.cuda.device_count(), "GPUs")
    device = torch.device("cuda:0")
else:
    print('using CPU for evaluation and training')
    device = torch.device("cpu")

using CPU for evaluation and training


in START_HERE.ipynb you have already seen the `Options` class and `json2datatools` function. As a reminder, `opt` is just a way to keep and pass all your preferences in a single input, rather than type out every hyperparameter again and again. we will need the input and output vocabulary `infield, outfield` for our demonstration of how a sequence of words is represented by the transformer and we will use `data_iter` to show how data flows from the dataset though our transformer

In [3]:
opt = Options(batchsize=2, device=device, epochs=25, lr=0.01, 
              beam_width=3, max_len = 25, save_path = '../saved/weights/model_weights')

data_iter, infield, outfield, opt = json2datatools(path='../saved/pairs.json', opt=opt)

Lets take a look inside the Transformer, like older sequence to sequence models, Transformers also encode the sentence into vector representations and pass those representations along to the decoder to generate the response/reply/output/translation/etc. The Encoder and Decoder have subcomponents which is discussed in Elements, Encoder and Decoder. For now, just define the Transformer class, instantiate a model as chloe and define the optimizer and scheduler by running the cell below 

In [4]:
class Transformer(nn.Module):
    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, n_layers, heads, dropout):
        super().__init__()
        self.encoder = Encoder(in_vocab_size, emb_dim, n_layers, heads, dropout)
        self.decoder = Decoder(out_vocab_size, emb_dim, n_layers, heads, dropout)
        self.out = nn.Linear(emb_dim, out_vocab_size)
    def forward(self, src_seq, trg_seq, src_mask, trg_mask):
        e_output = self.encoder(src_seq, src_mask)
        d_output = self.decoder(trg_seq, e_output, src_mask, trg_mask)
        output = self.out(d_output)
        return output
    
emb_dim, n_layers, heads, dropout = 32, 3, 8, 0.01 
chloe = Transformer(len(infield.vocab), len(outfield.vocab), emb_dim, n_layers, heads, dropout)
chloe.load_state_dict(torch.load(opt.save_path))

if opt.device == torch.device("cuda:0"):
    chloe =  chloe.cuda()
    
optimizer = torch.optim.Adam(chloe.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)
scheduler = CosineWithRestarts(optimizer, T_max=num_batches(data_iter))

The line `batch = next(iter(data_iter))` extracts one batch of data into the variable `batch`. The batch has two parts to it, the `listen` part that is the input to the model, and the `reply` part that is the preferred output response to the corresponding `listen` input. This is a flawed assumption for conversation since there are many valid responses to anything one listens to or hears. But for now we will use this method to teach chloe to reply in some linguistically coherent manner. 

`.transpose(0,1)` is used to flip the orientation of the data contained in batch.listen

the first dimension of `listen.shape` and `reply.shape` should match whatever you put into `opt = Options(batchsize=2` above, it is the number of samples in each batch. Each sample is one line in our pairs.json dataset. The next dimension is the sequence length. Although it seems messy, I have printed out each piece of data from the batch and it's shape. Each is a 2-dimensional tensor, or matrix. Above the listen and reply tensors I have printed the entire vocabulary so you can resolve that the integers represent coherent sentences. 

Notice that the reply always starts with an integer that represents the "Start Of Sentence" or `<sos>` token, and ends with the "End Of Sentence"or `<eos>` token, so it's sequence length is the length of the reply sentence + 2

In [20]:
batch = next(iter(data_iter))
listen = batch.listen.transpose(0,1)
reply = batch.reply.transpose(0,1)
reply_input = reply[:, :-1]

print(infield.vocab.stoi)
print(" ------------------------------------------------------ ")
print(listen, listen.shape)
print(" ------------------------------------------------------ ")
print(outfield.vocab.stoi)
print(" ------------------------------------------------------ ")
print(reply, reply.shape)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x12d1e00f0>>, {'<unk>': 0, '<pad>': 1, '?': 2, 'you': 3, 'chloe': 4, 'i': 5, 'ok': 6, 'are': 7, 'hi': 8, 'a': 9, 'bye': 10, 'haha': 11, 'hello': 12, 'how': 13, 'later': 14, 'lol': 15, 'sure': 16, 'alive': 17, 'am': 18, 'any': 19, 'do': 20, 'dont': 21, 'dunno': 22, 'go': 23, 'gotta': 24, 'ill': 25, 'im': 26, 'joke': 27, 'know': 28, 'me': 29, 'more': 30, 'robot': 31, 'see': 32, 'talk': 33, 'tell': 34, 'think': 35, 'to': 36, 'true': 37, 'ttyl': 38, 'vicki': 39, 'what': 40, 'who': 41})
 ------------------------------------------------------ 
tensor([[34, 29,  9, 27,  4,  1],
        [40, 20,  3, 35,  4,  2]]) torch.Size([2, 6])
 ------------------------------------------------------ 
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x12d1e0080>>, {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, 'you': 4, '?': 5, ',': 6, 'i': 7, 'are': 8, 'joke': 9, 'say': 10, 'a': 1

Tensors have to be single values (scalars), lines (vectors), rectangles (matrices) or boxes. This means that if I want my batch to have sentence of unequal length, the shorter sentences have to be padded with the `<pad>` token so that every sentence is the same length, the example input batch here has been padded into a 3x3 tensor 

`i`      `love`   `you`

`hello`  `<pad>` `<pad>`

`see`  `ya` `<pad>`


However, the padding does not carry useful information, so it must be masked away. Masks are tensors that are the same shape as the tensors they are a mask for, here we show a mask for the listen tensor and for the reply tensor. Think of a real mask, usually there will be two holes poked out for the eyes and one hole for the mouth, but otherwise the mask is meant to cover the rest of my face. The mask for the input listen tensor covers the padded elements. Seems tedious, but this is an artifact of the way wee need to prepare data to train efficiently. The mask tensors printed below have a `True` or `1` in the positions that are open for use, and a `False` or `0` in the positions that are meant to be covered or not used. 

The shape of the listen mask is **(batch_size, 1, input_sequence_length)**, why is there a 1 in there? We will explain later. 

The Mask for the output sequence, or reply tensor, has an additional meaning. Transformers use something called "Attention", which we will talk about in the Elements.ipynb, Encoder.ipynb and Decoder.ipynb sections. At a high level, attention is a good word to describe what is occuring computationally. Suppose you are replying to this phrase:

"action potentials flow from dendrite to soma to where ?"

you write:

"they from from soma to axon"

As you, the decoder, intend to write the words "they flow from", you are paying attention to the words "action potentials flow from" from the encoder, this is called encoder attention. In addition you see that instead of flow I wrote "from" twice becaue I didnt pay enough attention to what I had already written, This is called self attention. 

As the decoder writes each word in responce, one word at a time, the word it just produce and it's previous words are fed back into the decoder to get the next word so it can pay attention to what has already been written. 

During training, we judge it based on whether it produced the right word or not, if it does not, we insert the right word at each position and hope that the next position is better predicted. This is called teacher forcing. during training, it can only pay attention to words in previous earlier positions, thus, even though we have the entire correct sequence given to us with each training batch, we will hide the words in later positions through the mask to simulate real self attention (you cant pay self attention to words you havent said yet). In addition to suppressing/covering the pad tokens, the reply mask also covers sequence positions in the future. 

This is why the shape of the reply mask is **(batch_size, output_sequence_length, output_sequence_length)**, having 2 dimension named the same might be confusion, so i will rewrite this as 

**reply mask (batch_index, time_step, output_mask_position)**

In the print statement below, I have shown you the mask `reply_mask[0,0,:]` indexed as `[0,0` for the first sample and first time step, `:` means all , it prints `True, False, . . . False` because at the first time step you can pay attention to the `<sos>` token only, not any future words in the "correct reply", at the next time step you can pay attention to both `<sos>` and also the first correct token

In [23]:
listen_mask, reply_mask = create_masks(listen, reply_input, opt)
print(listen_mask, reply_mask.shape)
print(" ------------------------------------------------------ ")
print(reply_mask[0,0,:], reply_mask.shape)

tensor([[[ True,  True,  True,  True,  True, False]],

        [[ True,  True,  True,  True,  True,  True]]]) torch.Size([2, 10, 10])
 ------------------------------------------------------ 
tensor([ True, False, False, False, False, False, False, False, False, False]) torch.Size([2, 10, 10])


The shape of the model (chloe's) output is **(batch_size, output_sequence_length, output_vocab_size)**
For every reply in batch_size, chloe says output_sequence_length number of words, for each output position, `len(outfield.vocab)` numbers are given. `len(outfield.vocab)` is the number of words that are in chloe's output vocabulary. Yes, chloe has a separate vocab size for what she can hear and what she can say. suppose output_vocab_size = 100, then chloe only knows to output 100 different tokens. I say token, because this includes not only words, but also `<eos>`, `?`, `ummm`, `,` and other discreet symbols. 

Each of these numbers in `chloes_reply[0,-1,:]` represents one token, when I ran this cell the response included 

`'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, 'you': 4, '?': 5,`

`tensor([-1.7353, -0.5510, -1.4945,  7.2842,  2.6917,` 

and this makes alot of sense because `[0,-1,:]` indexes to the first example and the last output token. It includes all `:` the vocabulary. The tensor has in position 0, `-1.7353`, in position 3 it has `7.2842`, this was the highest number in the vector, it means that of all the tokens in the vocabulary, the word chloe things should come next is the `<eos>` token, 3 is the index for the end of sentence token  `'<eos>': 3,`, meaning that she is done talking. 

In [27]:
chloes_reply = chloe(listen, reply_input, listen_mask, reply_mask)

print('input_vocab_size =', len(infield.vocab), ', output_vocab_size =', len(outfield.vocab))
print(" ------------------------------------------------------ ")
print(outfield.vocab.stoi)
print(" ------------------------------------------------------ ")
print(chloes_reply[0,-1,:], chloes_reply.shape)

input_vocab_size = 42 , output_vocab_size = 65
 ------------------------------------------------------ 
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x12d1e0080>>, {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, 'you': 4, '?': 5, ',': 6, 'i': 7, 'are': 8, 'joke': 9, 'say': 10, 'a': 11, 'bye': 12, 'cats': 13, 'do': 14, 'french': 15, 'hi': 16, 'how': 17, 'thank': 18, 'ttyl': 19, 'can': 20, 'for': 21, 'tell': 22, 'thanks': 23, 'alive': 24, 'and': 25, 'at': 26, 'beaucoup': 27, 'but': 28, 'dont': 29, 'either': 30, 'know': 31, 'laughing': 32, 'me': 33, 'meowci': 34, 'my': 35, 'ok': 36, 'some': 37, 'they': 38, 'to': 39, 'am': 40, 'biological': 41, 'bragging': 42, 'chloe': 43, 'conceding': 44, 'definition': 45, 'depends': 46, 'flash': 47, 'have': 48, 'just': 49, 'less': 50, 'math': 51, 'more': 52, 'of': 53, 'on': 54, 'robot': 55, 'sad': 56, 'science': 57, 'teach': 58, 'vicki': 59, 'viruses': 60, 'when': 61, 'will': 62, 'yes': 63, 'your': 64})
 -------

suppose the correct reply to listen = `bye`  `chloe` is reply = `see`  `ya` `<eos>`

the batch we get gives is listen = `bye`  `chloe` and reply = `<sos>` `see`  `ya` `<eos>`

we input `bye`  `chloe` into chloe, give her `<sos>` `see`  `ya`, plus the masks to prevent peaking into the future, and we compare whatever she replies against `see`  `ya` `<eos>`. I have been referring to `see`  `ya` `<eos>` as the "correct response", but this is also called the "target" abbreviated trg. In addition, what I have been referring to as "listen" is also called the "source" abbreviated src. 

## Loss functions



defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x12d1e00f0>>, {'<unk>': 0, '<pad>': 1, '?': 2, 'you': 3, 'chloe': 4, 'i': 5, 'ok': 6, 'are': 7, 'hi': 8, 'a': 9, 'bye': 10, 'haha': 11, 'hello': 12, 'how': 13, 'later': 14, 'lol': 15, 'sure': 16, 'alive': 17, 'am': 18, 'any': 19, 'do': 20, 'dont': 21, 'dunno': 22, 'go': 23, 'gotta': 24, 'ill': 25, 'im': 26, 'joke': 27, 'know': 28, 'me': 29, 'more': 30, 'robot': 31, 'see': 32, 'talk': 33, 'tell': 34, 'think': 35, 'to': 36, 'true': 37, 'ttyl': 38, 'vicki': 39, 'what': 40, 'who': 41})
 ------------------------------------------------------ 
input_vocab_size = 42 , output_vocab_size = 65
 ------------------------------------------------------ 
tensor([[15, 11],
        [11, 15]]) torch.Size([2, 2])
 ------------------------------------------------------ 
tensor([[ 2, 23, 21, 32, 26, 35,  9,  3],
        [ 2, 23, 21, 32, 26, 35,  9,  3]]) torch.Size([2, 8])


In [5]:
def trainer(model, data_iterator, options, optimizer, scheduler):

    if torch.cuda.is_available() and options.device == torch.device("cuda:0"):
        print("a GPU was detected, model will be trained on GPU")
        model = model.cuda()
    else:
        print("training on cpu")

    model.train()
    start = time.time()
    best_loss = 100
    for epoch in range(options.epochs):
        total_loss = 0
        for i, batch in enumerate(data_iterator): 
            src = batch.listen.transpose(0,1)
            trg = batch.reply.transpose(0,1)
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input, options)
            preds = model(src, trg_input, src_mask, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)
            optimizer.zero_grad()
            batch_loss = F.cross_entropy(preds.view(-1, preds.size(-1)), 
                                         ys, ignore_index = options.trg_pad)
            batch_loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += batch_loss.item()

        epoch_loss = total_loss/(num_batches(data_iterator)+1)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), options.save_path)
        print("%dm: epoch %d loss = %.3f" %((time.time() - start)//60, epoch, epoch_loss))
        total_loss = 0

    return model