In [1]:
import time, sys, math, copy
#sys.path.append('/path/to/env/lib/python3.6/site-packages')
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

import import_ipynb
from MoveData import Options, json2datatools, num_batches, nopeak_mask, create_masks

importing Jupyter notebook from MoveData.ipynb


If you are not using this notebook to learn, change the below variable `teaching` to False, if you are here to learn, change it to True

In [17]:
teaching = True 

Before going into this lesson, lets remind ourselves where the components we are about to learn fit into the big picture. Recall the overall architecture of the Transformer. It is useful to image yourself as a piece of data, then imagine the journey you are about to go on. 

Imagine you emerge from the `data_iter` function and into the transformer, the first place you will go is the encoder. initialized using `self.encoder = Encoder(in_vocab_size, emb_dim, n_layers, heads, dropout)` and used in the form `e_output = self.encoder(src_seq, src_mask)` 

In [2]:
class Transformer(nn.Module):
    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, n_layers, heads, dropout):
        super().__init__()
        self.encoder = Encoder(in_vocab_size, emb_dim, n_layers, heads, dropout)
        self.decoder = Decoder(out_vocab_size, emb_dim, n_layers, heads, dropout)
        self.out = nn.Linear(emb_dim, out_vocab_size)
    def forward(self, src_seq, trg_seq, src_mask, trg_mask):
        e_output = self.encoder(src_seq, src_mask)
        d_output = self.decoder(trg_seq, e_output, src_mask, trg_mask)
        output = self.out(d_output)
        return output

The first component, or module, within the Encoder, is the Embedder. emb_dim is short for embedding_dimensions

`self.embed = Embedder(vocab_size, embedding_dimensions)` 

`x = self.embed(source_sequence)`

In [3]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, n_layers, heads, dropout):
        super().__init__()
        self.n_layers = n_layers
        self.embed = Embedder(vocab_size, emb_dim)
        self.pe = PositionalEncoder(emb_dim, dropout=dropout)
        self.layers = get_clones(EncoderLayer(emb_dim, heads, dropout), n_layers)
        self.norm = Norm(emb_dim)
    def forward(self, src_seq, mask):
        x = self.embed(src_seq)
        x = self.pe(x)
        for i in range(self.n_layers):
            x = self.layers[i](x, mask)
        x = self.norm(x)
        return x

What is it Embedding? As we mentioned before, one ability, or limitation depending on how you look at it, of chloe is her fixed vocabulary, each word or symbol in her vocabulary is assigned an integer. For example the word hi is assigned 3, the word dog is 17, a word not in the vocabulary is 0. This integer is the `token` index. 

The neural nework sees every word as a vector. [A vector of 3 real numbers forms the coordinates in 3D space](https://youtu.be/fNk_zzaMoSs). We use several more dimensions than 3 in deep learning, if we use 512 dimensions, our `embedding_dimensions = 512`, this means that each word is a point in 512 dimensional space. The same concepts apply to 3D space in that the location of that word in 3D space tells you it's [meaning and meaning relative to other words](https://youtu.be/8rXD5-xhemo?t=1550).

<img src="../saved/images/wordvectors.png" height=400 width=400>

In the image you see above, similar words are close to each other, not only that, the direction they are separated from eachother also carries meaning. In the image, there are 3 clusters of words and the separation between them has something to do with age/time/etc.  

If you stack all the vectors on top of eachother row by row, you get a matrix. Remember how each word is represented by both a vector and an integer? well this integer is the index for a row in the matrix. The matrix is called the embedding matrix. you might say that we "embed" words into the matrix. Now for the example, run the cells below 

In [4]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.embed = nn.Embedding(vocab_size, emb_dim)
    def forward(self, x):
        return self.embed(x)

To demonstrate the Embedder, I will show you it's two functions

- storing an embedding matrix of word vectors 
- transforming a sequence of integers that represent token indices, into a sequence of vectors

Lets start off by creating a toy embedding with only 2 tokens in it, these tokens will be represented in 4-dimensional space. After creating the embedding, I pass into the Embedder a sequence of token indices, the integers `[1,0,1]`

In [5]:
if teaching:  
    torch.manual_seed(0)
    embedding = Embedder(vocab_size=2, emb_dim=4)
    source_sequence = torch.from_numpy(np.asarray([1,0,1])).unsqueeze(0)
    print('source_sequence',source_sequence, source_sequence.shape)
    print("---------------------------------------------------")
    print("Embedding Matrix", embedding.embed.weight, embedding.embed.weight.shape)
    print("---------------------------------------------------")
    sequence_of_vectors = embedding(source_sequence)
    print('sequence_of_vectors',sequence_of_vectors, sequence_of_vectors.shape)

source_sequence tensor([[1, 0, 1]]) torch.Size([1, 3])
---------------------------------------------------
Embedding Matrix Parameter containing:
tensor([[ 1.5410, -0.2934, -2.1788,  0.5684],
        [-1.0845, -1.3986,  0.4033,  0.8380]], requires_grad=True) torch.Size([2, 4])
---------------------------------------------------
sequence_of_vectors tensor([[[-1.0845, -1.3986,  0.4033,  0.8380],
         [ 1.5410, -0.2934, -2.1788,  0.5684],
         [-1.0845, -1.3986,  0.4033,  0.8380]]], grad_fn=<EmbeddingBackward>) torch.Size([1, 3, 4])


Suppose, as I mentioned earlier, I am unhappy with chloe's limited vocabulary `{"me":0, "give":1}`. I want her to learn the word "covfefe". I will have to add a word to her dictionary `{"covfefe":2}` (not shown), then I will have to initialize a new word vector and add it, concatenate it, to chloe's embedding matrix `embedding.embed.weight`. In the cell below, I do just that. 

`concatenated_matrix` is the concatenation of the old matrix `embedding.embed.weight` with the `new_vector` appended to the bottom row. Now we can embed our

sequence of integers **(batch_size, sequence_length)** 

into a vector sequence the phrase "give me covfefe"


In [6]:
if teaching:  
    np.random.seed(0)
    new_vector = torch.from_numpy(np.random.uniform(-0.1,0.1,(1, 4)).astype(np.float32))
    print('new_vector', new_vector, new_vector.shape)
    print("---------------------------------------------------")
    concatenated_matrix = torch.cat((embedding.embed.weight, new_vector),dim=0)
    embedding.embed.weight=nn.Parameter(concatenated_matrix,requires_grad=True)
    print("New Embedding Matrix", embedding.embed.weight, embedding.embed.weight.shape)
    print("---------------------------------------------------")
    source_sequence = torch.from_numpy(np.asarray([1,0,2])).unsqueeze(0)
    print('source_sequence',source_sequence, source_sequence.shape)
    print("---------------------------------------------------")
    sequence_of_vectors = embedding(source_sequence)
    print('sequence_of_vectors',sequence_of_vectors, sequence_of_vectors.shape)

new_vector tensor([[0.0098, 0.0430, 0.0206, 0.0090]]) torch.Size([1, 4])
---------------------------------------------------
New Embedding Matrix Parameter containing:
tensor([[ 1.5410, -0.2934, -2.1788,  0.5684],
        [-1.0845, -1.3986,  0.4033,  0.8380],
        [ 0.0098,  0.0430,  0.0206,  0.0090]], requires_grad=True) torch.Size([3, 4])
---------------------------------------------------
source_sequence tensor([[1, 0, 2]]) torch.Size([1, 3])
---------------------------------------------------
sequence_of_vectors tensor([[[-1.0845, -1.3986,  0.4033,  0.8380],
         [ 1.5410, -0.2934, -2.1788,  0.5684],
         [ 0.0098,  0.0430,  0.0206,  0.0090]]], grad_fn=<EmbeddingBackward>) torch.Size([1, 3, 4])


## Positional Encoding

I find it useful to jeep track of the shape of my data as it goes on it's journey through the neurla network

After the embedded, the shape is this vector sequence is **(batch_size, sequence_length, embedding_dimensions)**

`[[-1.0845, -1.3986,  0.4033,  0.8380 ]` give

` [ 1.5410, -0.2934, -2.1788,  0.5684 ]` me

` [ 0.0098,  0.0430,  0.0206,  0.0090]]` covefefe
  
 
 
 $$PE_{(pos,2i)} = \sin(pos/1000^{2i/d_{model}})$$
 $$PE_{(pos,2i)} = \cos(pos/1000^{2i/d_{model}})$$

In [7]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # create constant 'pe' matrix with values dependant on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe = pe.unsqueeze(0) # add a batch dimention to your pe matrix 
        self.register_buffer('pe', pe)
 
    def forward(self, x):
        
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        #add constant to embedding
        seq_len = x.size(1)
        pe = Variable(self.pe[:,:seq_len], requires_grad=False)
        #print('x.shape', x.shape) # (batch_size, input_seq_len, d_model)
        #print('pe.shape', pe.shape) # (1, input_seq_len, d_model)
        if x.is_cuda:
            pe.cuda()
        x = x + pe
        return self.dropout(x)

In [18]:
if teaching: 
    positioner = PositionalEncoder(d_model=4, max_seq_len=100, dropout=0.0)
    print(positioner.pe.shape)
    print(positioner.pe[:,:3].shape)

torch.Size([1, 100, 4])
torch.Size([1, 3, 4])


In [20]:
if teaching: 
    print('sequence_of_vectors',sequence_of_vectors, sequence_of_vectors.shape)
    print("---------------------------------------------------")
    sequence_of_vectors = positioner(sequence_of_vectors)
    print(sequence_of_vectors, sequence_of_vectors.shape)
    print("---------------------------------------------------")

sequence_of_vectors tensor([[[ -8.6762,  -4.1888,   3.2268,  13.7042],
         [ 18.2183,   4.6522, -17.4296,  11.5475],
         [  6.4432,   7.3429,   0.1658,   7.0718]]], grad_fn=<AddBackward0>) torch.Size([1, 3, 4])
---------------------------------------------------
x.shape torch.Size([1, 3, 4])
pe.shape torch.Size([1, 3, 4])
tensor([[[-17.3524,  -7.3775,   6.4535,  28.4084],
         [ 37.2780,  10.3044, -34.8591,  24.0949],
         [ 13.7957,  15.6856,   0.3318,  15.1436]]], grad_fn=<AddBackward0>) torch.Size([1, 3, 4])
---------------------------------------------------


sequence_of_vectors tensor([[[-2.1690, -1.7972,  0.8067,  2.6761],
         [ 3.9235,  0.4131, -4.3575,  2.1369],
         [ 0.9288,  1.0859,  0.0413,  1.0180]]], grad_fn=<AddBackward0>) torch.Size([1, 3, 4])
---------------------------------------------------
x.shape torch.Size([1, 3, 4])
pe.shape torch.Size([1, 3, 4])
tensor([[[-4.3381, -2.5944,  1.6134,  6.3521],
         [ 8.6884,  1.8261, -8.7149,  5.2737],
         [ 2.7669,  3.1716,  0.0828,  3.0359]]], grad_fn=<AddBackward0>) torch.Size([1, 3, 4])
---------------------------------------------------


plt.figure(figsize=(15, 5))
pe = PositionalEncoding(20, 0)
y = pe.forward(Variable(torch.zeros(1, 100, 20)))
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
plt.legend(["dim %d"%p for p in [4,5,6,7]])

In [None]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
    
        self.size = d_model
        
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        
        self.eps = eps
    
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

def attention(q, k, v, d_k, mask=None, dropout=None):
    
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    
    scores = F.softmax(scores, dim=-1)
    
    if dropout is not None:
        scores = dropout(scores)
        
    output = torch.matmul(scores, v)
    return output
    
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, emb_dim, dropout = 0.1):
        super().__init__()
        
        self.emb_dim = emb_dim
        self.k_dim = emb_dim // heads
        self.h = heads
        
        self.q_linear = nn.Linear(emb_dim, emb_dim)
        self.v_linear = nn.Linear(emb_dim, emb_dim)
        self.k_linear = nn.Linear(emb_dim, emb_dim)
        
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(emb_dim, emb_dim)
    
    def forward(self, q, k, v, mask=None):
        
        bs = q.size(0)
        
        # perform linear operation and split into N heads
        k = self.k_linear(k).view(bs, -1, self.h, self.k_dim)
        q = self.q_linear(q).view(bs, -1, self.h, self.k_dim)
        v = self.v_linear(v).view(bs, -1, self.h, self.k_dim)
        
        # transpose to get dimensions bs * N * sl * d_model
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        

        # calculate attention using function we will define next
        scores = attention(q, k, v, self.k_dim, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.emb_dim)
        output = self.out(concat)
    
        return output

class FeedForward(nn.Module):
    def __init__(self, emb_dim, ff_dim=2048, dropout = 0.1):
        super().__init__() 
    
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(emb_dim, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(ff_dim, emb_dim)
    
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x
    
    
class EncoderLayer(nn.Module):
    def __init__(self, emb_dim, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(emb_dim)
        self.dropout_1 = nn.Dropout(dropout)
        self.attn = MultiHeadAttention(heads, emb_dim, dropout=dropout)
        self.norm_2 = Norm(emb_dim)
        self.ff = FeedForward(emb_dim, dropout=dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x