In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [3]:
class Embedder():
    def __init__(self, vocab_size, embedding_dim):
        super(Embedder, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

e = Embedder(3,4)

In [4]:
vocab = ["Hello", "I", "am", "Dylan"]
word_to_ix = {word: i for i, word in enumerate(vocab)}
print(word_to_ix)



{'Hello': 0, 'I': 1, 'am': 2, 'Dylan': 3}


https://arxiv.org/abs/1706.03762

http://jalammar.github.io/illustrated-transformer/

http://nlp.seas.harvard.edu/2018/04/03/attention.html

https://medium.com/@kolloldas/building-the-mighty-transformer-for-sequence-tagging-in-pytorch-part-i-a1815655cd8

https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec

The following code implements the **positional encoding** step of the Transformer. It is needed to let the model understand the relative positions of each word in the embedding matrix. 

In [189]:
class PositionalEncoder(nn.Module):
    
    def __init__(self, d_model, max_seq_len):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len
    
        #initialize positional encoding matrix to contain all zeroes
        #max_seq_len = longest sentence we expect to observe in embedding matrix
        
        pos_encoding_mat = torch.zeros(max_seq_len, d_model)
    
        #iterate over one sentence position at a time
        
        for pos in range(max_seq_len):
            
            #iterate over each sentence at current pos
            #Step size is 2 because you have to append cos sinusoid next to sin sinusoid
            for i in range(0, d_model, 2):
                #Create sinusoid for each entry in embedding matrix
                #pos = embedding matrix row (embedded word location)
                #i = embedding matrix column (embedding vector location)
                pos_encoding_mat[pos, i] = math.sin(pos / (np.power(10000, (2 * i) / d_model)))
                pos_encoding_mat[pos, i + 1] = math.cos(pos / (np.power(10000, (2 * (i+1)) / d_model)))
                
                pos_encoding_mat[pos, i] = 1 
        
        pos_encoding_mat = pos_encoding_mat.unsqueeze(0)
        
        self.register_buffer('pos_encoding_mat', pos_encoding_mat)
        print(pos_encoding_mat)
        
    #function to add positional encoding matrix to embedding matrix
    def forward(self, x):
        x = x + Variable(self.pos_encoding_mat[:,:x.size(1)], requires_grad = False)
        return x
    

In [194]:
#10 columns
#longest sequence will be length 500
en = PositionalEncoder(10, 12)
x = torch.zeros(10,12)
en.forward(x)


tensor([[[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.9875,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.9502,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.8891,  1.0000,  0.9999,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.8057,  1.0000,  0.9999,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.7021,  1.0000,  0.9998,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.5809,  1.0000,  0.9997,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.4452,  1.0000,  0.9996,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
         [ 1.0000,  0.2983,  1.0000,  0.9995,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000],
 

RuntimeError: The size of tensor a (12) must match the size of tensor b (10) at non-singleton dimension 2

The dimensions (# columns) of the Query, Value, and Key matrices are hyperparameters. Also, if there are $n$ words in a sequence (input sentence), then the $Q$ ,$V$, and $K$ matrices will have $n$ rows. After $matmul(Q, K)$, the dimensionality of the resulting matrix will be $n \times n$.

In [181]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, input_dim, key_dim):
        super(MultiHeadAttention, self).__init__()
        self.heads = heads
        self.input_dim = input_dim
        self.key_dim = key_dim
        
        #Query, value, and key matrices
        #first arg: hidden size (# of words in input)
        #second arg: feature size (hyperparameter)
        self.W_Q = nn.Linear(input_dim, key_dim)
        self.W_V = nn.Linear(input_dim, key_dim)
        self.W_K = nn.Linear(input_dim, key_dim)
    
    def generate_heads(self, x):
        Q = self.W_Q(x)
        V = self.W_V(x)
        K = self.W_V(x)
        print(Q,V,K)
        #print("Generating " + str(self.heads) + " heads") 
        #for tensor in [self.W_Q, self.W_V, self.W_K]:
        #    torch.split(tensor, self.heads, dim=1)
        
    def ScaledDotAttention(self, embeddings, Q, V, K, d_K):
        K_T = torch.transpose(torch.matmul(embeddings, W_K), 0, -1)
        soft = F.softmax(K_T / np.sqrt(d_K), -1)
        return torch.matmul(soft, V)   
    
W = torch.rand(1,3)
Q = torch.rand(1,3)
V = torch.rand(1,3)

word = torch.rand(3,1)
print(W, word)

m = MultiHeadAttention(6, 1, 3)

m.generate_heads(word)

tensor([[0.5220, 0.5080, 0.4684]]) tensor([[0.5964],
        [0.1537],
        [0.5928]])
tensor([[0.4996, 0.6760, 1.0904],
        [0.2569, 0.6730, 0.7768],
        [0.4977, 0.6760, 1.0879]], grad_fn=<ThAddmmBackward>) tensor([[ 0.6447, -0.9489, -0.4299],
        [ 0.2664, -0.6795, -0.2627],
        [ 0.6417, -0.9467, -0.4286]], grad_fn=<ThAddmmBackward>) tensor([[ 0.6447, -0.9489, -0.4299],
        [ 0.2664, -0.6795, -0.2627],
        [ 0.6417, -0.9467, -0.4286]], grad_fn=<ThAddmmBackward>)


In [46]:
V = nn.Linear(2,5)
x = torch.rand(2,2)
print(V(x))

tensor([[ 0.4434,  0.1025,  0.4296, -0.7735, -0.8767],
        [ 0.0263, -0.1078, -0.1602, -0.4200, -0.7669]],
       grad_fn=<ThAddmmBackward>)
