In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [3]:
class EncoderDecoder(nn.Module):
    """Generic encoder-decoder wrapper.

    Holds the five sub-components of a sequence-to-sequence model and wires
    them together: the source sequence is embedded and encoded into a
    "memory", and the target sequence is embedded and decoded against it.

    Args:
        encoder: callable invoked as ``encoder(embedded_src, src_mask)``.
        decoder: callable invoked as
            ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
            NOTE(review): the decoder is not defined in this notebook yet;
            this argument order is an assumption -- confirm once it exists.
        src_embed: callable mapping source token ids to embeddings.
        tgt_embed: callable mapping target token ids to embeddings.
        generator: stored for callers to apply to the decoder output; it is
            not invoked by ``forward``.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and then decode ``tgt`` against the encoded memory."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed the source sequence and run it through the encoder."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed the target sequence and run it through the decoder.

        ``forward`` relied on this method, but it was missing, so every
        forward pass failed with ``AttributeError``.
        """
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

### Embedding

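We need to map each word in the vocabulary to a dense vector. The embedding is learned during training, so that words that are semantically similar end up with similar vectors.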

In [4]:
class Embedder(nn.Module):
    """Learned lookup table from token ids to dense vectors.

    Args:
        vocab_size: number of distinct token ids (rows of the table).
        dimensionality: size of each embedding vector (columns of the table).
    """

    def __init__(self, vocab_size, dimensionality):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dimensionality)

    def forward(self, x):
        """Return the embedding vectors for the integer token ids in ``x``."""
        return self.embed(x)

### Multi-headed Attention

Standard RNN architectures typically fall short at capturing the meaning of really long sentences. In **attention mechanisms**, each output word depends on a weighted combination of each input word, rather than just the final hidden state from a standard RNN. The weights tell the decoder which state to pay most attention to when generating each word in the output sentence. 

**Multi-headed attention** splits the projected queries, keys, and values into equal-sized chunks, one per head. Attention is applied to each chunk in parallel, and the per-head results are then concatenated. This gives the attention layer multiple "representation subspaces", and it is the main component of the transformer model. 

Our paper uses a specific form of attention, called **scaled dot-product attention**. The equation is: 

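$$\mathrm{Attention}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \mathrm{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^{\top}}{\sqrt{n}}\right)\mathbf{V}$$

where $n$ is the dimensionality of each key vector (the number of columns of $\mathbf{K}$).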


In [38]:
#nn.Module is the base class referenced for all neural network modules in pytorch

class MultiHeadedAttention(nn.Module):
    """Multi-headed scaled dot-product attention.

    The queries, keys and values are each linearly projected, split into
    ``heads`` equal-sized chunks, attended over independently, concatenated
    back to ``d_model`` features and passed through an output projection.

    Args:
        heads: number of attention heads; must evenly divide ``d_model``.
        d_model: feature size of the inputs and of the output.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        if heads <= 0 or d_model % heads != 0:
            raise ValueError(
                "d_model ({}) must be divisible by a positive number of "
                "heads ({})".format(d_model, heads)
            )
        self.d_model = d_model
        self.h = heads
        # Width of each head's slice of the model dimension. The paper uses
        # d_model = 512 with 8 heads, which is where its d_k = 64 comes from.
        self.d_K = d_model // heads

        # Projections producing the Q, K and V matrices used by attention.
        self.Q_linear = nn.Linear(d_model, d_model)
        self.V_linear = nn.Linear(d_model, d_model)
        self.K_linear = nn.Linear(d_model, d_model)
        # Output projection applied to the concatenated heads (W^O in the paper).
        self.out_linear = nn.Linear(d_model, d_model)

        # Dropout on the attention weights helps prevent overfitting.
        self.dropout = nn.Dropout(dropout)

    def Attention(self, Q, K, V, d_K, dropout=None, mask=None):
        """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_K)) V``.

        Args:
            Q: queries, shape ``(..., n_queries, d_K)``.
            K: keys, shape ``(..., n_keys, d_K)``.
            V: values, shape ``(..., n_keys, d_V)``.
            d_K: key dimensionality used to scale the dot products.
            dropout: optional callable applied to the attention weights.
            mask: optional tensor broadcastable to ``(..., n_queries, n_keys)``;
                positions where it equals 0 receive (effectively) no weight.

        Returns:
            Tensor of shape ``(..., n_queries, d_V)``: for each query, the
            attention-weighted average of the value vectors.
        """
        # Transpose only the last two axes so leading batch/head axes survive.
        logits = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_K)
        if mask is not None:
            # Most negative finite value -> ~0 weight after softmax, no NaNs.
            logits = logits.masked_fill(mask == 0, torch.finfo(logits.dtype).min)
        # Normalise over the key axis so each query's weights sum to 1.
        weights = F.softmax(logits, dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        # Matrix product (not elementwise) mixes the value vectors.
        return torch.matmul(weights, V)

    def forward(self, q, k, v, mask=None):
        """Apply multi-headed attention.

        Args:
            q, k, v: tensors of shape ``(batch, seq_len, d_model)``.
            mask: optional mask; assumed to be ``(batch, 1, n_keys)`` or
                ``(batch, n_queries, n_keys)`` -- TODO confirm against the
                mask-building code once it exists. The same mask is applied
                to every head.

        Returns:
            Tensor of shape ``(batch, n_queries, d_model)``.
        """
        batch = q.size(0)

        def split_heads(x):
            # (batch, seq, d_model) -> (batch, heads, seq, d_K)
            return x.view(batch, -1, self.h, self.d_K).transpose(1, 2)

        Q = split_heads(self.Q_linear(q))
        K = split_heads(self.K_linear(k))
        V = split_heads(self.V_linear(v))

        if mask is not None:
            mask = mask.unsqueeze(1)  # add a head axis for broadcasting

        per_head = self.Attention(Q, K, V, self.d_K, dropout=self.dropout, mask=mask)

        # (batch, heads, seq, d_K) -> (batch, seq, d_model)
        concat = per_head.transpose(1, 2).contiguous().view(batch, -1, self.d_model)
        return self.out_linear(concat)

In [51]:
# Small attention module for a quick smoke test:
# three heads over a three-dimensional model.
net = MultiHeadedAttention(heads=3, d_model=3)

### Loss Function