In [6]:
import math, copy, sys
sys.path.append('/anaconda/anaconda3/lib/python3.6/site-packages/torchtext-0.2.1-py3.6.egg')
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import import_ipynb
from Elements import *

In [2]:
class EncoderLayer(nn.Module):
    '''
    One encoder block made of two residual sub-layers: attention, then
    feed-forward. Each sub-layer normalizes its input first, and its
    dropped-out output is added back onto the un-normalized input.
    '''
    def __init__(self, emb_dim, heads, dropout=0.1):
        '''
        input:
        emb_dim: size handed to Norm, MultiHeadAttention and FeedForward
        heads: head count handed to MultiHeadAttention
        dropout: rate shared by both nn.Dropout modules and forwarded to
        the attention and feed-forward modules
        '''
        super().__init__()
        # attention sub-layer: norm -> attention -> dropout
        self.norm_1 = Norm(emb_dim)
        self.dropout_1 = nn.Dropout(dropout)
        self.attn = MultiHeadAttention(heads, emb_dim, dropout=dropout)
        # feed-forward sub-layer: norm -> feed-forward -> dropout
        self.norm_2 = Norm(emb_dim)
        self.ff = FeedForward(emb_dim, dropout=dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        '''
        input:
        x: activations entering the block (presumably of shape
        (batch size, sequence length, embedding dimensions) - confirm with caller)
        mask: passed unchanged as the fourth argument of the attention module
        output: x after both residual sub-layers
        '''
        # the same normalized tensor fills the first three attention arguments
        normed = self.norm_1(x)
        attended = self.attn(normed, normed, normed, mask)
        x = x + self.dropout_1(attended)

        # the feed-forward branch works on the re-normalized running sum
        ff_out = self.ff(self.norm_2(x))
        return x + self.dropout_2(ff_out)

class Encoder(nn.Module):
    '''
    Encoder stack: token embedding, positional encoding, n_layers
    EncoderLayer blocks applied one after another, then a final Norm.
    '''
    def __init__(self, vocab_size, emb_dim, n_layers, heads, dropout):
        '''
        input:
        vocab_size, emb_dim: handed to Embedder
        n_layers: number of layers requested from get_clones and run in forward
        heads: handed to each EncoderLayer
        dropout: handed to PositionalEncoder and to each EncoderLayer
        '''
        super().__init__()
        self.n_layers = n_layers
        self.embed = Embedder(vocab_size, emb_dim)
        self.pe = PositionalEncoder(emb_dim, dropout=dropout)
        # a single prototype layer is built and given to get_clones with the count
        prototype = EncoderLayer(emb_dim, heads, dropout)
        self.layers = get_clones(prototype, n_layers)
        self.norm = Norm(emb_dim)

    def forward(self, source_sequence, source_mask):
        '''
        input:
        source_sequence (sequence of source tokens) of shape (batch size, sequence length)
        source_mask (mask over input sequence) of shape (batch size, 1, sequence length)
        output: tensor of shape (batch size, sequence length, embedding dimensions);
        the shape is the same after the layers and after the final norm
        '''
        hidden = self.pe(self.embed(source_sequence))
        # only the first n_layers entries of self.layers are applied, in index order
        for depth in range(self.n_layers):
            encoder_layer = self.layers[depth]
            hidden = encoder_layer(hidden, source_mask)
        return self.norm(hidden)