In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import pandas as pd
import numpy as np
import spacy
import nltk
import time
import random

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cuda


In [3]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# PyTorch generators (CPU first, then the current CUDA device).
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# NumPy and the Python standard library.
np.random.seed(SEED)
random.seed(SEED)

In [4]:
# Shell out to the spaCy CLI to download the German model (needed once per fresh environment).
!python -m spacy download de #(RUN this if needed to download the tokenizer for de)
# Load the German and English spaCy pipelines under their shortcut names.
# NOTE(review): no download is issued for the English model, so 'en' is presumably
# already installed in this environment - confirm before running elsewhere.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 1.2MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=9903371dbdafed4db999d97332fa97b31dc5326b2bb23f3d329ceb8e8cda7439
  Stored in directory: /tmp/pip-ephem-wheel-cache-4slcrdkv/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [5]:
def tokenize_de(text):
  """
  Split a German string into a list of token strings using the
  tokenizer of the loaded `spacy_de` pipeline.
  """
  tokens = []
  for token in spacy_de.tokenizer(text):
    tokens.append(token.text)
  return tokens

def tokenize_en(text):
  """
  Split an English string into a list of token strings using the
  tokenizer of the loaded `spacy_en` pipeline.
  """
  tokens = []
  for token in spacy_en.tokenizer(text):
    tokens.append(token.text)
  return tokens

In [6]:
# Source-side (German) field: lower-cased tokens wrapped in <sos> ... <eos>,
# with batch_first=True so the batch dimension comes first.
SRC = Field(
    tokenize=tokenize_de,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
)

# Target-side (English) field: same settings, English tokenizer.
TRG = Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
)

In [7]:
# Load the Multi30k train / validation / test splits; the '.de' side is
# processed with SRC and the '.en' side with TRG.
train_data, valid_data, test_test = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.de', '.en'),
)

# Both vocabularies are built from the training split only, with min_freq=2.
TRG.build_vocab(train_data, min_freq=2)
SRC.build_vocab(train_data, min_freq=2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 604kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 168kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 160kB/s]


In [8]:
# Number of sentence pairs per batch.
BATCH_SIZE = 128

# One BucketIterator per split, each producing batches on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_test),
    device=device,
    batch_size=BATCH_SIZE,
)

In [9]:
class Encoder(nn.Module):
  """Transformer encoder: scaled token embeddings plus learned positional
  embeddings, followed by a stack of EncoderLayer blocks.

  Args:
    input_dim: number of rows in the token embedding table (source vocab size).
    hid_dim: embedding / model dimension.
    n_layers: number of EncoderLayer blocks to stack.
    n_heads: attention heads per layer (forwarded to EncoderLayer).
    pf_dim: inner size of the position-wise feed-forward sublayer.
    dropout: dropout probability.
    device: stored on the module and forwarded to every EncoderLayer.
    max_length: number of rows in the positional embedding table; inputs
      longer than this cannot be embedded.
  """
  def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
    super().__init__()

    self.device = device
    # Token lookup: input_dim entries, each a hid_dim vector.
    self.tok_embedding = nn.Embedding(input_dim, hid_dim)
    # Position lookup: one hid_dim vector per position index below max_length.
    self.pos_embedding = nn.Embedding(max_length, hid_dim)
    self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])

    self.dropout = nn.Dropout(dropout)
    # sqrt(hid_dim) scale for the token embeddings (torch.sqrt needs a tensor).
    # This is distinct from the sqrt(head_dim) scale used inside attention.
    self.scale = torch.sqrt(torch.FloatTensor([hid_dim]))

  def forward(self, src, src_mask):
    """Encode a batch of source token indices.

    Args:
      src: token indices, [batch_size, src_len].
      src_mask: passed unchanged to every layer's self-attention.
        NOTE(review): must broadcast against [batch_size, n_heads, src_len, src_len]
        there - presumably [batch_size, 1, 1, src_len]; confirm with the caller.

    Returns:
      Tensor of shape [batch_size, src_len, hid_dim].
    """
    batch_size = src.shape[0]
    src_len = src.shape[1]

    # Position indices 0..src_len-1 for every sequence, created on the same
    # device as the input so the embedding lookup works on GPU as well as CPU.
    pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)

    # `self.scale` is a plain tensor attribute (not a buffer), so it does not
    # follow `model.to(device)`; move it explicitly.
    scale = self.scale.to(src.device)

    # Scale the token embeddings (not the positional ones), matching Decoder.
    src = self.dropout((self.tok_embedding(src) * scale) + self.pos_embedding(pos))

    for layer in self.layers:
      src = layer(src, src_mask)

    # src = [batch_size, src_len, hid_dim]
    return src


In [10]:
class LayerNorm(nn.Module):
  """Layer normalisation over the last dimension with a learnable gain and bias.

  Args:
    hid_dim: size of the last dimension of the inputs.
    eps: small constant added to the standard deviation to avoid division by zero.
  """
  def __init__(self, hid_dim, eps=1e-9):
    # nn.Module must be initialised before any nn.Parameter is assigned,
    # otherwise attribute registration raises.
    super().__init__()
    self.a = nn.Parameter(torch.ones(hid_dim))   # gain, starts at 1
    self.b = nn.Parameter(torch.zeros(hid_dim))  # bias, starts at 0
    self.eps = eps

  def forward(self, x):
    """Return `a * (x - mean) / (std + eps) + b`, statistics taken over the last dim."""
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.a * (x - mean) / (std + self.eps) + self.b

In [11]:
class MultiHeadAttentionLayer(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    hid_dim: model dimension; must be divisible by n_heads.
    n_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.
    device: accepted for interface compatibility; the scale tensor follows
      the device of the inputs at call time instead.
  """
  def __init__(self, hid_dim, n_heads, dropout, device):
    super().__init__()

    # hid_dim is split evenly across the heads, so it must be divisible by n_heads.
    assert hid_dim % n_heads == 0
    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    self.WQ = nn.Linear(hid_dim, hid_dim)
    self.WK = nn.Linear(hid_dim, hid_dim)
    self.WV = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)
    self.dropout = nn.Dropout(dropout)
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim]))

  def forward(self, query, key, value, mask=None):
    """Attend from `query` positions over `key` / `value` positions.

    Args:
      query: [batch size, query len, hid dim]
      key:   [batch size, key len, hid dim]
      value: [batch size, value len, hid dim]
      mask: optional tensor broadcastable to [batch size, n heads, query len, key len];
        positions where it equals 0 are excluded from attention.

    Returns:
      (x, attention) where x is [batch size, query len, hid dim] and
      attention is [batch size, n heads, query len, key len].
    """
    batch_size = query.shape[0]

    Q = self.WQ(query)
    K = self.WK(key)
    V = self.WV(value)

    # Split hid_dim into heads and move the head axis ahead of the sequence axis:
    # Q = [batch size, n heads, query len, head dim]
    # K = [batch size, n heads, key len, head dim]
    # V = [batch size, n heads, value len, head dim]
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

    # `self.scale` is a plain tensor attribute, so it stays on the CPU when the
    # module is moved; bring it to the inputs' device before dividing.
    # energy = [batch size, n heads, query len, key len]
    energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale.to(Q.device)

    # Masked positions get a large negative score so softmax gives them ~0 weight.
    if mask is not None:
      energy = energy.masked_fill(mask == 0, -1e10)

    # attention = [batch size, n heads, query len, key len]
    attention = torch.softmax(energy, dim=-1)

    # x = [batch size, n heads, query len, head dim]
    x = torch.matmul(self.dropout(attention), V)

    # x = [batch size, query len, n heads, head dim]
    x = x.permute(0, 2, 1, 3).contiguous()

    # Merge the heads back together: x = [batch size, query len, hid dim]
    x = x.view(batch_size, -1, self.hid_dim)

    # x = [batch size, query len, hid dim]
    x = self.fc_o(x)
    return x, attention
    


In [12]:
# Two scratch tensors of shape (2, 100, 512) for experimenting with reshapes.
a, b = torch.randn(2, 100, 512), torch.randn(2, 100, 512)

In [13]:
# Split the last dimension (512) into 4 chunks of 128, then swap axes 1 and 2
# so the result is laid out as (2, 4, 100, 128).
c = a.view(2, -1, 4, 128).transpose(1, 2).contiguous()
d = b.view(2, -1, 4, 128).transpose(1, 2).contiguous()

In [14]:
# Display the shapes of both reshaped tensors.
(c.shape, d.shape)

(torch.Size([2, 4, 100, 128]), torch.Size([2, 4, 100, 128]))

In [54]:
# NOTE(review): `lin` is not defined in any cell visible here - presumably a scratch
# layer left over in the kernel. As far as this view shows, the cell would fail on a
# fresh Restart & Run All; confirm and either define `lin` or drop the cell.
lin(b)

tensor([[[[-3.1584e-01, -1.4610e-01, -2.1449e-01,  ..., -5.8498e-02,
            2.1875e-01,  1.2090e+00],
          [-2.5359e-01, -5.7685e-01, -6.8934e-02,  ..., -3.1597e-01,
            1.8308e-01,  4.3960e-02],
          [-7.7758e-01, -5.4961e-01,  6.1451e-01,  ...,  1.2833e-02,
           -1.2997e-02,  7.4842e-01],
          ...,
          [ 1.1810e+00,  9.4285e-01, -2.1197e-01,  ..., -1.9555e-01,
            9.6916e-02, -4.7627e-01],
          [ 2.2723e-01, -1.8306e-01,  7.9665e-01,  ..., -1.7253e-01,
            1.3715e-01,  2.6035e-01],
          [ 8.9522e-01, -2.6743e-02, -3.9374e-01,  ..., -1.4682e-01,
            2.3210e-01, -5.3901e-02]],

         [[ 6.9797e-01,  6.5521e-01,  3.8578e-01,  ...,  7.8776e-01,
           -6.6024e-02, -3.8928e-01],
          [-2.8730e-01,  8.3368e-01, -6.4027e-03,  ..., -8.0859e-01,
            8.0418e-01,  4.2457e-01],
          [ 1.4637e+00, -8.0052e-01, -3.8222e-01,  ..., -9.5757e-03,
            3.3649e-02,  4.1286e-01],
          ...,
     

In [30]:
class PositionFeedForwardLayer(nn.Module):
  """Position-wise feed-forward sublayer: Linear -> ReLU -> Dropout -> Linear.

  Args:
    hid_dim: input and output dimension.
    pf_dim: inner (expanded) dimension.
    dropout: dropout probability applied to the inner activation.
  """
  def __init__(self, hid_dim, pf_dim, dropout):
    super().__init__()
    self.linear1 = nn.Linear(hid_dim, pf_dim)
    self.linear2 = nn.Linear(pf_dim, hid_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Apply the feed-forward transform to the last dimension of `x`.

    The last dimension of `x` must be hid_dim; the output has the same shape as `x`.
    """
    # The dropout module was constructed but never used before; apply it to the
    # inner activation so the `dropout` argument actually takes effect.
    hidden = self.dropout(F.relu(self.linear1(x)))
    return self.linear2(hidden)

In [21]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention followed by a position-wise feed-forward
  network, each wrapped as layer_norm(x + dropout(sublayer(x)))."""
  def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
    super().__init__()
    # One normalisation per residual connection.
    self.layer_norm = LayerNorm(hid_dim)
    self.ff_layer_norm = LayerNorm(hid_dim)
    # The two sublayers.
    self.self_attn = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.positionwise_ffn = PositionFeedForwardLayer(hid_dim, pf_dim, dropout)
    # Shared dropout applied to each sublayer's output before the residual add.
    self.dropout = nn.Dropout(dropout)

  def forward(self, src, src_mask):
    """Run one encoder block.

    src is [batch, src_len, hid_dim]; src_mask is handed to the self-attention
    call unchanged. The result has the same shape as src.
    """
    # Self-attention over src itself; the attention weights are not needed here.
    attn_out = self.self_attn(src, src, src, src_mask)[0]
    attended = self.layer_norm(src + self.dropout(attn_out))

    # Feed-forward sublayer with its own residual connection and normalisation.
    ffn_out = self.positionwise_ffn(attended)
    return self.ff_layer_norm(attended + self.dropout(ffn_out))

In [33]:
class Decoder(nn.Module):
  """Transformer decoder: scaled token embeddings plus learned positional
  embeddings, a stack of DecoderLayer blocks, and a final projection to the
  output vocabulary.

  Args:
    out_dim: target vocabulary size (embedding rows and output logits).
    hid_dim: embedding / model dimension.
    pff_dim: inner size of the position-wise feed-forward sublayer.
    n_layers: number of DecoderLayer blocks to stack.
    n_heads: attention heads per layer.
    dropout: dropout probability.
    max_length: number of rows in the positional embedding table; targets
      longer than this cannot be embedded.
    device: forwarded to each DecoderLayer, whose constructor requires it.
      Defaults to None so existing call sites keep working.
  """
  def __init__(self, out_dim, hid_dim, pff_dim, n_layers, n_heads, dropout, max_length=100, device=None):
    super().__init__()

    self.pos_emb = nn.Embedding(max_length, hid_dim)
    self.tok_emb = nn.Embedding(out_dim, hid_dim)
    # DecoderLayer's signature is (hid_dim, n_heads, pf_dim, dropout, device).
    self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pff_dim, dropout, device) for _ in range(n_layers)])
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(hid_dim, out_dim)
    # torch.FloatTensor must be called (not subscripted) to build the tensor.
    self.scale = torch.sqrt(torch.FloatTensor([hid_dim]))

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Decode a batch of target token indices against the encoder output.

    Args:
      trg: target token indices, [batch size, trg len].
      enc_src: encoder output, [batch size, src len, hid dim].
      trg_mask: passed unchanged to each layer's self-attention.
      src_mask: passed unchanged to each layer's encoder attention.
        NOTE(review): both masks must broadcast against the 4-D attention
        scores inside MultiHeadAttentionLayer - confirm shapes with the caller.

    Returns:
      (output, attention): output is [batch size, trg len, out_dim]; attention
      is whatever the last layer returned, or None when there are no layers.
    """
    batch_size = trg.shape[0]
    trg_len = trg.shape[1]

    # pos = [batch size, trg len], created on the input's device so the
    # embedding lookup also works when the model lives on the GPU.
    pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)

    # `self.scale` is a plain tensor attribute and does not move with the module.
    scale = self.scale.to(trg.device)
    trg = self.dropout((self.tok_emb(trg) * scale) + self.pos_emb(pos))

    # trg = [batch size, trg len, hid dim]
    # attention = [batch size, n heads, trg len, src len]
    attention = None  # stays None if the stack is empty
    for layer in self.layers:
      trg, attention = layer(trg, enc_src, trg_mask, src_mask)

    output = self.linear(trg)
    return output, attention
                              

In [34]:
class DecoderLayer(nn.Module):
  """One decoder block: target self-attention, attention over the encoder
  output, then a position-wise feed-forward network. Each sublayer is wrapped
  as layer_norm(x + dropout(sublayer(x))).

  Args:
    hid_dim: model dimension.
    n_heads: number of attention heads.
    pf_dim: inner size of the position-wise feed-forward sublayer.
    dropout: dropout probability.
    device: forwarded to both attention layers.
  """
  def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
    super().__init__()
    self.layer_norm = LayerNorm(hid_dim)       # after self-attention
    self.enc_layer_norm = LayerNorm(hid_dim)   # after encoder attention
    self.pos_layer_norm = LayerNorm(hid_dim)   # after feed-forward
    self.self_attn = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.enc_attn = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.positionwise_ff = PositionFeedForwardLayer(hid_dim, pf_dim, dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Run one decoder block.

    Args:
      trg: [batch size, trg len, hid dim]
      enc_src: [batch size, src len, hid dim]
      trg_mask: passed unchanged to the self-attention call.
      src_mask: passed unchanged to the encoder-attention call.

    Returns:
      (trg, attention): the updated target representation (same shape as the
      input trg) and the attention weights from the encoder-attention call.
    """
    # Self-attention over the target sequence.
    _trg, _ = self.self_attn(trg, trg, trg, trg_mask)

    # Use the attribute names actually created in __init__ (`layer_norm`, `enc_attn`).
    trg = self.layer_norm(trg + self.dropout(_trg))

    # Queries come from the target; keys and values from the encoder output.
    _trg, attention = self.enc_attn(trg, enc_src, enc_src, src_mask)

    trg = self.enc_layer_norm(trg + self.dropout(_trg))

    _trg = self.positionwise_ff(trg)

    trg = self.pos_layer_norm(trg + self.dropout(_trg))
    return trg, attention

In [18]:
# Scratch probe of the built-in nn.LayerNorm with a LongTensor passed as its shape
# argument; the repr below shows each element kept as a 0-d tensor.
# NOTE(review): this looks like an API experiment rather than part of the model - confirm.
nn.LayerNorm(torch.LongTensor([1,2,3,4]))

LayerNorm((tensor(1), tensor(2), tensor(3), tensor(4)), eps=1e-05, elementwise_affine=True)

In [14]:
# Tile the index row 0..99 three times along a new leading dimension -> shape (3, 100).
torch.arange(100).repeat(3, 1)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
         72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
         90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
         72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
         90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         

In [4]:
# Square root of 512 as a one-element float tensor.
torch.FloatTensor([512]).sqrt()

tensor([22.6274])

In [6]:
# Reciprocal of sqrt(512) as a one-element float tensor.
1 / torch.FloatTensor([512]).sqrt()

tensor([0.0442])

In [25]:
# Small 4x4 random scratch tensor (rebinds `a`).
a = torch.randn(size=(4, 4))

In [20]:
# Row of column indices of `a`, given a leading axis and tiled twice -> shape (2, a.shape[1]).
torch.arange(a.shape[1])[None, :].repeat(2, 1)

tensor([[0, 1, 2, 3],
        [0, 1, 2, 3]])

Object `torch.repeat` not found.
