In [45]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import math

In [5]:
!python -m spacy download en
!python -m spacy download de

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 1.8 MB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047106 sha256=128b53b3e4a6d1c8dec59f503d1ec50741df87748eaabb04ccca54273a6077a3
  Stored in directory: /private/var/folders/rt/ppzpkmzd72335rvk3gsxw3q40000gn/T/pip-ephem-wheel-cache-hgom4g2f/wheels/b7/0d/f0/7ecae8427c515065d75410989e15e5785dd3975fe06e795cd9
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/anaconda3/envs/te

In [6]:
import torchtext
from torchtext.experimental.datasets import IWSLT
from torchtext.data.utils import get_tokenizer

# tokenize the dataset (pairs of sentence strings -> pairs of arrays of indices)
src_tokenizer = get_tokenizer("spacy", language='de')
tgt_tokenizer = get_tokenizer("spacy", language='en')
train_dataset, valid_dataset, test_dataset = IWSLT(tokenizer=(src_tokenizer, tgt_tokenizer))

100%|██████████| 196884/196884 [00:46<00:00, 4209.42lines/s]
100%|██████████| 196884/196884 [00:41<00:00, 4726.86lines/s]


In [7]:
!python -m pip show torchtext

Name: torchtext
Version: 0.9.0.dev20210123
Summary: Text utilities and datasets for PyTorch
Home-page: https://github.com/pytorch/text
Author: PyTorch core devs and James Bradbury
Author-email: jekbradbury@gmail.com
License: BSD
Location: /opt/anaconda3/envs/test-transformer/lib/python3.7/site-packages
Requires: tqdm, requests, torch, numpy
Required-by: 


In [8]:
# vocab allows us see which index maps to which word
de_vocab, en_vocab = train_dataset.get_vocab()

def print_top(n_rows, mode='words'):
    if mode == 'words':
        for i in range(n_rows):
            de_sentence = [de_vocab.itos[index] for index in train_dataset[i][0]]
            en_sentence = [en_vocab.itos[index] for index in train_dataset[i][1]]
            print((de_sentence, en_sentence))
    elif mode == 'indices':
        for i in range(n_rows):
            print(train_dataset[i])

print_top(100, 'words')
print_top(10, 'indices')

(['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange', '.', 'Ich', 'bin', 'Dave', 'Gallo', '.', '\n'], ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange', '.', 'I', "'m", 'Dave', 'Gallo', '.', '\n'])
(['Wir', 'werden', 'Ihnen', 'einige', 'Geschichten', 'über', 'das', 'Meer', 'in', 'Videoform', 'erzählen', '.', '\n'], ['And', 'we', "'re", 'going', 'to', 'tell', 'you', 'some', 'stories', 'from', 'the', 'sea', 'here', 'in', 'video', '.', '\n'])
(['Wir', 'haben', 'ein', 'paar', 'der', 'unglaublichsten', 'Aufnahmen', 'der', 'Titanic', ',', 'die', 'man', 'je', 'gesehen', 'hat', ',', ',', 'und', 'wir', 'werden', 'Ihnen', 'nichts', 'davon', 'zeigen', '.', '\n'], ['We', "'ve", 'got', 'some', 'of', 'the', 'most', 'incredible', 'video', 'of', 'Titanic', 'that', "'s", 'ever', 'been', 'seen', ',', 'and', 'we', "'re", 'not', 'going', 'to', 'show', 'you', 'any', 'of', 'it', '.', '\n'])
(['Die', 'Wahrheit', 'ist', ',', 'dass', 'die', 'Titanic', '–', 'obwohl', 'sie', 'alle', 'Kinokassenrekorde', 'b

In [39]:
print(len(en_vocab.itos))
print(len(en_vocab.stoi))
print(len(de_vocab.itos))
print(len(de_vocab.stoi))

58423
58423
133406
133406


In [69]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [70]:
def embed(x, vocab_size, d_model=512):
    # two embeddings (1) each token's numeral value is mapped to a embedding vector, index (scalar) -> embedding vector (size of d_model) 
    # (2) positional embedding is applied
    number_to_embedding = nn.Embedding(vocab_size, d_model)
    pos_embedding = PositionalEncoding(d_model)
    x = number_to_embedding(x) * math.sqrt(d_model)
    x = pos_embedding(x)
    return x

In [62]:
class Batch:
    def __init__(self, src, trg=None, pad_value=0):
        src_vocab_size, trg_vocab_size = len(de_vocab.itos), len(en_vocab.itos)
        # input src & trg are shape (sentence_len, batch_size) and embedded into (sentence_len, batch_size, d_model)
        self.src = embed(src, src_vocab_size)
        if trg != None:
            # given src & trg_x, we try to predict trg_y, which has ntokens words (i.e. we make ntokens predictions)
            trg_embedding = embed(trg, trg_vocab_size)
            self.trg_x = trg_embedding[:-1, :, :]
            self._trg_y = trg[1:, :]
            self.trg_y = self._trg_y.reshape(-1)
            self.ntokens = (self._trg_y != pad_value).sum()

In [67]:
# this tells dataloader how you want your batch to look like 
# input is a list of tensors of size batch_size (dataloader just feeds you a mini-batch of batch_size at a time and you can process it),
# where each tensor is [src, trg]
# output is whatever you want in train_epoch
def collate_batch(batch_data, pad_idx=1):
    max_src_len = max([len(sentence_pair[0]) for sentence_pair in batch_data])
    max_trg_len = max([len(sentence_pair[1]) for sentence_pair in batch_data])
    # initialize the padding in the shape of the result src/trg we want
    res_src = torch.zeros(len(batch_data), max_src_len).long() + pad_idx
    res_trg = torch.zeros(len(batch_data), max_trg_len).long() + pad_idx
    # layer the actual sentence on top of the padding
    for i, sentence_pair in enumerate(batch_data):
        src_sentence, trg_sentence = sentence_pair        
        res_src[i, :len(src_sentence):], res_trg[i, :len(trg_sentence):] = src_sentence.long(), trg_sentence.long() # the first part of sentence are filled with words, the rest are pads
        
    return Batch(res_src, res_trg)

In [16]:
# divide the data into batches, using Dataloader
sorted_train_dataset = sorted(train_dataset, key=lambda x: (len(x[0]), len(x[1])))

In [18]:
# the output from a dataloader must be of shape u
dataloader = DataLoader(sorted_train_dataset, batch_size=16, shuffle=False, collate_fn=lambda b: collate_batch(b))

In [66]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fc0cea47e90>

In [73]:
for i, batch in enumerate(dataloader):
    if i > 0:
        print("## batch.src: ", batch.src)
        print("## batch.trg: ", batch.trg_x)
        break
    print(i)

0
## batch.src:  tensor([[[ 36.3410,  -5.3847,   0.0000,  ..., -15.5931,   0.0000,  22.1422],
         [-12.9151,  -3.6408, -27.6591,  ...,   0.0000,   5.7168,   3.4524],
         [  3.6409, -10.4176,  22.5471,  ...,   3.1294,   6.3004, -14.7257]],

        [[-37.4013, -13.9052,  -0.0000,  ...,  29.8335,  15.2723,  16.2829],
         [-11.9801,  -4.1515,  -0.0000,  ...,   8.5213,   5.7169,   3.4524],
         [  4.5759,  -0.0000,  23.4603,  ...,   0.0000,   6.3005, -14.7257]],

        [[ 45.1240,  -4.1163,  56.4353,  ...,  -9.1390,   0.2787,  29.9507],
         [-11.9048,  -5.2143, -26.6186,  ...,   8.5213,   5.7170,   3.4524],
         [  0.0000, -11.9911,   0.0000,  ...,   3.1294,   0.0000,  -0.0000]],

        ...,

        [[  0.7399,   9.6344,   0.0000,  ...,  29.2805, -16.5202,  18.6147],
         [ 33.7402,  -0.7952,  17.9800,  ..., -30.9216, -36.8936, -29.0791],
         [-12.4482,  -3.7436, -27.6877,  ...,   8.5213,   5.7183,   3.4524]],

        [[ 26.8298, -15.5965, -29.360

In [None]:
def train_batch(batch, model, hyper_params):
    def forward_pass():
        src = batch.src.to(device=device)
        trg_x = batch.trg_x.to(device=device)
        pred = model(src, trg_x)
        return pred
    
    def calculate_loss(pred):
        trg_y = batch.trg_y.to(device=device)
        loss = hyper_params.criterion(pred.view(-1,tgt_vocab_size), trg_y)  
        return loss

    pred = foward_pass()
    loss = calculate_loss(pred)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return float(loss)

In [None]:
# Every epoch is an iteration over the entire training set (how many steps are in one epoch depends on the batch_size)
def train_epoch(data, model, hyper_params):
    model.train() 
    # varialbes for logging
    log = Log()
    # train the model batch-by-batch 
    for i, batch in enumerate(data):
        batch_loss = train_batch(batch, model, hyper_params)
        log.batch_info(batch, batch_loss)
        if i % 50 == 1:
            log.show_every_50batches(i)
    log.show_epoch()
    scheduler.step()

In [None]:
# variables
pad_value = TRG.vocab.stoi['<pad>']
d_model = 512
lr = 5.0 

# initialize the training params
# data = (Batch(batch.src, batch.trg, pad_value) for batch in data_loader)

# initialize model
encoder_decoder = nn.Transformer().to(device)
generator = Generator(d_model, tgt_vocab_size).to(device)
model = Model(encoder_decoder, generator)

hyper_params = HyperParams(criterion = nn.CrossEntropyLoss(), optimizer = torch.optim.SGD(model.parameters(), lr=lr), scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95))

In [None]:
# train for 10 epochs
for epoch in range(10):
    print("------------Training epoch ", epoch, "--------------")
    train_epoch(data, model, hyper_params)
  # set to eval model to check how good the model is after each loop
  # print(train_epoch((batchify(b, pad_idx, device) for b in valid_iterator), model, criterion, epoch))