<a href="https://colab.research.google.com/github/ghlai9665/transformer-implementation/blob/main/Transformer_Implementation_PyTorch_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dependencies


In [1]:
!pip uninstall torchtext torch
!pip install --pre torch torchtext -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
!pip install numpy matplotlib spacy seaborn

Uninstalling torchtext-0.3.1:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/test/common/*
    /usr/local/lib/python3.6/dist-packages/test/data/*
    /usr/local/lib/python3.6/dist-packages/torchtext-0.3.1.dist-info/*
    /usr/local/lib/python3.6/dist-packages/torchtext/*
Proceed (y/n)? y
  Successfully uninstalled torchtext-0.3.1
Uninstalling torch-1.7.0+cu101:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/lib/python3.6/dist-packages/caffe2/*
    /usr/local/lib/python3.6/dist-packages/torch-1.7.0+cu101.dist-info/*
    /usr/local/lib/python3.6/dist-packages/torch/*
Proceed (y/n)? y
y
  Successfully uninstalled torch-1.7.0+cu101
Looking in links: https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
Collecting torch
[?25l  Downloading https://download.pytorch.org/whl/nightly/cpu/torch-1.8.0.dev20210131%2Bcpu-cp36-cp36m-linux_x86_64.whl (167.7MB)
[K     |████████████████████████████████| 167.7MB 9

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn
import math
import time
seaborn.set_context(context="talk")
%matplotlib inline

In [3]:
!pip show torchtext

Name: torchtext
Version: 0.9.0.dev20210130
Summary: Text utilities and datasets for PyTorch
Home-page: https://github.com/pytorch/text
Author: PyTorch core devs and James Bradbury
Author-email: jekbradbury@gmail.com
License: BSD
Location: /usr/local/lib/python3.6/dist-packages
Requires: tqdm, torch, requests, numpy
Required-by: 


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load Data

In [5]:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 1.7MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907057 sha256=8a5438ac1c516e7cada576dec363438bc2c3f05715d3444a0eb3b1db35b4a199
  Stored in directory: /tmp/pip-ephem-wheel-cache-9gtlncdb/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [6]:
from torchtext.experimental.datasets import IWSLT
from torchtext.data.utils import get_tokenizer

# tokenize the dataset (pairs of sentence strings -> pairs of arrays of indices)
src_tokenizer = get_tokenizer("spacy", language='de')
tgt_tokenizer = get_tokenizer("spacy", language='en')
train_dataset, valid_dataset, test_dataset = IWSLT(tokenizer=(src_tokenizer, tgt_tokenizer))
# vocab allows us see which index maps to which word
de_vocab, en_vocab = train_dataset.get_vocab()

2016-01.tgz: 188MB [00:02, 67.2MB/s]
100%|██████████| 196884/196884 [00:28<00:00, 6846.71lines/s]
100%|██████████| 196884/196884 [00:19<00:00, 10250.79lines/s]


In [101]:
class Batch:
    def __init__(self, src, trg=None, pad_value=0):
        src_vocab_size, trg_vocab_size = len(de_vocab.itos), len(en_vocab.itos)
        # input src & trg are shape (batch_size, sentence_len) and embedded into (batch_size, sentence_len, d_model)
        # finally transposed into (sentence_len, batch_size, d_model)
        assert src.shape[0] == trg.shape[0], "src and trg should have the same batch size!"

        batch_size, src_sentence_len = src.shape
        # print("### trg.shape: ", trg.shape)

        self.src_padding_mask = get_padding_mask(src)
        self.src = embed(src, src_vocab_size).transpose(0, 1)
        
        if trg != None:
            # given src & trg_x...
            self.trg_padding_mask = get_padding_mask(trg[:,:-1])
            trg_embedding = embed(trg, trg_vocab_size)
            self.trg_x = trg_embedding[:, :-1, :].transpose(0, 1)
            trg_sentence_len = self.trg_x.shape[0]
            self.trg_attn_mask = generate_square_subsequent_mask(trg_sentence_len)
            # ...we try to predict trg_y, which has ntokens words (i.e. we make ntokens predictions)
            self.trg_y = trg[:, 1:].transpose(0,1)
            self.ntokens = (self.trg_y != pad_value).sum()

In [8]:
# this tells dataloader how you want your batch to look like 
# input is a list of tensors of size batch_size (dataloader just feeds you a mini-batch of batch_size at a time and you can process it),
# where each tensor is [src, trg]
# output is whatever you want in train_epoch
def collate_batch(batch_data, pad_idx=1):
    max_src_len = max([len(sentence_pair[0]) for sentence_pair in batch_data])
    max_trg_len = max([len(sentence_pair[1]) for sentence_pair in batch_data])
    # initialize the padding in the shape of the result src/trg we want
    res_src = torch.zeros(len(batch_data), max_src_len).long() + pad_idx
    res_trg = torch.zeros(len(batch_data), max_trg_len).long() + pad_idx
    # layer the actual sentence on top of the padding
    for i, sentence_pair in enumerate(batch_data):
        src_sentence, trg_sentence = sentence_pair        
        res_src[i, :len(src_sentence):], res_trg[i, :len(trg_sentence):] = src_sentence.long(), trg_sentence.long() # the first part of sentence are filled with words, the rest are pads
        
    return Batch(res_src, res_trg)

In [9]:
# divide the data into batches, using Dataloader
sorted_train_dataset = sorted(train_dataset, key=lambda x: (len(x[0]), len(x[1])))
# the output from a dataloader must be of shape u
dataloader = DataLoader(sorted_train_dataset, batch_size=16, shuffle=False, collate_fn=lambda b: collate_batch(b))

In [10]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    return mask

# input shape (N, S), you need to get src_padding_mask and trg_padding_mask
def get_padding_mask(batch, pad_index = 1):
    # batch_size, sentence_len = batch[1]
    # shape (N, S)
    padding_mask = (batch == pad_index)
    return padding_mask

# Explore Data

In [None]:
def print_top(n_rows, mode='words'):
  if mode == 'words':
    for i in range(n_rows):
      de_sentence = [de_vocab.itos[index] for index in train_dataset[i][0]]
      en_sentence = [en_vocab.itos[index] for index in train_dataset[i][1]]
      print((de_sentence, en_sentence))
  elif mode == 'indices':
    for i in range(n_rows):
      print(train_dataset[i])

print_top(100, 'words')
print_top(10, 'indices')

In [12]:
print(de_vocab.stoi['<pad>'])
print(en_vocab.stoi['<pad>'])

1
1


In [13]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [14]:
def embed(x, vocab_size, d_model=512):
    # two embeddings (1) each token's numeral value is mapped to a embedding vector, index (scalar) -> embedding vector (size of d_model) 
    # (2) positional embedding is applied
    number_to_embedding = nn.Embedding(vocab_size, d_model)
    pos_embedding = PositionalEncoding(d_model)
    x = number_to_embedding(x) * math.sqrt(d_model)
    x = pos_embedding(x)
    return x

In [55]:
for i, batch in enumerate(dataloader):
    if i > 0:
      break
    print("## batch.src: ", batch.src.shape)
    print("## batch.trg_x: ", batch.trg_x.shape)
    print("## batch.trg_y: ", batch.trg_y.shape)
    # print("## trg_attn_mask: ", batch.trg_attn_mask.shape, batch.trg_attn_mask)
    print("## trg_padding_mask: ", batch.trg_padding_mask.shape, batch.trg_padding_mask)
    print("## src_padding_mask: ", batch.src_padding_mask.shape, batch.src_padding_mask)


    print(i)

## batch.src:  torch.Size([2, 16, 512])
## batch.trg_x:  torch.Size([3, 16, 512])
## batch.trg_y:  torch.Size([3, 16])
## trg_padding_mask:  torch.Size([16, 3]) tensor([[False, False,  True],
        [False, False,  True],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False],
        [False, False, False]])
## src_padding_mask:  torch.Size([16, 2]) tensor([[False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [Fal

# Define Functions for Training

In [92]:
xtokens = 3
a = torch.randn(xtokens, 16, 5); print(a.shape)
b = a.view(-1, xtokens); print(b.shape)
# [3, 16, 132937]
# b = a.view(-1, 4); print(b.shape)
# c = a.reshape(-1); print(c.shape)

torch.Size([3, 16, 5])
torch.Size([80, 3])


In [93]:
class Generator(nn.Module):
  def __init__(self, d_model, vocab):
    super().__init__()
    self.proj = nn.Linear(d_model, vocab) # define the linear projection, which takes d_model sized activation and output linear mapping to assign probability to each of the vocab
  
  def forward(self, x):
    return F.log_softmax(self.proj(x), dim=-1)

In [94]:
class Log:
  def __init__(self):
    self.start = time.time()
    self.total_loss = 0
    self.total_tokens = 0
    # loss & tokens accumulated over the last < 50 steps
    self.cur_loss = 0
    self.cur_tokens = 0 

  def batch_info(batch, loss):
    self.total_loss += loss
    self.total_tokens += int(batch.ntokens)
    self.cur_loss += loss
    self.cur_tokens += int(batch.ntokens)
  
  def reset_every_50batches():
    self.cur_tokens = 0
    self.cur_loss = 0
    self.start = time.time()

  def show_every_50batches(i):
    elapsed = time.time() - self.start
    print("-----")
    print("Epoch step: %d  Average Loss Per Token Over the Last 50 Batch: %f  Processed Tokens per Sec %f" % (i, self.cur_loss / self.cur_tokens, self.cur_tokens / elapsed))
    self.reset_every_50batches()

  def show_epoch():
    print("Total loss for the epoch is: ", total_loss / total_tokens)

In [95]:
# Every epoch is an iteration over the entire training set (how many steps are in one epoch depends on the batch_size)
def train_epoch(data, model, hyper_params):
  model.encoder_decoder.train() 
  # varialbes for logging
  log = Log()
  # train the model batch-by-batch 
  for i, batch in enumerate(data):
    batch_loss = train_batch(batch, model, hyper_params)
    log.batch_info(batch, batch_loss)
    if i % 50 == 1:
      log.show_every_50batches(i)
  log.show_epoch()
  scheduler.step()

In [105]:
def train_batch(batch, model, hyper_params):
    def forward_pass():
        src = batch.src.to(device=device)
        trg_x = batch.trg_x.to(device=device)
        src_padding_mask = batch.src_padding_mask.to(device=device)
        trg_attn_mask = batch.trg_attn_mask.to(device=device)
        trg_padding_mask = batch.trg_padding_mask.to(device=device)
        
        pred = model.forward(src, trg_x, src_padding_mask, trg_attn_mask, trg_padding_mask).view(-1, len(de_vocab.stoi))
        return pred
    
    def calculate_loss(prediction):
        trg_y = batch.trg_y.to(device=device).reshape(-1)
        print("## pred.shape: ", pred.shape)
        print("## trg_y.shape: ", trg_y.shape)
        loss = hyper_params.criterion(pred, trg_y)  
        return loss

    pred = forward_pass()
    loss = calculate_loss(pred)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return float(loss)

# Define Model


In [97]:
class Model(nn.Module):
  def __init__(self, encoder_decoder, generator):
    super().__init__()
    self.encoder_decoder = encoder_decoder
    self.generator = generator

  def forward(self, src, trg_x, src_key_padding_mask, tgt_mask, tgt_key_padding_mask):
    out = self.encoder_decoder.forward(src, trg_x, src_key_padding_mask = src_key_padding_mask, tgt_mask = tgt_mask, tgt_key_padding_mask =  tgt_key_padding_mask)
    out = self.generator.forward(out)
    return out

In [98]:
class HyperParams():
  def __init__(self, criterion = None, optimizer = None, scheduler = None):
    self.criterion, self.optimizer, self.scheduler = criterion, optimizer, scheduler

In [106]:
# variables
pad_value = de_vocab.stoi['<pad>']; print("## pad_value: ", pad_value)
d_model = 512
lr = 5.0 
tgt_vocab_size = len(de_vocab.itos)

# initialize model
encoder_decoder = nn.Transformer().to(device)
generator = Generator(d_model, tgt_vocab_size).to(device)
model = Model(encoder_decoder, generator)

# hyper_params = MyHyperParams(criterion = nn.CrossEntropyLoss(), optimizer = torch.optim.Adam(model.parameters(), lr=lr), scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95))
# hyper_params = HyperParams(criterion = nn.CrossEntropyLoss(), optimizer = torch.optim.Adam(model.parameters(), lr=lr))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.encoder_decoder.parameters(), lr=lr) 
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
hyper_params = HyperParams(criterion=criterion, optimizer=optimizer, scheduler=scheduler)

## pad_value:  1


# Start Training

In [None]:
# train for 10 epochs
for epoch in range(1):
  print("------------Training epoch ", epoch, "--------------")
  train_epoch(dataloader, model, hyper_params)
  # set to eval model to check how good the model is after each loop
  # print(train_epoch((batchify(b, pad_idx, device) for b in valid_iterator), model, criterion, epoch))

------------Training epoch  0 --------------
## pred.shape:  torch.Size([48, 132937])
## trg_y.shape:  torch.Size([48])
