In [None]:
# !pip install -r requirements.txt

In [30]:
# !pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [80]:
# import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
# import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
# from torch.utils.data.distributed import DistributedSampler
# import torch.distributed as dist
# import torch.multiprocessing as mp
# from torch.nn.parallel import DistributedDataParallel as DDP


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [4]:
# Some convenience helper functions used throughout the notebook


def is_interactive_notebook():
    return __name__ == "__main__"


def show_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)


def execute_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None


class DummyScheduler:
    def step(self):
        None

In [5]:
class EncoderDecoder(nn.Module):
  def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
    super(EncoderDecoder, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embed
    self.tgt_embed = tgt_embed
    self.generator = generator

  def forward(self, src, tgt, src_mask, tgt_mask):
    return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

  def encode(self, src, src_mask):
    return self.encoder(self.src_embed(src), src_mask)

  def decode(self, memory, src_mask, tgt, tgt_mask):
    return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [6]:
class Generator(nn.Module):
  def __init__(self, d_model, vocab):
    super(Generator, self).__init__()
    self.proj = nn.Linear(d_model, vocab)

  def forward(self, x):
    return log_softmax(self.proj(x), dim=-1)

The encoder is composed of a stack of  𝑁=6  identical layers.

In [7]:
def clones(module, N):
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [8]:
class LayerNorm(nn.Module):
  def __init__(self, features, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class Encoder(nn.Module):
  def __init__(self, layer, N):
    super(Encoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

That is, the output of each sub-layer is  LayerNorm(𝑥+Sublayer(𝑥)) , where  Sublayer(𝑥)  is the function implemented by the sub-layer itself. We apply dropout (cite) to the output of each sub-layer, before it is added to the sub-layer input and normalized.

To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension  𝑑model=512 .

In [9]:
class SublayerConnection(nn.Module):
  def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x)))

Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.

In [10]:
class EncoderLayer(nn.Module):
  def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 2)
    self.size = size

  def forward(self, x, mask):
    x = self.sublayer[0](x, lambda x:self.self_attn(x, x, x, mask))
    return self.sublayer[1](x, self.feed_forward)

Decoder
The decoder is also composed of a stack of  𝑁=6  identical layers.

In [11]:
class Decoder(nn.Module):
  def __init__(self, layer, N):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, memory, src_mask, tgt_mask):
    for layer in self.layers:
      x = layer(x, memory, src_mask, tgt_mask)
    return self.norm(x)

In [12]:
class DecoderLayer(nn.Module):
  def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 3)

  def forward(self, x, memory, src_mask, tgt_mask):
    m = memory
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
    return self.sublayer[2](x, self.feed_forward)

We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position  𝑖  can depend only on the known outputs at positions less than  𝑖 .

In [13]:
def subsequent_mask(size):
  attn_shape = (1, size, size)
  subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
      torch.uint8
  )
  return subsequent_mask == 0

In [14]:
def example_mask():
  LS_data = pd.concat(
      [
          pd.DataFrame(
              {
                  "Subsequent Mask": subsequent_mask(20)[0][x, y].flatten(),
                  "Window": y,
                  "Masking": x,
              }
          )
          for y in range(20)
          for x in range(20)
      ]
  )
  return (
      alt.Chart(LS_data)
      .mark_rect()
      .properties(height=250, width=250)
      .encode(
          alt.X("Window:O"),
          alt.Y("Masking:O"),
          alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
      )
      .interactive()
  )

In [15]:
show_example(example_mask)

Attention
An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

We call our particular attention "Scaled Dot-Product Attention". The input consists of queries and keys of dimension  𝑑𝑘 , and values of dimension  𝑑𝑣 . We compute the dot products of the query with all keys, divide each by  𝑑𝑘‾‾‾√ , and apply a softmax function to obtain the weights on the values.

In [16]:
def attention(query, key, value, mask=None, dropout=None):
  d_k = query.size(-1)
  scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
  if mask is not None:
    scores = scores.masked_fill(mask == 0, -1e9)
  p_attn = scores.softmax(dim=-1)
  if dropout is not None:
    p_attn = dropout(p_attn)
  return torch.matmul(p_attn, value), p_attn

The two most commonly used attention functions are additive attention (cite), and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of  1𝑑𝑘√ . Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code.

While for small values of  𝑑𝑘  the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of  𝑑𝑘  (cite). We suspect that for large values of  𝑑𝑘 , the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients (To illustrate why the dot products get large, assume that the components of  𝑞  and  𝑘  are independent random variables with mean  0  and variance  1 . Then their dot product,  𝑞⋅𝑘=∑𝑑𝑘𝑖=1𝑞𝑖𝑘𝑖 , has mean  0  and variance  𝑑𝑘 .). To counteract this effect, we scale the dot products by  1𝑑𝑘√ .

In [17]:
class MultiHeadedAttention(nn.Module):
  def __init__(self, h, d_model, dropout=0.1):
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0
    self.d_k = d_model // h
    self.h = h
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, query, key, value, mask=None):
    if mask is not None:
      mask = mask.unsqueeze(1)
    nbatches = query.size(0)

    # 1) Do all the linear projections in batch from d_model => h x d_k
    query, key, value = [
      lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
      for lin, x in zip(self.linears, (query, key, value))
    ]

    # 2) Apply attention on all the projected vectors in batch.
    x, self.attn = attention(
        query, key, value, mask=mask, dropout=self.dropout
    )

    # 3) "Concat" using a view and apply a final linear.
    x = (
        x.transpose(1, 2)
        .contiguous()
        .view(nbatches, -1, self.h * self.d_k)
    )

    del query
    del key
    del value

    return self.linears[-1](x)

Applications of Attention in our Model
The Transformer uses multi-head attention in three different ways: 1) In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as (cite).

2) The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.

3) Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to  −∞ ) all values in the input of the softmax which correspond to illegal connections.

Position-wise Feed-Forward Networks
In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.

FFN(𝑥)=max(0,𝑥𝑊1+𝑏1)𝑊2+𝑏2

While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is  𝑑model=512 , and the inner-layer has dimensionality  𝑑𝑓𝑓=2048 .

In [18]:
class PositionwiseFeedForward(nn.Module):
  def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    return self.w_2(self.dropout(self.w_1(x).relu()))

Embeddings and Softmax
Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension
�
model
d
model
​
 . We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to (cite). In the embedding layers, we multiply those weights by
�
model
d
model
​

​
 .

In [19]:
class Embeddings(nn.Module):
  def __init__(self, d_model, vocab):
    super(Embeddings, self).__init__()
    self.lut = nn.Embedding(vocab, d_model)
    self.d_model = d_model

  def forward(self, x):
    return self.lut(x) * math.sqrt(self.d_model)

Positional Encoding
Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension  𝑑model  as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed (cite).

In this work, we use sine and cosine functions of different frequencies:

𝑃𝐸(𝑝𝑜𝑠,2𝑖)=sin(𝑝𝑜𝑠/100002𝑖/𝑑model)

𝑃𝐸(𝑝𝑜𝑠,2𝑖+1)=cos(𝑝𝑜𝑠/100002𝑖/𝑑model)

where  𝑝𝑜𝑠  is the position and  𝑖  is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from  2𝜋  to  10000⋅2𝜋 . We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset  𝑘 ,  𝑃𝐸𝑝𝑜𝑠+𝑘  can be represented as a linear function of  𝑃𝐸𝑝𝑜𝑠 .

In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of  𝑃𝑑𝑟𝑜𝑝=0.1 .

In [20]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, dropout, max_len=5000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(dropout)

    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(
      torch.arange(0, d_model, 2) * -(math.log(10_000.0) / d_model)
    )
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer("pe", pe)

  def forward(self, x):
    x = x + self.pe[:, :x.size(1) ].requires_grad_(False)
    return self.dropout(x)

Below the positional encoding will add in a sine wave based on position. The frequency and offset of the wave is different for each dimension.

In [21]:
def example_positional():
  pe = PositionalEncoding(20, 0)
  y = pe.forward(torch.zeros(1, 100, 20))

  data = pd.concat(
      [
          pd.DataFrame(
            {
                "embedding": y[0, :, dim],
                "dimension": dim,
                "position": list(range(100)),
            }
          )
          for dim in [4, 5, 6, 7]
      ]
  )

  return (
      alt.Chart(data)
      .mark_line()
      .properties(width=800)
      .encode(x="position", y="embedding", color="dimension:N")
      .interactive()
  )

In [22]:
show_example(example_positional)

Full Model
Here we define a function from hyperparameters to a full model.

In [23]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
  c = copy.deepcopy
  attn = MultiHeadedAttention(h, d_model)
  ff = PositionwiseFeedForward(d_model, d_ff, dropout)
  position = PositionalEncoding(d_model, dropout)
  model = EncoderDecoder(
    encoder=Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
    decoder=Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
    src_embed=nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
    tgt_embed=nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
    generator=Generator(d_model, tgt_vocab),
  )

  for p in model.parameters():
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)

  return model

Here we make a forward step to generate a prediction of the model. We try to use our transformer to memorize the input. As you will see the output is randomly generated due to the fact that the model is not trained yet. In the next tutorial we will build the training function and try to train our model to memorize the numbers from 1 to 10.

In [24]:
def inference_test():
  test_model = make_model(11,11,2)
  test_model.eval()
  src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
  src_mask = torch.ones(1, 1, 10)

  memory = test_model.encode(src, src_mask)
  ys = torch.zeros(1,1).type_as(src)

  for i in range(9):
    out = test_model.decode(
        memory, src_mask, ys,
        subsequent_mask(ys.size(1)).type_as(src.data)
    )
    prob = test_model.generator(out[:, -1])
    _, next_word = torch.max(prob, dim=1)
    next_word = next_word.data[0]
    ys = torch.cat(
        [ys, torch.empty(1,1).type_as(src.data).fill_(next_word)], dim=1
    )

  print("Example Untrained Model Predition", ys)

def run_tests():
  for _ in range(10):
    inference_test()

In [25]:
show_example(run_tests)

Example Untrained Model Predition tensor([[0, 8, 8, 8, 8, 8, 8, 8, 8, 8]])
Example Untrained Model Predition tensor([[0, 4, 4, 4, 4, 4, 4, 4, 4, 4]])
Example Untrained Model Predition tensor([[ 0,  5, 10, 10, 10, 10, 10, 10, 10, 10]])
Example Untrained Model Predition tensor([[0, 7, 8, 3, 8, 3, 8, 3, 8, 3]])
Example Untrained Model Predition tensor([[0, 1, 2, 5, 1, 5, 7, 7, 6, 9]])
Example Untrained Model Predition tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Example Untrained Model Predition tensor([[0, 2, 3, 6, 0, 2, 6, 3, 9, 3]])
Example Untrained Model Predition tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Example Untrained Model Predition tensor([[ 0,  8, 10,  9,  0,  1,  4,  7,  7,  7]])
Example Untrained Model Predition tensor([[ 0, 10,  1,  3, 10,  1,  3, 10,  1,  3]])


Part 2: Model Training
Training
This section describes the training regime for our models.

We stop for a quick interlude to introduce some of the tools needed to train a standard encoder decoder model. First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.

Part 2: Model Training

### Batches and Masking

In [26]:
class Batch:
  def __init__(self, src, tgt=None, pad=2):
    self.src = src
    self.src_mask = (src != pad).unsqueeze(-2)
    if tgt is not None:
      self.tgt = tgt[:, :-1]
      self.tgt_y = tgt[:, 1:]
      self.tgt_mask = self.make_std_mask(self.tgt, pad)
      self.ntokens = (self.tgt_y != pad).data.sum()

  @staticmethod
  def make_std_mask(tgt, pad):
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
        tgt_mask.data
    )
    return tgt_mask


Training Loop

In [27]:
class TrainState:
  step = 0
  accum_step = 0
  samples = 0
  tokens = 0

In [28]:
def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=TrainState(),
):
  start = time.time()
  total_tokens = 0
  total_loss = 0
  tokens = 0
  n_accum = 0
  for i, batch in enumerate(data_iter):
    out = model.forward(
      batch.src,
      batch.tgt,
      batch.src_mask,
      batch.tgt_mask
    )
    loss, loss_node = loss_compute(
        out, batch.tgt_y, batch.ntokens
    )
    if mode == "train" or mode == "train+log":
      loss_node.backward()
      train_state.step += 1
      train_state.samples += batch.src.shape[0]
      train_state.tokens += batch.ntokens
      if i % accum_iter == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        n_accum += 1
        train_state.accum_step += 1
      scheduler.step()

    total_loss += loss
    total_tokens += batch.ntokens
    tokens += batch.ntokens
    if i % 40 == 1 and (mode == "train" or mode == "train+log"):
      lr = optimizer.param_groups[0]["lr"]
      elapsed = time.time() - start
      print(
          (
              "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
              + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
          )
          % ( i, n_accum, loss / batch.ntokens, tokens / elapsed, lr )
      )
      start = time.time()
      tokens = 0
    del loss
    del loss_node
  return total_loss / total_tokens, train_state

Training Data and Batching
We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding, which has a shared source-target vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary.

Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.

In [29]:
def rate(step, model_size, factor, warmup):
  if step == 0:
    step = 1
  return factor * (
      model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
  )
  

In [30]:
def example_learning_schedule():
  opts = [
      [512, 1, 4000],
      [512, 1, 8000],
      [256, 1, 4000],
  ]

  dummy_model = torch.nn.Linear(1, 1)
  learning_rates = []

  for idx, example in enumerate(opts):
    optimizer = torch.optim.Adam(
        dummy_model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer, lr_lambda=lambda step: rate(step, *example)
    )
    tmp = []
    for step in range(20_000):
      tmp.append(optimizer.param_groups[0]["lr"])
      optimizer.step()
      lr_scheduler.step()
    learning_rates.append(tmp)

  learning_rates = torch.tensor(learning_rates)

  alt.data_transformers.disable_max_rows()

  opts_data = pd.concat(
      [
          pd.DataFrame(
              {
                  "Learning Rate": learning_rates[warmup_idx, :],
                  "model_size_warmup": ["512:4000", "512:8000", "256:4000"][
                      warmup_idx
                  ],
                  "step": range(20_000),
              }
          )
          for warmup_idx in [0, 1, 2]
      ]
  )

  return (
      alt.Chart(opts_data)
      .mark_line()
      .properties(width=600)
      .encode(x="step", y="Learning Rate", color="model_size_warmup:N")
      .interactive()
  )


In [31]:
example_learning_schedule()

Regularization
Label Smoothing
During training, we employed label smoothing of value  𝜖𝑙𝑠=0.1  (cite). This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.


In [32]:
class LabelSmoothing(nn.Module):
  def __init__(self, size, padding_idx, smoothing=0.0):
    super(LabelSmoothing, self).__init__()
    self.criterion = nn.KLDivLoss(reduction="sum")
    self.padding_idx = padding_idx
    self.confidence = 1.0 - smoothing
    self.smoothing = smoothing
    self.size = size
    self.true_dist = None

  def forward(self, x, target):
    assert x.size(1) == self.size
    true_dist = x.data.clone()
    true_dist.fill_(self.smoothing/ (self.size - 2))
    true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
    true_dist[:, self.padding_idx] = 0
    mask = torch.nonzero(target.data == self.padding_idx)
    if mask.dim() > 0:
      true_dist.index_fill_(0, mask.squeeze(), 0.0)
    self.true_dist = true_dist
    return self.criterion(x, true_dist.clone().detach())


In [33]:
def example_label_smoothing():
  crit = LabelSmoothing(5, 0, 0.4)
  predict = torch.FloatTensor(
      [
          [0, 0.2, 0.7, 0.1, 0],
          [0, 0.2, 0.7, 0.1, 0],
          [0, 0.2, 0.7, 0.1, 0],
          [0, 0.2, 0.7, 0.1, 0],
          [0, 0.2, 0.7, 0.1, 0],
      ]
  )
  crit(x=predict.log(), target=torch.LongTensor([2, 1, 0, 3, 3]))
  LS_data = pd.concat(
      [
          pd.DataFrame(
              {
                  "target distribution": crit.true_dist[x, y].flatten(),
                  "columns": y,
                  "rows": x,
              }
          )
          for y in range(5)
          for x in range(5)
      ]
  )

  return (
      alt.Chart(LS_data)
      .mark_rect(color="Blue", opacity=1)
      .properties(height=200, width=200)
      .encode(
          alt.X("columns:O", title=None),
          alt.Y("rows:O", title=None),
          alt.Color(
              "target distribution:Q", scale=alt.Scale(scheme="viridis")
          ),
      )
      .interactive()
  )


In [34]:
show_example(example_label_smoothing)

In [35]:
def loss(x, crit):
    d = x + 3 * 1
    predict = torch.FloatTensor([[0, x/d, 1/d, 1/d, 1/d]])
    epsilon = 1e-10 # handle division by 0
    result = crit((epsilon + predict).log(), torch.LongTensor([1])).data
    return result

def penalization_visulization():
    crit = LabelSmoothing(5, 0, 0.1)
    loss_data = pd.DataFrame(
      {
          "Loss": [loss(x, crit) for x in range(1, 100)],
          "Steps": list(range(99)),
      }
    ).astype("float")

    
    return (
      alt.Chart(loss_data)
      .mark_line()
      .properties(width=350)
      .encode(
          x="Steps",
          y="Loss",
      )
      .interactive()
    )

In [36]:
show_example(penalization_visulization)

# A First  Example

> We can begin by trying out a simple copy-task. Given a random set
> of input symbols from a small vocabulary, the goal is to generate
> back those same symbols.

In [37]:
def data_gen(V, batch_size, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.randint(1, V, size=(batch_size, 10))
        data[:, 0] = 1
        src = data.requires_grad_(False).clone().detach()
        tgt = data.requires_grad_(False).clone().detach()
        yield Batch(src, tgt, 0)

In [38]:
class SimpleLossCompute:
    "A simple loss compute and train function."

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        x = self.generator(x)
        sloss = (
            self.criterion(
                x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
            )
            / norm
        )
        return sloss.data * norm, sloss

Greedy Decoding
This code predicts a translation using greedy decoding for simplicity.

In [39]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.zeros(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(
            memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
        )
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat(
            [ys, torch.zeros(1, 1).type_as(src.data).fill_(next_word)], dim=1
        )
    return ys

In [40]:
def example_simple_model():
    V = 11
    criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
    model = make_model(V, V, N=2)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, model_size=model.src_embed[0].d_model, factor=1.0, warmup=400
        ),
    )

    batch_size = 80
    for epoch in range(20):
        model.train()
        run_epoch(
            data_gen(V, batch_size, 20),
            model,
            SimpleLossCompute(model.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train",
        )
        model.eval()
        run_epoch(
            data_gen(V, batch_size, 5),
            model,
            SimpleLossCompute(model.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )[0]

    model.eval()
    src = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
    # src = torch.LongTensor([[0, 1, 2, 3, 4, 1, 2, 3, 4, 5]])
    max_len = src.shape[1]
    src_mask = torch.ones(1, 1, max_len)
    print(greedy_decode(model, src, src_mask, max_len=max_len, start_symbol=0))



In [41]:
execute_example(example_simple_model)


Epoch Step:      1 | Accumulation Step:   2 | Loss:   3.23 | Tokens / Sec:  2989.9 | Learning Rate: 5.5e-06
Epoch Step:      1 | Accumulation Step:   2 | Loss:   2.16 | Tokens / Sec:  3098.3 | Learning Rate: 6.1e-05
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.71 | Tokens / Sec:  2975.5 | Learning Rate: 1.2e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.48 | Tokens / Sec:  2902.7 | Learning Rate: 1.7e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.01 | Tokens / Sec:  2477.4 | Learning Rate: 2.3e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.69 | Tokens / Sec:  2621.3 | Learning Rate: 2.8e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.34 | Tokens / Sec:  2550.7 | Learning Rate: 3.4e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.23 | Tokens / Sec:  2530.5 | Learning Rate: 3.9e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.13 | Tokens / Sec:  2617.8 | Learning Rate: 4.5e-04
Epoch Step:      1 | Accumul

###Part 3: A Real World Example
>Now we consider a real-world example using the Multi30k German-English Translation task. This task is much smaller than the WMT task considered in the paper, but it illustrates the whole system. We also show how to use multi-gpu processing to make it really fast.

###Data Loading
>We will load the dataset using torchtext and spacy for tokenization.

In [42]:
import os
# os.chdir("drive/MyDrive/Colab Notebooks/datasets")
os.getcwd()
os.listdir()
print(os.getcwd())
print(os.listdir())


/Users/falconlin/Development/annotated_transformer_implementations_2023
['AnnotatedTransformer-CUDA.ipynb', 'AnnotatedTransformer-Metal-Torch.ipynb', 'requirements.metal.txt', 'env', '.doc', 'torch-verify.ipynb', 'europarl-v7.de-en.en', '.ipynb_checkpoints', 'annotated_transformer', 'europarl-v7.de-en.de', 'requirements.cuda.txt']


In [43]:
my_file_en = "europarl-v7.de-en.en"
my_file_de = "europarl-v7.de-en.de"

# my_file_en = ""
# my_file_es = ""

# May need to adjust these parameters for new translation tasks.
size_train = 20350
size_val = 2000
size_test = 2000

train = []
val = []
test = []

de_list = []
en_list = []

count = 0;
with open(my_file_de, encoding="utf8") as fp:
  for line in fp:
    de_list.append(line)
    if count >= (size_train + size_val + size_test - 1):
      break
    count += 1

count = 0;
with open(my_file_en, encoding="utf8") as fp:
  for line in fp:
    en_list.append(line)
    if count > (size_train + size_val + size_test - 1):
      break
    count += 1

dataset = list(zip(de_list, en_list))
train = dataset[:size_train]
val = dataset[size_train:size_train+size_val]
test = dataset[size_train+size_val:size_train+size_val+size_test]

print(len(train), len(val), len(test))

20350 2000 2000


In [44]:
def load_tokenizers():

    try:
        spacy_de = spacy.load("de_core_news_sm")
    except IOError:
        os.system("python -m spacy download de_core_news_sm")
        spacy_de = spacy.load("de_core_news_sm")

    try:
        spacy_en = spacy.load("en_core_web_sm")
    except IOError:
        os.system("python -m spacy download en_core_web_sm")
        spacy_en = spacy.load("en_core_web_sm")

    return spacy_de, spacy_en

In [45]:
def tokenize(text, tokenizer):
  return [tok.text for tok in tokenizer.tokenizer(text)]

def yield_tokens(data_iter, tokenizer, index):
  for from_to_tuple in data_iter:
    yield tokenizer(from_to_tuple[index])


In [46]:
def build_vocabulary(spacy_de, spacy_en):
  def tokenize_de(text):
    return tokenize(text, spacy_de)

  def tokenize_en(text):
    return tokenize(text, spacy_en)

  print("Building German Vocabulary ...")
  # train, val, test = datasets.Multi30k(language_pair=("de", "en"))
  # print("here")
  vocab_src = build_vocab_from_iterator(
      yield_tokens(train + val + test, tokenize_de, index=0),
      min_freq=2,
      specials=["<s>", "</s>", "<blank>", "<unk>"],
  )

  print("Building English Vocabulary ...")
  # train, val, test = datasets.Multi30k(langiage_part=("de", "en"))
  vocab_tgt = build_vocab_from_iterator(
      yield_tokens(train + val + test, tokenize_en, index=0),
      min_freq=2,
      specials=["<s>", "</s>", "<blank>", "<unk>"],
  )

  vocab_src.set_default_index(vocab_src["<unk>"])
  vocab_tgt.set_default_index(vocab_tgt["<unk>"])

  return vocab_src, vocab_tgt


def load_vocab(spacy_de, spacy_en):
  if not exists("vocab.pt"):
    vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
    torch.save((vocab_src, vocab_tgt), "vocab.pt")
  else:
    vocab_src, vocab_tgt = torch.load("vocab.pt")

  print("Finished.\nVocabulary sizes:")
  print(len(vocab_src))
  print(len(vocab_tgt))
  return vocab_src, vocab_tgt

if is_interactive_notebook():
  spacy_de, spacy_en = show_example(load_tokenizers)
  vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0mm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.0
[38;5

###Iterators

In [47]:
def collate_batch(
    batch,
    src_pipeline,
    tgt_pipeline,
    src_vocab,
    tgt_vocab,
    device,
    max_padding=128,
    pad_id=2,
):
  bs_id = torch.tensor([0], device=device)
  eos_id = torch.tensor([1], device=device)
  src_list, tgt_list = [], []
  for (_src, _tgt) in batch:
    processed_src = torch.cat(
        [
            bs_id,
            torch.tensor(
                src_vocab(src_pipeline(_src)),
                dtype=torch.int64,
                device=device
            ),
            eos_id,
        ],
        0,
    )
    processed_tgt = torch.cat(
        [
            bs_id,
            torch.tensor(
                tgt_vocab(tgt_pipeline(_tgt)),
                dtype=torch.int64,
                device=device,
            ),
            eos_id
        ],
        0,
    )
    src_list.append(
        pad(
            processed_src,
            (
                0,
                max_padding - len(processed_src),
            ),
            value=pad_id,
        )
    )
    tgt_list.append(
        pad(
            processed_tgt,
            (0, max_padding - len(processed_tgt)),
            value=pad_id,
        )
    )

  src = torch.stack(src_list)
  tgt = torch.stack(tgt_list)
  return (src, tgt)

Need to (1) add parameters to create_dataloaders for train, val, and test and remove (2) Remove call to datasets.Multi30K in the code below.

In [48]:
print(len(train))
print(len(val))
print(len(test))

20350
2000
2000


In [49]:
def create_dataloaders(
    device,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    batch_size = 1200,
    max_padding=128,
    is_distributed = True,
):
  def tokenize_de(text):
    return tokenize(text, spacy_de)

  def tokenize_en(text):
    return tokenize(text, spacy_en)

  def collate_fn(batch):
    return collate_batch(
        batch,
        tokenize_de,
        tokenize_en,
        vocab_src,
        vocab_tgt,
        device,
        max_padding=max_padding,
        pad_id=vocab_src.get_stoi()["<blank>"],
    )

  train_iter, valid_iter, test_iter = train, val, test

  train_iter_map = to_map_style_dataset(
      train_iter
  )
  train_sampler = (
      DistributedSampler(train_iter_map) if is_distributed else None
  )
  valid_iter_map = to_map_style_dataset(valid_iter)
  valid_sampler = (
      DistributedSampler(valid_iter_map) if is_distributed else None
  )

  train_dataloader = DataLoader(
      train_iter_map,
      batch_size=batch_size,
      shuffle=(train_sampler is None),
      sampler=train_sampler,
      collate_fn=collate_fn,
  )
  valid_dataloader = DataLoader(
      valid_iter_map,
      batch_size=batch_size,
      shuffle=(valid_sampler is None),
      sampler=valid_sampler,
      collate_fn=collate_fn,
  )

  return train_dataloader, valid_dataloader


In [76]:
torch.cuda.is_available()
torch.backends.mps.is_available()
torch.backends.mps.is_built()
torch.cuda.device_count()
torch.device("mps")

device(type='mps')

In [81]:
device = torch.device("mps")
def train_worker(
    gpu,
    ngpus_per_node,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    config,
    is_distributed=False,
):
    print(f"Train worker process using GPU: {gpu} for training", flush=True)
    # torch.cuda.set_device(gpu)
    gpu = device
    
    pad_idx = vocab_tgt["<blank>"]
    d_model = 512
    print("Got to this point(1)!!!")
    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    print("Got to this point(2)!!!")
    # model.cuda(gpu)
    model.to(device)
    print("Got to this point(3)!!!")
    module = model
    print("Got to this point(4)!!!")
    is_main_process = True
    # if is_distributed:
    #     dist.init_process_group(
    #         "nccl", init_method="env://", rank=gpu, world_size=ngpus_per_node
    #     )
    #     model = DDP(model, device_ids=[gpu])
    #     module = model.module
    #     is_main_process = gpu == 0

    criterion = LabelSmoothing(
        size=len(vocab_tgt), padding_idx=pad_idx, smoothing=0.1
    )
    # criterion.cuda(gpu)
    criterion.to(device)
    print("Got to this point(5)!!!")
    #############################
    ## PROBLEM IS HERE BELOW!  ##
    #############################
    train_dataloader, valid_dataloader = create_dataloaders(
        gpu,
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=config["batch_size"] // ngpus_per_node,
        max_padding=config["max_padding"],
        is_distributed=is_distributed,
    )
    print("Got to this point(6)!!!")
    optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, d_model, factor=1, warmup=config["warmup"]
        ),
    )
    train_state = TrainState()
    for epoch in range(config["num_epochs"]):
        if is_distributed:
            train_dataloader.sampler.set_epoch(epoch)
            valid_dataloader.sampler.set_epoch(epoch)

        model.train()
        print(f"[GPU{gpu}] Epoch {epoch} Training ====", flush=True)
        _, train_state = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in train_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train+log",
            accum_iter=config["accum_iter"],
            train_state=train_state,
        )

        GPUtil.showUtilization()
        if is_main_process:
            file_path = "%s%.2d.pt" % (config["file_prefix"], epoch)
            torch.save(module.state_dict(), file_path)
        # torch.cuda.empty_cache()

        print(f"[GPU{gpu}] Epoch {epoch} Validation ====", flush=True)
        model.eval()
        sloss = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in valid_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )
        print(sloss)
        # torch.cuda.empty_cache()

    if is_main_process:
        file_path = "%sfinal.pt" % config["file_prefix"]
        torch.save(module.state_dict(), file_path)
def train_distributed_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    from the_annotated_transformer import train_worker

    ngpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12356"
    print(f"Number of GPUs detected: {ngpus}")
    print("Spawning training processes ...")
    mp.spawn(
        train_worker,
        nprocs=ngpus,
        args=(ngpus, vocab_src, vocab_tgt, spacy_de, spacy_en, config, True),
    )


def train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    if config["distributed"]:
        train_distributed_model(
            vocab_src, vocab_tgt, spacy_de, spacy_en, config
        )
    else:
        train_worker(
            0, 1, vocab_src, vocab_tgt, spacy_de, spacy_en, config, False
        )


def load_trained_model():
    config = {
        "batch_size": 32,
        "distributed": False,
        "num_epochs": 8,
        "accum_iter": 10,
        "base_lr": 1.0,
        "max_padding": 72,
        "warmup": 3000,
        "file_prefix": "multi30k_model_",
    }
    model_path = "multi30k_model_final.pt"
    if not exists(model_path):
        train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config)

    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    model.load_state_dict(torch.load("multi30k_model_final.pt"))
    return model

if is_interactive_notebook():
    model = load_trained_model()

Train worker process using GPU: 0 for training
Got to this point(1)!!!
Got to this point(2)!!!
Got to this point(3)!!!
Got to this point(4)!!!
Got to this point(5)!!!
Got to this point(6)!!!
[GPUmps] Epoch 0 Training ====
Epoch Step:      1 | Accumulation Step:   1 | Loss:   8.51 | Tokens / Sec:  3851.9 | Learning Rate: 5.4e-07
Epoch Step:     41 | Accumulation Step:   5 | Loss:   8.00 | Tokens / Sec:  4424.2 | Learning Rate: 1.1e-05
Epoch Step:     81 | Accumulation Step:   9 | Loss:   7.20 | Tokens / Sec:  4345.5 | Learning Rate: 2.2e-05
Epoch Step:    121 | Accumulation Step:  13 | Loss:   6.62 | Tokens / Sec:  4363.6 | Learning Rate: 3.3e-05
Epoch Step:    161 | Accumulation Step:  17 | Loss:   6.20 | Tokens / Sec:  4336.5 | Learning Rate: 4.4e-05
Epoch Step:    201 | Accumulation Step:  21 | Loss:   5.92 | Tokens / Sec:  4284.0 | Learning Rate: 5.4e-05
Epoch Step:    241 | Accumulation Step:  25 | Loss:   5.65 | Tokens / Sec:  4415.9 | Learning Rate: 6.5e-05
Epoch Step:    281 | A

In [82]:
## save the model
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Save the model
# PATH = '/content/drive/My Drive/model.pt'
# torch.save(model.state_dict(), PATH)

# Download the model to your local computer
# from google.colab import files
# files.download(PATH)



In [83]:
if False:
  model.src_embed[0].lut.weight = model.tgt_embedding[0].lut.weight
  model.generator.lut.weight = model.tgt_embed[0].lut.weight

In [84]:
def average(model, models):
  for ps in zip(*[m.parmas() for m in [model] + models]):
    ps[0].copy_(torch.sum(*ps[1:]) /len(ps[1:]))

In [85]:
def check_outputs(
    valid_dataloader,
    model,
    vocab_src,
    vocab_tgt,
    n_examples=15,
    pad_idx=2,
    eos_string="</s>",
):
  results = [()] * n_examples
  for idx in range(n_examples):
    print("\nExample %d ========\n" % idx)
    b = next(iter(valid_dataloader))
    rb = Batch(b[0], b[1], pad_idx)
    greedy_decode(model, rb.src, rb.src_mask, 64, 0)[0]

    src_tokens = [
        vocab_src.get_itos()[x] for x in rb.src[0] if x != pad_idx
    ]
    tgt_tokens = [
        vocab_tgt.get_itos()[x] for x in rb.tgt[0] if x != pad_idx
    ]

    print(
        "Source Text (Inpput)        : "
        + " ".join(src_tokens).replace("\n", "")
    )
    print(
        "Target Text (Ground Truth)  : "
        + " ".join(tgt_tokens).replace("\n", "")
    )
    model_out = greedy_decode(model, rb.src, rb.src_mask, 72, 0)[0]
    model_txt = (
        " ".join(
            [vocab_tgt.get_itos()[x] for x in model_out if x != pad_idx]
        ).split(eos_string, 1)[0]
        + eos_string
    )
    print("Model Output               : " + model_txt.replace("\n", ""))
    results[idx] = (rb, src_tokens, tgt_tokens, model_out, model_txt)
  return results

def run_model_example(n_examples=5):
  global vcab_src, vocab_tgt, spcay_de, spacy_en

  print("Preparing Data ...")
  _, valid_dataloader = create_dataloaders(
      torch.device("cpu",),
      vocab_src,
      vocab_tgt,
      spacy_de,
      spacy_en,
      batch_size=1,
      is_distributed=False,
  )

  print("Loading Trained Model ...")

  model = make_model(len(vocab_src), len(vocab_tgt), N=6)
  model.load_state_dict(
      torch.load("multi30k_model_final.pt", map_location=torch.device("cpu"))
  )

  print("Checking Model Outputs:")
  example_data = check_outputs(
      valid_dataloader, model, vocab_src, vocab_tgt, n_examples=n_examples
  )
  return model, example_data




In [86]:
# execute_example(run_model_example)

In [87]:
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
  "convert a dense matrix to a data frame with row and column indices"
  return pd.DataFrame(
      [
          (
              r,
              c,
              float(m[r, c]),
              "%.3d %s"
              % (r, row_tokens[r] if len(row_tokens) > r else "<blank>"),
              "%.3d %s"
              % (c, col_tokens[c] if len(col_tokens) > c else "<blank>"),
          )
          for r in range(m.shape[0])
          for c in range(m.shape[1])
          if r < max_row and c < max_col
      ],
      columns=["row", "column", "value", "row_token", "col_token"],
  )

def attn_map(attn, layer, head, row_tokens, col_tokens, max_dim=30):
  df = mtx2df(
      attn[0, head].data,
      max_dim,
      max_dim,
      row_tokens,
      col_tokens,
  )
  return (
      alt.Chart(data=df)
      .mark_rect()
      .encode(
          x=alt.X("col_token", axis=alt.Axis(title="")),
          y=alt.Y("row_token", axis=alt.Axis(title="")),
          color="value",
          tooltip=["row", "column", "value", "row_token", "col_token"],
      )
      .properties(height=400, width=400)
      .interactive()
  )



In [88]:
def get_encoder(model, layer):
  return model.encoder.layers[layer].self_attn.attn

def get_decoder_self(model, layer):
  return model.decoder.layers[layer].self_attn.attn

def get_decoder_src(model, layer):
  return model.decoder.layers[layer].src_attn.attn

def visualize_layer(model, layer, getter_fn, ntokens, row_tokens, col_tokens):
  attn = getter_fn(model, layer)
  n_heads = attn.shape[1]
  charts = [
      attn_map(
          attn,
          0,
          h,
          row_tokens=row_tokens,
          col_tokens=col_tokens,
          max_dim=ntokens,
      )
      for h in range(n_heads)
  ]
  assert n_heads == 8
  return alt.vconcat(
      charts[0]
      | charts[2]
      | charts[4]
      | charts[6]
  ).properties(title="Layer %d" % (layer + 1))

## Encoder Self Attention

In [89]:
def viz_encoder_self():
  model, example_data = run_model_example(n_examples=1)
  example = example_data[
      len(example_data) - 1
  ]

  layer_viz = [
      visualize_layer(
          model, layer, get_encoder, len(example[1]), example[1], example[1]
      )
      for layer in range(6)
  ]

  return alt.hconcat(
      layer_viz[0]
      & layer_viz[2]
      & layer_viz[4]
  )

In [90]:
show_example(viz_encoder_self)

Preparing Data ...
Loading Trained Model ...
Checking Model Outputs:


Source Text (Inpput)        : <s> In diesem Sinne möchte ich den Berichterstatter , Herrn Bouwman , und den gesamten Ausschuß für Regionalpolitik , Verkehr und Fremdenverkehr zu der <unk> Arbeit beglückwünschen , die sie im Verlaufe der Debatte und bei der gesamten Bearbeitung dieser Initiative geleistet haben .  </s>
Target Text (Ground Truth)  : <s> In <unk> <unk> , I <unk> <unk> to <unk> the <unk> , <unk> Bouwman , and the <unk> of the <unk> on <unk> <unk> , Transport and <unk> for <unk> <unk> <unk> <unk> the <unk> and <unk> the <unk> of <unk> <unk> .  </s>
Model Output               : <s> In <unk> <unk> , <unk> <unk> <unk> <unk> <unk> <unk> , <unk> , <unk> <unk> and <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> in the <unk> .  </s>


## Decoder Src Attention

In [91]:
def viz_decoder_src():
  model, example_data = run_model_example(n_examples=1)
  example = example_data[len(example_data) - 1]

  layer_viz = [
      visualize_layer(
          model,
          layer,
          get_decoder_src,
          max(len(example[1]), len(example[2])),
          example[1],
          example[2],
      )
      for layer in range(6)
  ]

  return alt.hconcat(
      layer_viz[0]
      & layer_viz[1]
      & layer_viz[2]
      & layer_viz[3]
      & layer_viz[4]
      & layer_viz[5]
  )

In [92]:
show_example(viz_decoder_src)

Preparing Data ...
Loading Trained Model ...
Checking Model Outputs:


Source Text (Inpput)        : <s> Leider ändert sich jetzt nichts und wird sich künftig nichts ändern .  </s>
Target Text (Ground Truth)  : <s> <unk> , <unk> is <unk> and <unk> is <unk> to <unk> .  </s>
Model Output               : <s> <unk> , <unk> is not <unk> <unk> <unk> <unk> .  </s>
