# Implementation of a decoder

In [None]:
import sys
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from transformers import AutoConfig, AutoTokenizer

sys.path.append('../modules/')

%load_ext autoreload
%autoreload 2

## Load config and tokenizer

In [None]:
# Choose a model (checkpoint). The config and the tokenizer below are both
# loaded from this same checkpoint name.
model_ckpt = 'distilbert-base-uncased'

# Load the model's config.
config = AutoConfig.from_pretrained(model_ckpt)

# Load the tokenizer associated with the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

## Tokenization

In [None]:
# Toy batch: four short lines of verse of different lengths.
text = [
    "To search for perfection",
    "Is all very well",
    "But to look for heaven",
    "Is to live here in hell",
]

# Tokenize the whole batch in one call, with padding enabled and TensorFlow
# tensors as the return type, then keep only the token ids.
encoded_batch = tokenizer(text, padding=True, return_tensors='tf')
input_ids = encoded_batch['input_ids']

input_ids

## Embeddings

In [None]:
from encoder import Embeddings

In [None]:
# Instantiate the embedding layer from the model's config.
embeddings = Embeddings(config=config)

# Turn the token ids into token embeddings.
token_embeddings = embeddings(input_ids)

token_embeddings

## Masked multi-head self-attention

The masked multi-head self-attention layer prevents the decoder from seeing tokens it still has to predict during the training phase (in a sequence-to-sequence task, e.g. machine translation, the decoder predicts the output tokens sequentially and during training these are compared with the true sequence to predict, the errors being then backpropagated to update the weights of the model).

Build a mask for the attention scores matrix: tokens in the input sequence to the decoder are masked by taking the attention scores matrix (__before softmax is applied__) and setting the entries in the upper triangle (excluding the diagonal, which corresponds to the "current token") to $-\infty$. This way those entries are mapped to 0 by softmax, so the corresponding attention weights are also 0. Indeed, for a given row index (the current token), the upper triangle (excluding the diagonal) corresponds to all the tokens that come __after__ it in the sequence.

In [None]:
# Sequence length: size of the last axis of the (padded) input ids.
seq_len = input_ids.shape[-1]

# Lower-triangular matrix of ones, diagonal included: band_part keeps every
# sub-diagonal (num_lower=-1) and no super-diagonal (num_upper=0).
lower_triangle = tf.linalg.band_part(
    tf.ones(shape=(seq_len, seq_len)), num_lower=-1, num_upper=0
)

# Complement of the above: ones strictly above the diagonal (the positions
# to mask), zeros elsewhere.
mask = 1. - lower_triangle

mask

Randomly generate a fake attention score matrix, for testing purposes.

In [None]:
# Fake score matrix: normally distributed random entries, one per pair of
# sequence positions (same square shape as the mask).
fake_scores = tf.random.normal(shape=(seq_len, seq_len))

fake_scores

Apply the masking to the fake attention scores matrix.

In [None]:
# Replace the masked positions (mask == 1, i.e. strictly above the diagonal)
# with -inf and keep the original scores everywhere else.
# A plain -np.inf is used as the fill value: multiplying the mask by infinity
# evaluates 0 * inf = nan at the unmasked positions, and the `np.infty` alias
# was removed in NumPy 2.0.
masked_fake_scores = np.where(mask == 1, -np.inf, fake_scores)

masked_fake_scores

## Scaled dot-product attention with masking

In [None]:
from utils import masked_scaled_dot_product_attention
from encoder import Embeddings

In [None]:
# NOTE(review): this repeats the cell from the "Embeddings" section. It builds
# a new Embeddings instance and overwrites `embeddings` and `token_embeddings`.
embeddings = Embeddings(config=config)

token_embeddings = embeddings(input_ids)

token_embeddings

Generate fake query, key and value vectors and test scaled dot-product attention. Indeed, if we return the weights instead of the linear combinations of the value vectors, we see all the zeros generated by softmax.

In [None]:
from tensorflow.keras.layers import Dense

In [None]:
# Project the token embeddings with three independent dense layers to obtain
# stand-in query, key and value vectors. `config.hidden_size` is used (rather
# than the DistilBERT-specific `config.dim`) to match the rest of the notebook
# and to keep working if `model_ckpt` is switched to another architecture.
fake_q = Dense(units=config.hidden_size)(token_embeddings)
fake_k = Dense(units=config.hidden_size)(token_embeddings)
fake_v = Dense(units=config.hidden_size)(token_embeddings)

# Ask for the attention weights rather than the weighted sum of the values,
# so that the zeros produced by the masking are visible in the output.
masked_scaled_dot_product_attention(fake_q, fake_k, fake_v, return_weights=True)

## Masked single-head self-attention layer

In [None]:
from decoder import MaskedAttentionHead

In [None]:
# With one head, the head dimension is the whole hidden size.
single_head_dim = config.hidden_size // 1

masked_attention_head = MaskedAttentionHead(
    embed_dim=config.hidden_size,
    head_dim=single_head_dim,
)

# Run the single masked head on the token embeddings.
masked_attention_head(token_embeddings)

## Masked multi-head self-attention layer

In [None]:
from decoder import MaskedMultiHeadAttention

In [None]:
# Build the masked multi-head self-attention layer from the model's config.
masked_multi_head_attention = MaskedMultiHeadAttention(config=config)

# Apply it to the token embeddings. The result is reused further down as the
# decoder hidden states fed to the encoder-decoder attention layers.
decoder_hidden_states = masked_multi_head_attention(token_embeddings)

decoder_hidden_states

## Encoder-decoder single-head attention layer

The encoder-decoder single-head attention layer is an attention layer that computes attention weights between the hidden states coming from the decoder (used as query vectors) and the key vectors coming from the encoder. These weights are then used to take linear combinations of the value vectors __coming from the encoder__.

Because sequences coming from the decoder and the encoder inputs can have different lengths, in general the weights matrix will be rectangular rather than square (as in self-attention). On the other hand, because similarity is still based on computing dot products, after projecting down to the head dimension the query, key and value vectors must have the same size, but this need not be true before the projection!

In [None]:
# Display the hidden (embedding) size stored in the model's config.
config.hidden_size

In [None]:
# Read the number of attention heads from the config instead of hard-coding
# it, so the head dimension stays consistent with the chosen checkpoint.
n_heads = config.num_attention_heads

# Each head works on an equal share of the hidden dimension.
head_dim = config.hidden_size // n_heads

head_dim

Generate fake key and value vectors coming from the encoder. For simplicity, we are assuming the encoder and decoder use embeddings with the same dimension.

In [None]:
# The encoder and decoder inputs must share the same batch size, otherwise
# the scaled dot-product attention computation fails (mismatch on the batch
# dimension).
n_sequences_encoder = input_ids.shape[0]
seq_len_encoder = 5  # Different sequence length w.r.t. the decoder.

# Common shape of the fake encoder outputs:
# (batch size, encoder sequence length, hidden size).
fake_encoder_shape = (n_sequences_encoder, seq_len_encoder, config.hidden_size)

fake_encoder_k = tf.random.normal(shape=fake_encoder_shape)
fake_encoder_v = tf.random.normal(shape=fake_encoder_shape)

Project query, key and value vectors to the head dimension using dense layers.

In [None]:
# One independent projection to the head dimension per role.
query_projection = Dense(units=head_dim)
key_projection = Dense(units=head_dim)
value_projection = Dense(units=head_dim)

# Queries come from the decoder hidden states, keys and values from the
# (fake) encoder outputs.
head_q = query_projection(decoder_hidden_states)
head_k = key_projection(fake_encoder_k)
head_v = value_projection(fake_encoder_v)

Compute scaled dot-product attention between the key vectors coming from the encoder and the hidden states coming from the decoder (used as the query vectors). Indeed the weights matrix is rectangular.

In [None]:
from utils import scaled_dot_product_attention

In [None]:
# Request the attention weights themselves.
# Output shape: (batch_size, seq_len_decoder, seq_len_encoder).
cross_attention_weights = scaled_dot_product_attention(
    query=head_q, key=head_k, value=head_v, return_weights=True
)

cross_attention_weights

Linear combination of the encoder value vectors with weights equal to the attention weights.

In [None]:
# Same call without `return_weights`: returns the attention output.
# Output shape: (batch_size, seq_len_decoder, head_dim).
cross_attention_output = scaled_dot_product_attention(
    query=head_q, key=head_k, value=head_v
)

cross_attention_output

Test the class defined in the module.

In [None]:
from decoder import EncoderDecoderAttentionHead

In [None]:
# Single encoder-decoder attention head projecting to `head_dim`.
encoder_decoder_att_head = EncoderDecoderAttentionHead(head_dim=head_dim)

# Decoder hidden states act as queries; keys and values are the fake
# encoder outputs.
single_head_output = encoder_decoder_att_head(
    decoder_hidden_state=decoder_hidden_states,
    encoder_k=fake_encoder_k,
    encoder_v=fake_encoder_v,
)

single_head_output

## Encoder-decoder multi-head attention layer

Simply test the class defined in the module.

In [None]:
from decoder import EncoderDecoderMultiHeadAttention

In [None]:
# Multi-head encoder-decoder attention layer, configured from the model config.
encoder_decoder_multi_head_att = EncoderDecoderMultiHeadAttention(config=config)

# Same inputs as for the single head: decoder hidden states plus the fake
# encoder keys and values.
multi_head_output = encoder_decoder_multi_head_att(
    decoder_hidden_state=decoder_hidden_states,
    encoder_k=fake_encoder_k,
    encoder_v=fake_encoder_v,
)

multi_head_output

## Test a single decoder layer

In [None]:
from decoder import TransformerDecoderLayer

In [None]:
# One decoder layer, configured from the model config.
decoder_layer = TransformerDecoderLayer(config=config)

# Feed it the token embeddings together with the fake encoder keys/values.
decoder_layer_output = decoder_layer(token_embeddings, fake_encoder_k, fake_encoder_v)

decoder_layer_output

## Test a full decoder

A full decoder is given by an embedding layer (same architecture as the one used for the encoder) followed by a stack (sequence) of decoder layers, as seen above.

In [None]:
from decoder import TransformerDecoder

In [None]:
# Full decoder, configured from the model config.
decoder = TransformerDecoder(config=config)

# The full decoder takes the raw token ids (not the embeddings), plus the
# fake encoder keys/values.
decoder_output = decoder(input_ids, fake_encoder_k, fake_encoder_v)

decoder_output