# Transformer from Scratch

Author: Deeepwin, Jason Brownlee [(Machine Learning Mastery)](https://machinelearningmastery.com)   
Date: 05.11.2022 
***

## Imports

In [1]:
import numpy as np

import tensorflow as tf

from tensorflow import math, matmul, reshape, shape, transpose, cast, float32, linalg, ones, maximum, newaxis
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, LayerNormalization, Layer, ReLU, Dropout, TextVectorization, Embedding, Input
 
from keras.backend import softmax


## Configuration

In [6]:
num_heads       = 8     # Number of self-attention heads
d_k             = 64    # Dimensionality of the linearly projected queries and keys
d_v             = 64    # Dimensionality of the linearly projected values
d_ff            = 2048  # Dimensionality of the inner fully connected layer
d_model         = 512   # Dimensionality of the model sub-layers' outputs
enc_num_layers  = 6     # Number of layers in the encoder stack
 
dropout_rate = 0.1      # Frequency of dropping the input units in the dropout layers

enc_vocab_size = 20     # Vocabulary size for the encoder
dec_vocab_size = 20     # Vocabulary size for the decoder
 
enc_seq_length = 5      # Maximum length of the input sequence
dec_seq_length = 5      # Maximum length of the target sequence

batch_size = 32         # Batch size from the training process

## Attention Layer

See [(Link)](https://machinelearningmastery.com/how-to-implement-multi-head-attention-from-scratch-in-tensorflow-and-keras)

In [7]:
# Implementing the Scaled-Dot Product Attention
class DotProductAttention(Layer):

    def __init__(self, **kwargs):
    
        super(DotProductAttention, self).__init__(**kwargs)
 
    def call(self, queries, keys, values, d_k, mask=None):
    
        # Scoring the queries against the keys after transposing the latter, and scaling
        scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
 
        # Apply mask to the attention scores
        if mask is not None:
            scores += -1e9 * mask
 
        # Computing the weights by a softmax operation
        weights = softmax(scores)
 
        # Computing the attention by a weighted sum of the value vectors
        return matmul(weights, values)
 
# Implementing the Multi-Head Attention
class MultiHeadAttention(Layer):

    def __init__(self, num_heads, d_k, d_v, d_model, **kwargs):
    
        super(MultiHeadAttention, self).__init__(**kwargs)

        self.attention = DotProductAttention()      # Scaled dot product attention
        self.heads = num_heads                      # Number of attention heads to use
        self.d_k = d_k                              # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v                              # Dimensionality of the linearly projected values
        self.d_model = d_model                      # Dimensionality of the model
        self.W_q = Dense(d_k)                       # Learned projection matrix for the queries
        self.W_k = Dense(d_k)                       # Learned projection matrix for the keys
        self.W_v = Dense(d_v)                       # Learned projection matrix for the values
        self.W_o = Dense(d_model)                   # Learned projection matrix for the multi-head output
 
    def reshape_tensor(self, x, heads, flag):
        if flag:
            # Tensor shape after reshaping and transposing: (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations: (batch_size, seq_length, d_k)
            x = transpose(x, perm=(0, 2, 1, 3))
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k))
        return x
 
    def call(self, queries, keys, values, mask=None):
 
        # Rearrange the queries to be able to compute all heads in parallel
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
 
        # Rearrange the keys to be able to compute all heads in parallel
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
 
        # Rearrange the values to be able to compute all heads in parallel
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
 
        # Compute the multi-head attention output using the reshaped queries, keys and values
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
 
        # Rearrange back the output into concatenated form
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Resulting tensor shape: (batch_size, input_seq_length, d_v)
 
        # Apply one final linear projection to the output to generate the multi-head attention
        # Resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)


### Test

In [8]:
from numpy import random
 
queries     = random.random((batch_size, enc_seq_length, d_k))
keys        = random.random((batch_size, enc_seq_length, d_k))
values      = random.random((batch_size, enc_seq_length, d_v))
 
multihead_attention = MultiHeadAttention(num_heads, d_k, d_v, d_model)

print(multihead_attention(queries, keys, values))

tf.Tensor(
[[[-1.17719315e-01 -1.64581016e-01 -1.85618415e-01 ... -1.17994696e-02
   -3.56539190e-02  8.62294883e-02]
  [-1.15477055e-01 -1.62852913e-01 -1.86942145e-01 ... -1.32803302e-02
   -3.54596786e-02  8.49324912e-02]
  [-1.18289255e-01 -1.62794963e-01 -1.87313288e-01 ... -1.18084215e-02
   -3.63811441e-02  8.45330209e-02]
  [-1.16531700e-01 -1.63553014e-01 -1.85616031e-01 ... -1.05693638e-02
   -3.55419815e-02  8.44776183e-02]
  [-1.17834754e-01 -1.65324256e-01 -1.84310168e-01 ... -1.20349824e-02
   -3.63635682e-02  8.58393982e-02]]

 [[-1.31380200e-01 -1.43474102e-01 -2.00295374e-01 ... -5.29597700e-02
    3.27825896e-03  3.85836735e-02]
  [-1.30766556e-01 -1.42998233e-01 -2.01179400e-01 ... -5.33635132e-02
    4.57827421e-03  3.90004218e-02]
  [-1.31485119e-01 -1.40581980e-01 -2.00538576e-01 ... -5.01947105e-02
    4.36001318e-03  3.86535190e-02]
  [-1.31157145e-01 -1.43689126e-01 -1.99828044e-01 ... -5.31405695e-02
    4.88769542e-03  3.94294448e-02]
  [-1.31040215e-01 -1.41

## Embedding Layer

See [(Link)](https://machinelearningmastery.com/the-transformer-positional-encoding-layer-in-keras-part-2/)

In [15]:

class PositionEmbeddingFixedWeights(Layer):

    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
    
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)

        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)   
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)     

        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=output_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )
             
    def get_position_encoding(self, seq_len, d, n=10000):
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        return P
 
    def call(self, inputs):        
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

### Test

In [22]:
sentences = [["I am a robot"], ["you too robot"]]
sentence_data = tf.data.Dataset.from_tensor_slices(sentences)

# Create the TextVectorization layer
vectorize_layer = TextVectorization(max_tokens=enc_vocab_size,
                                    output_sequence_length=enc_vocab_size
)

# Train the layer to create a dictionary
vectorize_layer.adapt(sentence_data)

# Convert all sentences to tensors
word_tensors = tf.convert_to_tensor(sentences, dtype=tf.string)

# Use the word tensors to get vectorized phrases
vectorized_words = vectorize_layer(word_tensors)
print("Vocabulary: ", vectorize_layer.get_vocabulary())
print("Vectorized words: ", vectorized_words)

Vocabulary:  ['', '[UNK]', 'robot', 'you', 'too', 'i', 'am', 'a']
Vectorized words:  tf.Tensor(
[[5 6 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [3 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(2, 20), dtype=int64)


In [23]:
output_length = 6

attnisallyouneed_embedding = PositionEmbeddingFixedWeights(enc_seq_length, enc_vocab_size, output_length)
attnisallyouneed_output = attnisallyouneed_embedding(vectorized_words)
print("Output from my_embedded_layer: ", attnisallyouneed_output)

Output from my_embedded_layer:  tf.Tensor(
[[[-0.9589243   1.2836622   0.23000172  1.9731903   0.01077196
    1.9999421 ]
  [ 0.56205547  1.5004725   0.3213085   1.9603932   0.01508068
    1.9999142 ]
  [ 1.566284    0.3377554   0.41192317  1.9433732   0.01938933
    1.999877  ]
  [ 1.0504174  -1.4061394   0.2314966   1.9860148   0.01077211
    1.9999698 ]
  [-0.7568025   0.3463564   0.18459873  1.982814    0.00861763
    1.9999628 ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.        ]
  [ 0.          1.          0.          1.          0.
    1.    

## Encoder

See [(Link)](https://machinelearningmastery.com/implementing-the-transformer-encoder-from-scratch-in-tensorflow-and-keras)

In [16]:
# Implementing the Add & Norm Layer
class AddNormalization(Layer):

    def __init__(self, **kwargs):
    
        super(AddNormalization, self).__init__(**kwargs)
        self.layer_norm = LayerNormalization()  # Layer normalization layer
 
    def call(self, x, sublayer_x):
    
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x
 
        # Apply layer normalization to the sum
        return self.layer_norm(add)
 
# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    
    def __init__(self, d_ff, d_model, **kwargs):
        super(FeedForward, self).__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff)         # First fully connected layer
        self.fully_connected2 = Dense(d_model)      # Second fully connected layer
        self.activation = ReLU()                    # ReLU activation layer
 
    def call(self, x):
        
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)
 
        return self.fully_connected2(self.activation(x_fc1))
 
# Implementing the Encoder Layer
class EncoderLayer(Layer):
    def __init__(self, sequence_length, num_heads, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        self.build(input_shape=[None, sequence_length, d_model])
        self.sequence_length = sequence_length
        self.d_model = d_model
        self.multihead_attention = MultiHeadAttention(num_heads, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
 
    def build_graph(self):
        input_layer = Input(shape=(self.sequence_length, self.d_model))
        return Model(inputs=[input_layer], outputs=self.call(input_layer, None, True))

    def call(self, x, padding_mask, training):
        # Multi-head attention layer
        multihead_output = self.multihead_attention(x, x, x, padding_mask)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Add in a dropout layer
        multihead_output = self.dropout1(multihead_output, training=training)
 
        # Followed by an Add & Norm layer
        addnorm_output = self.add_norm1(x, multihead_output)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Add in another dropout layer
        feedforward_output = self.dropout2(feedforward_output, training=training)
 
        # Followed by another Add & Norm layer
        return self.add_norm2(addnorm_output, feedforward_output)
 
# Implementing the Encoder
class Encoder(Layer):
    def __init__(self, vocab_size, sequence_length, num_heads, d_k, d_v, d_model, d_ff, enc_num_layers, rate, **kwargs):

        super(Encoder, self).__init__(**kwargs)
        
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
        self.dropout = Dropout(rate)
        self.encoder_layer = [EncoderLayer(sequence_length, num_heads, d_k, d_v, d_model, d_ff, rate) for _ in range(enc_num_layers)]
 
    def call(self, input_sentence, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(input_sentence)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
 
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.encoder_layer):
            x = layer(x, padding_mask, training)
 
        return x


### Test

In [25]:
input_seq = random.random((batch_size, enc_seq_length))
 
encoder = Encoder(enc_vocab_size, enc_seq_length, num_heads, d_k, d_v, d_model, d_ff, enc_num_layers, dropout_rate)
print(encoder(input_seq, None, True))

tf.Tensor(
[[[-1.7888652   0.8736814  -0.3116082  ...  0.10796599 -0.62521356
    2.6012537 ]
  [-1.093596    1.2090927  -0.36734146 ... -1.3095893   0.03615656
    1.8673748 ]
  [-0.8884822   0.17601381  0.4815623  ... -1.5222982   0.25411382
    2.2312286 ]
  [-0.34275466  1.1357977   0.00663071 ... -2.01222     0.116634
    2.4326844 ]
  [-0.7382755   0.664318    0.09541241 ... -0.46417865  0.05934018
    2.6176155 ]]

 [[-1.0668635   1.320001   -0.3506786  ... -0.971359   -0.14598404
    1.70889   ]
  [-0.20275056  1.5537518  -0.24691813 ... -0.9730711  -0.6376055
    2.0582087 ]
  [-0.9931612   1.0704647  -0.31210962 ... -0.05847372 -1.0826235
    2.655542  ]
  [ 0.2933821   1.1508018  -0.3418462  ... -0.970481   -0.35228422
    1.9245548 ]
  [-0.79770005  0.9725596   0.85843486 ... -0.62964433 -0.18828501
    1.7818862 ]]

 [[-0.88951075  1.2760794  -0.27132267 ... -1.2069141   0.2240364
    1.4607836 ]
  [-0.27085745  1.2869214   0.4298286  ... -0.97214043  0.52775323
    1.7769

## Decoder

See [(Link)](https://machinelearningmastery.com/implementing-the-transformer-decoder-from-scratch-in-tensorflow-and-keras)   

In [14]:

# Implementing the Decoder Layer
class DecoderLayer(Layer):
    def __init__(self, sequence_length, num_heads, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super(DecoderLayer, self).__init__(**kwargs)
        self.build(input_shape=[None, sequence_length, d_model])
        self.sequence_length = sequence_length
        self.d_model = d_model
        self.multihead_attention1 = MultiHeadAttention(num_heads, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.multihead_attention2 = MultiHeadAttention(num_heads, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()

    def build_graph(self):
        input_layer = Input(shape=(self.sequence_length, self.d_model))
        return Model(inputs=[input_layer], outputs=self.call(input_layer, input_layer, None, None, True))
        
    def call(self, x, encoder_output, lookahead_mask, padding_mask, training):

        # Multi-head attention layer
        multihead_output1 = self.multihead_attention1(x, x, x, lookahead_mask)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Add in a dropout layer
        multihead_output1 = self.dropout1(multihead_output1, training=training)
 
        # Followed by an Add & Norm layer
        addnorm_output1 = self.add_norm1(x, multihead_output1)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Followed by another multi-head attention layer
        multihead_output2 = self.multihead_attention2(addnorm_output1, encoder_output, encoder_output, padding_mask)
 
        # Add in another dropout layer
        multihead_output2 = self.dropout2(multihead_output2, training=training)
 
        # Followed by another Add & Norm layer
        addnorm_output2 = self.add_norm1(addnorm_output1, multihead_output2)
 
        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output2)
        # Expected output shape = (batch_size, sequence_length, d_model)
 
        # Add in another dropout layer
        feedforward_output = self.dropout3(feedforward_output, training=training)
 
        # Followed by another Add & Norm layer
        return self.add_norm3(addnorm_output2, feedforward_output)
 
# Implementing the Decoder
class Decoder(Layer):

    def __init__(self, vocab_size, sequence_length, num_heads, d_k, d_v, d_model, d_ff, enc_num_layers, rate, **kwargs):
    
        super(Decoder, self).__init__(**kwargs)
    
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
        self.dropout = Dropout(rate)
        self.decoder_layer = [DecoderLayer(sequence_length, num_heads, d_k, d_v, d_model, d_ff, rate) for _ in range(enc_num_layers)]
 
    def call(self, output_target, encoder_output, lookahead_mask, padding_mask, training):

        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(output_target)
        # Expected output shape = (number of sentences, sequence_length, d_model)
 
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
 
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.decoder_layer):
            x = layer(x, encoder_output, lookahead_mask, padding_mask, training)
 
        return x

### Test

In [26]:
input_seq = random.random((batch_size, enc_seq_length))
enc_output = random.random((batch_size, enc_seq_length, d_model))
 
decoder = Decoder(dec_vocab_size, enc_seq_length, num_heads, d_k, d_v, d_model, d_ff, enc_num_layers, dropout_rate)
print(decoder(input_seq, enc_output, None, True))

tf.Tensor(
[[[ 2.3162944  -0.49510553  0.39101964 ... -0.47465998 -0.27957925
    1.8352864 ]
  [ 2.4507332  -0.5492203   0.49797055 ... -0.48505977 -0.3319876
    1.8730515 ]
  [ 2.482401   -0.6430672   0.5569278  ... -0.5331913  -0.36845747
    1.8736725 ]
  [ 2.39426    -0.66227424  0.55412394 ... -0.5681245  -0.34679818
    1.8641967 ]
  [ 2.2520072  -0.5293939   0.50959176 ... -0.57794714 -0.27542463
    1.8113252 ]]

 [[ 2.1611907  -0.5928186   0.45438272 ... -0.55519485 -0.4944152
    2.0060837 ]
  [ 2.2760499  -0.65348715  0.5957813  ... -0.5765424  -0.54428375
    2.030274  ]
  [ 2.3119519  -0.7591442   0.68515    ... -0.6004256  -0.54614925
    2.023847  ]
  [ 2.2370732  -0.76918703  0.68510264 ... -0.58897775 -0.50142294
    2.0207336 ]
  [ 2.1316447  -0.6503975   0.60717356 ... -0.5717864  -0.43231025
    1.9990848 ]]

 [[ 2.3316627  -0.6190016   0.3406028  ... -0.28421003 -0.50201535
    1.6642814 ]
  [ 2.4423454  -0.67786455  0.44932386 ... -0.313638   -0.5632055
    1.69

## Transformer Model

See [(Link)](https://machinelearningmastery.com/joining-the-transformer-encoder-and-decoder-and-masking/)

In [6]:

class TransformerModel(Model):
    
    def __init__(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, num_heads, d_k, d_v, d_model, d_ff_inner, enc_num_layers, rate, **kwargs):

        super(TransformerModel, self).__init__(**kwargs)
 
        # Set up the encoder
        self.encoder = Encoder(enc_vocab_size, enc_seq_length, num_heads, d_k, d_v, d_model, d_ff_inner, enc_num_layers, rate)
 
        # Set up the decoder
        self.decoder = Decoder(dec_vocab_size, dec_seq_length, num_heads, d_k, d_v, d_model, d_ff_inner, enc_num_layers, rate)
 
        # Define the final dense layer
        self.model_last_layer = Dense(dec_vocab_size)
 
    def padding_mask(self, input):

        # Create mask which marks the zero padding values in the input by a 1.0
        mask = math.equal(input, 0)
        mask = cast(mask, float32)
 
        # The shape of the mask should be broadcastable to the shape
        # of the attention weights that it will be masking later on
        return mask[:, newaxis, newaxis, :]
 
    def lookahead_mask(self, shape):
        
        # Mask out future entries by marking them with a 1.0
        mask = 1 - linalg.band_part(ones((shape, shape)), -1, 0)
 
        return mask
 
    def call(self, encoder_input, decoder_input, training):
 
        # Create padding mask to mask the encoder inputs and the encoder outputs in the decoder
        enc_padding_mask = self.padding_mask(encoder_input)
 
        # Create and combine padding and look-ahead masks to be fed into the decoder
        dec_in_padding_mask = self.padding_mask(decoder_input)
        dec_in_lookahead_mask = self.lookahead_mask(decoder_input.shape[1])
        dec_in_lookahead_mask = maximum(dec_in_padding_mask, dec_in_lookahead_mask)
 
        # Feed the input into the encoder
        encoder_output = self.encoder(encoder_input, enc_padding_mask, training)
 
        # Feed the encoder output into the decoder
        decoder_output = self.decoder(decoder_input, encoder_output, dec_in_lookahead_mask, enc_padding_mask, training)
 
        # Pass the decoder output through a final dense layer
        model_output = self.model_last_layer(decoder_output)
 
        return model_output


# Show Models

In [9]:
 
encoder = EncoderLayer(enc_seq_length, num_heads, d_k, d_v, d_model, d_ff, dropout_rate);
encoder.build_graph().summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 5, 512)]     0           []                               
                                                                                                  
 multi_head_attention_1 (MultiH  (None, 5, 512)      131776      ['input_2[0][0]',                
 eadAttention)                                                    'input_2[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 dropout_2 (Dropout)            (None, 5, 512)       0           ['multi_head_attention_1[0][0]'] 
                                                                                            

In [12]:
decoder = DecoderLayer(dec_seq_length, num_heads, d_k, d_v, d_model, d_ff, dropout_rate);
decoder.build_graph().summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 5, 512)]     0           []                               
                                                                                                  
 multi_head_attention_4 (MultiH  (None, 5, 512)      131776      ['input_4[0][0]',                
 eadAttention)                                                    'input_4[0][0]',                
                                                                  'input_4[0][0]']                
                                                                                                  
 dropout_7 (Dropout)            (None, 5, 512)       0           ['multi_head_attention_4[0][0]'] 
                                                                                            

## Build Transformer

In [17]:
# Create model
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, num_heads, d_k, d_v, d_model, d_ff, enc_num_layers, dropout_rate)