In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

## Self Attention Calculation

In [2]:
class SelfAttention(Layer):
    """Single-head scaled dot-product self-attention.

    The same input is projected to queries, keys and values by three separate
    Dense layers, and the result is ``softmax(Q K^T / sqrt(d_model)) V``.

    Args:
        d_model: Width of the query/key/value projections, and therefore the
            size of the last axis of the output.
        **kwargs: Forwarded unchanged to the Keras ``Layer`` constructor
            (e.g. ``name``, ``dtype``).
    """

    def __init__(self, d_model, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)

    def call(self, inputs):
        """Let every position of ``inputs`` attend to every position.

        Args:
            inputs: Tensor whose last two axes are (sequence, features),
                e.g. ``(batch, seq_len, features)``.

        Returns:
            Tensor with the same leading axes as ``inputs`` and a last axis
            of size ``d_model``.
        """
        q = self.query_dense(inputs)
        k = self.key_dense(inputs)
        v = self.value_dense(inputs)

        # scores[..., i, j] is the similarity of query i with key j.
        scores = tf.matmul(q, k, transpose_b=True)
        # Cast the scale to the score dtype so the division also works when
        # the layer does not compute in float32.
        scale = tf.math.sqrt(tf.cast(self.d_model, scores.dtype))

        # Normalize over the key axis (the last one) so that the weights of
        # each query sum to 1. Normalizing over axis 1 would instead mix
        # different queries together.
        attention_weights = tf.nn.softmax(scores / scale, axis=-1)

        return tf.matmul(attention_weights, v)

In [3]:
# Smoke test on random data: batch size of 1, sequence length of 60,
# and model dimension of 512.
inputs = tf.random.uniform(shape=(1, 60, 512))
self_attention = SelfAttention(d_model=512)
output = self_attention(inputs)
print(output.shape)

(1, 60, 512)


## Transformer Encoder

In [7]:
class TransformerEncoder(Layer):
    """One Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each sub-layer is wrapped as dropout -> residual add ->
    layer normalization.

    Args:
        d_model: Output width of the feed-forward network; also passed as
            ``key_dim`` to the attention layer.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded unchanged to the Keras ``Layer`` constructor
            (e.g. ``name``, ``dtype``).
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        # NOTE(review): Keras documents `key_dim` as the per-head size, so
        # every head projects to `d_model` dimensions here. Kept as written;
        # confirm whether `d_model // num_heads` was intended.
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    # `call` must be a method of the class (not a function nested inside
    # `__init__`), otherwise Keras never finds it when the layer is invoked.
    def call(self, x, training=None, mask=None):
        """Run the encoder block on ``x``.

        Args:
            x: Input tensor. It is added to the sub-layer outputs, so its
                last axis must match ``d_model``.
            training: Forwarded to the dropout layers. ``None`` leaves the
                decision to Keras.
            mask: Optional mask forwarded to the attention layer as
                ``attention_mask``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_output = self.mha(x, x, x, attention_mask=mask)  # Self attention
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # Residual connection and normalization

        ffn_output = self.ffn(out1)  # Feed forward network
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # Residual connection and normalization

        return out2

In [8]:
# Build one encoder block and a random dummy batch shaped (1, 60, 512);
# the attention mask is left unset.
encoder = TransformerEncoder(d_model=512, num_heads=8, dff=2048)
x = tf.random.uniform(shape=(1, 60, 512))
mask = None

In [10]:
class TransformerDecoder(Layer):
    """One Transformer decoder block.

    Applies, in order: self-attention over the decoder input,
    cross-attention over the encoder output, and a two-layer feed-forward
    network. Each sub-layer is wrapped as dropout -> residual add ->
    layer normalization.

    Args:
        d_model: Output width of the feed-forward network; also passed as
            ``key_dim`` to both attention layers.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded unchanged to the Keras ``Layer`` constructor
            (e.g. ``name``, ``dtype``).
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        # NOTE(review): Keras documents `key_dim` as the per-head size, so
        # every head projects to `d_model` dimensions here. Kept as written;
        # confirm whether `d_model // num_heads` was intended.
        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # One dropout layer per sub-layer; `call` uses all three.
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    # `call` must be a method of the class (not a function nested inside
    # `__init__`), otherwise Keras never finds it when the layer is invoked.
    def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):
        """Run the decoder block.

        Args:
            x: Decoder input tensor. It is added to the sub-layer outputs,
                so its last axis must match ``d_model``.
            enc_output: Encoder output, used as value and key of the
                cross-attention layer.
            training: Forwarded to the dropout layers. ``None`` leaves the
                decision to Keras.
            look_ahead_mask: Optional mask forwarded to the self-attention
                layer as ``attention_mask``.
            padding_mask: Optional mask forwarded to the cross-attention
                layer as ``attention_mask``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Only query and value are passed; the attention layer uses the
        # value as the key when no key is given.
        attn1 = self.mha1(x, x, attention_mask=look_ahead_mask)  # Self attention
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)  # Residual connection and normalization

        attn2 = self.mha2(out1, enc_output, enc_output, attention_mask=padding_mask)  # Cross attention
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)  # Residual connection and normalization

        ffn_output = self.ffn(out2)  # Feed forward network
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)  # Residual connection and normalization

        return out3