In [1]:
import tensorflow as tf
from keras.layers import *

def get_subsequent_mask(seq):
    """Build a square mask that flags, for every step, the steps after it.

    Args:
        seq: Rank-3 tensor or array. Only the size of its second axis is
            used (presumably laid out as (batch, length, features) --
            confirm against the caller).

    Returns:
        A float32 tensor of shape ``(len_s, len_s)`` whose entry ``[i, j]``
        is 1 when ``j > i`` and 0 otherwise (strict upper triangle).

    Raises:
        ValueError: If ``seq.shape`` does not unpack into exactly three dims.

    NOTE(review): here 1 marks the *later* positions. Keras
    ``MultiHeadAttention`` documents ``attention_mask`` with 1 meaning
    "may attend", so this result likely needs to be inverted before being
    passed there -- confirm the intended convention.
    """
    _, len_s, _ = seq.shape
    if len_s is None:
        # The static length is unknown (variable-length input); read the
        # length at run time instead of failing inside tf.ones.
        len_s = tf.shape(seq)[1]
    # band_part(x, -1, 0) keeps the diagonal and everything below it.
    lower_triangle = tf.linalg.band_part(tf.ones((len_s, len_s)), -1, 0)
    return 1 - lower_triangle

def length_to_mask(length, max_len=None, dtype=None):
    """Turn per-sequence lengths into a position mask.

    Args:
        length: 1-D tensor (or array-like) holding one length per sequence.
        max_len: Number of positions in the mask. When ``None`` the largest
            value in ``length`` is used.
        dtype: Optional dtype to cast the boolean mask to.

    Returns:
        A tensor of shape ``(batch, max_len)``. Entry ``[b, t]`` is True
        (or 1 after casting) when ``t >= length[b]``, i.e. for positions at
        or past the end of sequence ``b``.

    Raises:
        AssertionError: If ``length`` is not one dimensional.
    """
    length = tf.convert_to_tensor(length)
    assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
    # Explicit None check: truthiness of a symbolic tensor is undefined, and
    # a caller-supplied 0 should not be silently replaced.
    if max_len is None:
        max_len = tf.reduce_max(length)
    # Use the same dtype as `length` so the comparison never mixes int32/int64.
    positions = tf.range(max_len, dtype=length.dtype)
    # (max_len,) against (batch, 1) broadcasts to (batch, max_len).
    mask = positions >= tf.expand_dims(length, 1)
    if dtype is not None:
        mask = tf.cast(mask, dtype)
    return mask

2024-04-12 15:33:56.795987: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 15:33:56.796038: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 15:33:56.850119: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 15:33:56.970752: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
class TransformerBlock(Layer):
    """Self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input, and the sum is layer-normalized.

    Args:
        d_model: Width of the block's output; also used as the per-head
            ``key_dim`` of the attention layer.
        n_heads: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout rate applied to each sub-layer's output.
        **kwargs: Standard Keras ``Layer`` arguments such as ``name``.
    """

    def __init__(self, d_model, n_heads, dim_feedforward=2048, dropout=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor call.
        self.d_model = d_model
        self.n_heads = n_heads
        self.dim_feedforward = dim_feedforward
        self.dropout_rate = dropout

        self.att = MultiHeadAttention(num_heads=n_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential(
            [Dense(dim_feedforward, activation="relu"),
             Dense(d_model),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def call(self, inputs, training=None, mask=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the surrounding call context.
            mask: Passed unchanged as ``attention_mask`` to the attention
                layer.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Self-attention: the same tensor is query and value (key defaults
        # to value in MultiHeadAttention).
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        out1 = self.layernorm1(inputs + self.dropout1(attn_output, training=training))
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + self.dropout2(ffn_output, training=training))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "d_model": self.d_model,
            "n_heads": self.n_heads,
            "dim_feedforward": self.dim_feedforward,
            "dropout": self.dropout_rate,
        })
        return config

class AttentionModel(tf.keras.Model):
    """Stack of a Dense projection, Transformer blocks and a sigmoid head.

    Args:
        d_model: Units of the first Dense layer and width of every block.
        n_layers: Number of TransformerBlock instances in the stack.
        n_heads: Attention heads per block.
        dropout: Dropout rate handed to every block.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, d_model, n_layers, n_heads, dropout, **kwargs):
        super().__init__(**kwargs)
        # Layers are created in the order they run, which also fixes the
        # auto-generated layer names.
        input_projection = Dense(d_model)
        encoder_blocks = [
            TransformerBlock(d_model, n_heads, dropout=dropout)
            for _ in range(n_layers)
        ]
        output_head = Dense(1, activation='sigmoid')
        self.layers_list = [input_projection, *encoder_blocks, output_head]

    def call(self, x):
        """Feed ``x`` through every layer in ``layers_list`` in order."""
        hidden = x
        for sublayer in self.layers_list:
            hidden = sublayer(hidden)
        return hidden

In [10]:
# Instantiate the model with a single TransformerBlock (n_layers=1).
model = AttentionModel(d_model=128, n_layers=1, n_heads=8, dropout=0.1)

# Binary cross-entropy pairs with the sigmoid-activated final Dense layer
# defined in AttentionModel.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The first two dimensions are left unspecified; the last axis has size 7
# (presumably the per-step feature count -- confirm against the dataset).
model.build(input_shape=(None, None, 7))

# Print the layer table and parameter counts.
model.summary()

Model: "attention_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_54 (Dense)            multiple                  1024      
                                                                 
 transformer_block_22 (Tran  multiple                  1054464   
 sformerBlock)                                                   
                                                                 
 dense_57 (Dense)            multiple                  129       
                                                                 
Total params: 1055617 (4.03 MB)
Trainable params: 1055617 (4.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
