In [1]:
# Transformer implementation in Keras and TensorFlow 2.x

In [2]:
# https://keras.io/examples/nlp/text_classification_with_transformer/

In [3]:
# test and migrate to AddOns on TensorFlow 1.15 for Cloud TPU

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [6]:
# transformer block as a layer

In [5]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: multi-head self-attention plus a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization. The output has the same shape as the input.

    Args:
        embed_dim: Size of the last axis of the input; also used as the
            attention ``key_dim`` and as the width of the final dense layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard ``keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the layer can be called without it and Keras decides the mode.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(TransformerBlock, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [7]:
# token-and-position embedding: a token embedding and a learned position embedding, summed element-wise

In [8]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard ``keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can rebuild the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        Args:
            x: Tensor of token ids; the last axis is the sequence axis.

        Returns:
            Tensor with one extra trailing axis of size ``embed_dim``.
        """
        # Use the runtime sequence length, not the constructor's maxlen, so
        # the name no longer shadows the constructor argument.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(positions)
        token_vectors = self.token_emb(x)
        # position_vectors has no batch axis; the addition broadcasts it.
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [9]:
# download and prepare dataset

In [10]:
# Vocabulary is capped at the 20k most frequent words.
vocab_size = 20000
# Every review is padded/truncated to 200 tokens.
maxlen = 200

(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

# Bring both splits to a fixed width of `maxlen` in one pass.
x_train, x_val = (
    keras.preprocessing.sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_val)
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


25000 Training sequences
25000 Validation sequences


In [13]:
# Sanity check after padding: features should be (n_reviews, maxlen), labels (n_reviews,).
x_train.shape, y_train.shape

((25000, 200), (25000,))

In [14]:
# Same check for the validation split: features (n_reviews, maxlen), labels (n_reviews,).
x_val.shape, y_val.shape

((25000, 200), (25000,))

In [18]:
# feature is an array of 200 integer token ids for one IMDB review (padded/truncated to maxlen by pad_sequences);
# these are raw vocabulary indices - token and position embeddings are only applied later, inside the model
x_train[0]

array([    5,    25,   100,    43,   838,   112,    50,   670,     2,
           9,    35,   480,   284,     5,   150,     4,   172,   112,
         167,     2,   336,   385,    39,     4,   172,  4536,  1111,
          17,   546,    38,    13,   447,     4,   192,    50,    16,
           6,   147,  2025,    19,    14,    22,     4,  1920,  4613,
         469,     4,    22,    71,    87,    12,    16,    43,   530,
          38,    76,    15,    13,  1247,     4,    22,    17,   515,
          17,    12,    16,   626,    18, 19193,     5,    62,   386,
          12,     8,   316,     8,   106,     5,     4,  2223,  5244,
          16,   480,    66,  3785,    33,     4,   130,    12,    16,
          38,   619,     5,    25,   124,    51,    36,   135,    48,
          25,  1415,    33,     6,    22,    12,   215,    28,    77,
          52,     5,    14,   407,    16,    82, 10311,     8,     4,
         107,   117,  5952,    15,   256,     4,     2,     7,  3766,
           5,   723,

In [19]:
# target is an integer class label (not a Python bool) encoding the review's sentiment;
# presumably 1 = positive review and 0 = negative - confirm against the Keras IMDB dataset docs
y_train[0]

1

In [20]:
# Create classifier model using transformer layer.
# Transformer layer outputs one vector for each time step of our input sequence.
# Here, we take the mean across all time steps and use a feed forward network on top of it to classify text.

In [21]:
# Hyperparameters of the transformer encoder.
embed_dim = 32  # size of each token embedding vector
num_heads = 2  # attention heads in the block
ff_dim = 32  # hidden width of the block's feed-forward network

# Encoder: token ids -> embeddings -> one transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classifier head, listed in application order; the last layer yields the
# two-class softmax output.
classifier_head = (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
    layers.Dense(2, activation="softmax"),
)
for head_layer in classifier_head[:-1]:
    x = head_layer(x)
outputs = classifier_head[-1](x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [22]:
# train and evaluate

In [23]:
# Integer labels with a 2-way softmax output call for the sparse categorical loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Train for two epochs, validating on the held-out split after each one.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_val, y_val),
)

Epoch 1/2
Epoch 2/2


In [24]:
# So far, the demo from Google (the Keras example linked above) used a transformer encoder
# to go from a sequence to a class (sentiment classification over IMDB reviews).
# Now break the original code apart, step by step, to examine the intermediate results.

In [25]:
# model setup
# Same values as in the model-building cell above, repeated so the step-by-step
# walkthrough below can be read on its own.
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

In [27]:
# input layer for Keras functional
# Symbolic input for a batch of sequences of `maxlen` token ids.
# NOTE: this rebinds `inputs`; the `model` built earlier keeps its own input tensor.
inputs = layers.Input(shape=(maxlen,))
inputs

<KerasTensor: shape=(None, 200) dtype=float32 (created by layer 'input_3')>

In [28]:
# NOTE: creates a fresh, untrained embedding layer and rebinds `embedding_layer`;
# it is not the instance that was trained as part of `model` above.
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
# Each token id becomes an `embed_dim`-sized vector (token + position embedding).
embedded_inputs = embedding_layer(inputs)
embedded_inputs

<KerasTensor: shape=(None, 200, 32) dtype=float32 (created by layer 'token_and_position_embedding_1')>

In [29]:
# NOTE: creates a fresh, untrained block and rebinds `transformer_block`;
# it is not the instance that was trained as part of `model` above.
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# The block preserves the shape of its input (see the recorded output below).
transformer_output = transformer_block(embedded_inputs)
transformer_output

<KerasTensor: shape=(None, 200, 32) dtype=float32 (created by layer 'transformer_block_1')>

In [31]:
# Mean over the time-step axis: one `embed_dim`-sized vector per review.
averaged_output = layers.GlobalAveragePooling1D()(transformer_output)
averaged_output

<KerasTensor: shape=(None, 32) dtype=float32 (created by layer 'global_average_pooling1d_2')>

In [32]:
# Dropout with rate 0.1 on the pooled vector; the shape is unchanged.
first_dropout_output = layers.Dropout(0.1)(averaged_output)
first_dropout_output

<KerasTensor: shape=(None, 32) dtype=float32 (created by layer 'dropout_6')>

In [33]:
# Hidden dense layer of the classifier head: 20 units with ReLU activation.
first_dense_output = layers.Dense(20, activation="relu")(first_dropout_output)
first_dense_output

<KerasTensor: shape=(None, 20) dtype=float32 (created by layer 'dense_6')>

In [34]:
# Second dropout (rate 0.1), applied to the hidden dense output; the shape is unchanged.
second_dropout_output = layers.Dropout(0.1)(first_dense_output)
second_dropout_output

<KerasTensor: shape=(None, 20) dtype=float32 (created by layer 'dropout_7')>

In [35]:
# Final layer: a softmax over 2 classes, i.e. one probability per class for each review.
model_output = layers.Dense(2, activation="softmax")(second_dropout_output)
model_output

<KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'dense_7')>