# Chp 11: 11.4 Part 3 The Transformer Architecture

In [11]:
def setup():
    import tensorflow as tf
    from tensorflow import keras

    # every layer uses a 16-bit compute dtype and float32 variable dtype by default.
    # most of the forward pass of the model will be done in float16,
    # (with the exception of numerically unstable operations like softmax),
    # while the weights of the model will be stored and updated in float32.
    keras.mixed_precision.set_global_policy("mixed_float16")
    print(tf.keras.mixed_precision.global_policy())
    
setup()

<Policy "mixed_float16">


In [12]:
def HR():
    # print char * numeric
    print('-' * 80)

In [3]:
def listing11_12():
    import os

    dirpath = 'aclImdb'
    if not os.path.isdir(dirpath):
        print(f'{dirpath} not found, creating directory')
        HR()
        try:
            !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
            !tar -xf aclImdb_v1.tar.gz
            !rm -r aclImdb/train/unsup
        except Exception as ex:
            print(f"Not able to create directory due to error {ex}")
            
listing11_12()

aclImdb not found, creating directory
--------------------------------------------------------------------------------
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  64.8M      0  0:00:01  0:00:01 --:--:-- 64.8M


In [4]:
# dict
options = {
    'batch_size': 32,
    'max_length' : 600,
    'max_tokens' : 20_000
}

In [5]:
# Preparing the data
def listing11_13():
    import os, pathlib, shutil, random
    from tensorflow import keras

    dirpath = 'aclImdb/val'
    if os.path.isdir(dirpath):
        print(f"{dirpath} already exists")
    else:
        print(f"Prepare a validation set by setting apart 20% of the training text files in a new directory, {dirpath}")
        base_dir = pathlib.Path("aclImdb")
        val_dir = base_dir / "val"
        train_dir = base_dir / "train"
        for category in ("neg", "pos"):
            os.makedirs(val_dir / category, exist_ok=True)
            files = os.listdir(train_dir / category)

            # Shuffle the list of training files using a seed, to ensure
            # we get the same validation set every time we run the code
            random.Random(1337).shuffle(files)

            # Take 20% of the training files to use for validation
            num_val_samples = int(0.2 * len(files))
            val_files = files[-num_val_samples:]

            # Move the files to aclImdb/val/neg and aclImdb/val/pos
            for fname in val_files:
                shutil.move(train_dir / category / fname,
                            val_dir / category / fname)

# This should be its own function, since the action is conditional
# and we want to be able to treat it as so via control-flow eventually

listing11_13()

Prepare a validation set by setting apart 20% of the training text files in a new directory, aclImdb/val


In [6]:
# Vectorizing the data
def listing11_14():
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    train_ds = keras.preprocessing.text_dataset_from_directory(
        "aclImdb/train", batch_size=options['batch_size']
    )
    val_ds = keras.preprocessing.text_dataset_from_directory(
        "aclImdb/val", batch_size=options['batch_size']
    )
    test_ds = keras.preprocessing.text_dataset_from_directory(
        "aclImdb/test", batch_size=options['batch_size']
    )
    text_only_train_ds = train_ds.map(lambda x, y: x)


    # Preparing integer sequence datasets
    text_vectorization = layers.experimental.preprocessing.TextVectorization(
        max_tokens=options['max_tokens'],
        output_mode="int",
        output_sequence_length=options['max_length'],
    )
    text_vectorization.adapt(text_only_train_ds)

    int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
    int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
    int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

    HR()
    print(type(int_train_ds))
    print(int_train_ds)
    HR()
    return int_train_ds, int_val_ds, int_test_ds

int_train_ds, int_val_ds, int_test_ds = listing11_14()

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
--------------------------------------------------------------------------------
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>
<MapDataset shapes: ((None, 600), (None,)), types: (tf.int64, tf.int32)>
--------------------------------------------------------------------------------


In [7]:
# Listing 11.23 Transformer encoder implemented as a subclassed Layer
# p. 392

# The encoder part can be used for text classification — it’s a very generic 
# module that ingests a sequence and learns to turn it into a more useful 
# representation. Implement a Transformer encoder and try it on the movie 
# review sentiment classification task.

def listing11_23():
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    class TransformerEncoder(layers.Layer):
        def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
            super().__init__(**kwargs)

            # Size of the input token vectors
            self.embed_dim = embed_dim
            # Size of the inner dense layer
            self.dense_dim = dense_dim
            # Number of attention heads
            self.num_heads = num_heads
            self.attention = layers.MultiHeadAttention(
                num_heads=num_heads, key_dim=embed_dim)
            self.dense_proj = keras.Sequential(
                [layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),]
            )

            # The normalization layers we’re using here aren’t BatchNormalization 
            # layers like those you’ve used before in image models. That’s 
            # because BatchNormalization doesn’t work well for sequence data. 
            # Instead, we’re using the LayerNormalization layer, which normalizes 
            # each sequence independently from other sequences in the batch.
            self.layernorm_1 = layers.LayerNormalization()
            self.layernorm_2 = layers.LayerNormalization()

        # Computation goes in call
        def call(self, inputs, mask=None):
            # The mask that will be generated by the Embedding layer will be 
            # 2D, but the attention layer expects to be 3D or 4D, so we expand 
            # its rank.
            if mask is not None:
                mask = mask[:, tf.newaxis, :]
            attention_output = self.attention(
                inputs, inputs, attention_mask=mask)
            proj_input = self.layernorm_1(inputs + attention_output)
            proj_output = self.dense_proj(proj_input)
            return self.layernorm_2(proj_input + proj_output)

        # Serialization so we can save the model
        def get_config(self):
            config = super().get_config()
            config.update({
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "dense_dim": self.dense_dim,
            })
            return config

    return TransformerEncoder

TransformerEncoder = listing11_23()

print(type(TransformerEncoder))

<class 'type'>


---

This next section is about "sequence models". 

Word order is important, and the Transformer was a sequence-processing architecture, originally developed for machine translation. 

However, the Transformer encoder here isn't a sequence model at all.

It’s composed of dense layers, which process sequence tokens independently from each other, and an attention layer, which looks at the tokens as a set. 

You could change the order of the tokens in a sequence, and you’d get the exact same pairwise attention scores and the exact same context-aware representations. 

If you were to completely scramble the words in every movie review, the model wouldn’t notice, and you’d still get the exact same accuracy. 

Self-attention is a set-processing mechanism, focused on the relationships between pairs of sequence elements (see figure 11.10) — it’s blind to whether these elements occur at the beginning, at the end, or in the middle of a sequence. 

So, why do we say that Transformer is a sequence model, then? And how could it possibly be good for machine translation if it doesn’t look at word order? 

The Transformer is a hybrid approach, that is technically order-agnostic, but that manually injects order information in the representations it processes. This is the missing ingredient! It’s called "positional encoding". It has both word-order awareness and context awareness.

In [15]:
# Listing 11.24 Text classification model that combines the Transformer encoder 
# and a pooling layer
# We can use the TransformerEncoder to assemble a text-classification model 
# similar to the GRU-based one seen previously.
# p.394

# This example is flawed, as noted above.

def listing11_24():
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    vocab_size = 20000
    embed_dim = 256
    num_heads = 2
    dense_dim = 32


    # NOTE: This Transformer encoder is NOT a sequence model at all.
    # It’s composed of dense layers, which process sequence tokens 
    # independently from each other, and an attention layer, which looks at 
    # the tokens as a set. You could change the order of the tokens in a 
    # sequence, and you’d get the exact same pairwise attention scores and 
    # the exact same context-aware representations. If you were to completely 
    # scramble the words in every movie review, the model wouldn’t notice, 
    # and you’d still get the exact same accuracy. 

    inputs = keras.Input(shape=(None,), dtype="int64")
    x = layers.Embedding(vocab_size, embed_dim)(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
    # Since TransformerEncoder returns full sequences, we need to reduce each 
    # sequence to a single vector for classification, via a global pooling layer.
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                loss="binary_crossentropy",
                metrics=["accuracy"])
    model.summary()


    # Training and evaluating the Transformer encoder based model
    callbacks = [
        keras.callbacks.ModelCheckpoint("transformer_encoder.keras",
                                        save_best_only=True)
    ]
    
    model.fit(
        int_train_ds, 
        validation_data=int_val_ds, 
        # epochs=20,
        epochs=5, 
        callbacks=callbacks
    )
    
    model = keras.models.load_model(
        "transformer_encoder.keras",
        custom_objects={"TransformerEncoder": TransformerEncoder})
    print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

listing11_24()


# Model: "model"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# input_1 (InputLayer)         [(None, None)]            0         
# _________________________________________________________________
# embedding (Embedding)        (None, None, 256)         5120000   
# _________________________________________________________________
# transformer_encoder (Transfo (None, None, 256)         543776    
# _________________________________________________________________
# global_max_pooling1d (Global (None, 256)               0         
# _________________________________________________________________
# dropout (Dropout)            (None, 256)               0         
# _________________________________________________________________
# dense_2 (Dense)              (None, 1)                 257       
# =================================================================
# Total params: 5,664,033
# Trainable params: 5,664,033
# Non-trainable params: 0
# _________________________________________________________________
# Epoch 1/5
# 625/625 [==============================] - 45s 68ms/step - loss: 0.4814 - accuracy: 0.7789 - val_loss: 0.3244 - val_accuracy: 0.8620
# Epoch 2/5
# 625/625 [==============================] - 42s 67ms/step - loss: 0.3147 - accuracy: 0.8652 - val_loss: 0.2880 - val_accuracy: 0.8804
# Epoch 3/5
# 625/625 [==============================] - 41s 66ms/step - loss: 0.2370 - accuracy: 0.9049 - val_loss: 0.2856 - val_accuracy: 0.8880
# Epoch 4/5
# 625/625 [==============================] - 42s 67ms/step - loss: 0.1827 - accuracy: 0.9291 - val_loss: 0.3538 - val_accuracy: 0.8814
# Epoch 5/5
# 625/625 [==============================] - 41s 65ms/step - loss: 0.1511 - accuracy: 0.9452 - val_loss: 0.3209 - val_accuracy: 0.8860
# 782/782 [==============================] - 19s 24ms/step - loss: 0.3057 - accuracy: 0.8763

# Test acc: 0.876

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_7 (Embedding)      (None, None, 256)         5120000   
_________________________________________________________________
transformer_encoder_4 (Trans (None, None, 256)         543776    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 256)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 257       
Total params: 5,664,033
Trainable params: 5,664,033
Non-trainable params: 0
_________________________________________________

---
# USING POSITIONAL ENCODING TO REINJECT ORDER INFORMATION, p.395

The idea behind positional encoding is very simple: to give the model access to word order information, we’re going to add to each word embedding the word’s position in the sentence. 

Our input word embeddings will have two components: 

1. The usual **word vector**, which represents the word independently of any specific context.

2. The **position vector**, which represents the position of the word in the current sentence. Hopefully, the model will then figure out how to best leverage this additional information.

The simplest scheme you could come up with would be to concatenate the word’s position to its embedding vector. You’d add a "position" axis to the vector, and fill it with 0 for the first word in the sequence, 1 for the second one, and so on.

That may not be ideal, however, because your positions can potentially be very large integers, which will disrupt the range of values in the embedding vector. As you know, neural networks don’t like very large input values, or discrete input distributions.

The original "Attention is all you need paper" used an interesting trick to encode word positions: it added to the word embeddings a vector containing values in the range [-1, 1] that varied cyclically depending on the position (it used cosine functions to achieve this). This trick offers a way to uniquely characterize any integer in a large range via of vector of small values. 

It’s clever, but it’s not what we’re going to use in our case. We’ll do something simpler and more effective: we’ll just learn position embedding vectors, just the same way we learn to embed word indices. We’ll then proceed to add our position embeddings to the corresponding word embeddings, to obtain a position-aware word embedding. This technique is called "positional embedding". Let’s implement it:

In [18]:
# Using positional encoding to reinject order information
# Implementing positional embedding as a subclassed layer
# Listing 11.26 Implementing positional embedding as a subclassed layer

def listing11_26():
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    # Use this PositionEmbedding layer just like a regular Embedding layer.
    class PositionalEmbedding(layers.Layer):
        # A downside of position embeddings is that the sequence length needs to be known in advance.
        def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
            super().__init__(**kwargs)
            # Embedding layer for the token indices
            self.token_embeddings = layers.Embedding(
                input_dim=input_dim, output_dim=output_dim)
            # Embedding layer for the token positions
            self.position_embeddings = layers.Embedding(
                input_dim=sequence_length, output_dim=output_dim)
            self.sequence_length = sequence_length
            self.input_dim = input_dim
            self.output_dim = output_dim

        def call(self, inputs):
            length = tf.shape(inputs)[-1]
            positions = tf.range(start=0, limit=length, delta=1)
            embedded_tokens = self.token_embeddings(inputs)
            embedded_positions = self.position_embeddings(positions)
            # Add both embedding vectors together
            return embedded_tokens + embedded_positions

        # Like the Embedding layer, this layer should be able to generate a mask 
        # so we can ignore padding 0s in the inputs. The compute_mask method 
        # will called automatically by the framework and the mask will get 
        # propagated to the next layer.
        def compute_mask(self, inputs, mask=None):
            return tf.math.not_equal(inputs, 0)

        # Implement serialization so we can save the model.
        # Note on saving custom layers
        # When you write custom layers, make sure to implement the get_config 
        # method: this enables the layer to be reinstantiated from its config 
        # dict, which is useful during model saving and loading. The method 
        # should return a Python dict that contains the values of the 
        # constructor arguments used to create the layer.
        def get_config(self):
            config = super().get_config()
            config.update({
                "output_dim": self.output_dim,
                "sequence_length": self.sequence_length,
                "input_dim": self.input_dim,
            })
            return config

    return PositionalEmbedding

PositionalEmbedding = listing11_26()

In [19]:
# Listing 11.27 Text classification model that combines positional embedding, 
# the Transformer encoder, and a pooling layer
# Putting it all together: a text-classification Transformer
# p.397

def listing11_27():
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    vocab_size = 20_000
    sequence_length = 600
    embed_dim = 256
    num_heads = 2
    dense_dim = 32


    inputs = keras.Input(shape=(None,), dtype="int64")
    
    # Using our new positional embedding layer
    x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)
    
    x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, outputs)
    
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    model.summary()

    callbacks = [
        keras.callbacks.ModelCheckpoint(
            "full_transformer_encoder.keras",
            save_best_only=True
        )
    ]
    
    model.fit(
        int_train_ds, 
        validation_data=int_val_ds, 
        #epochs=20, 
        epochs = 5,
        callbacks=callbacks
    )
    
    model = keras.models.load_model(
        "full_transformer_encoder.keras",
        custom_objects={"TransformerEncoder": TransformerEncoder,
                        "PositionalEmbedding": PositionalEmbedding})
    
    HR()
    print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

listing11_27()


# Model: "model_6"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# input_7 (InputLayer)         [(None, None)]            0         
# _________________________________________________________________
# positional_embedding_2 (Posi (None, None, 256)         5273600   
# _________________________________________________________________
# transformer_encoder_6 (Trans (None, None, 256)         543776    
# _________________________________________________________________
# global_max_pooling1d_6 (Glob (None, 256)               0         
# _________________________________________________________________
# dropout_6 (Dropout)          (None, 256)               0         
# _________________________________________________________________
# dense_28 (Dense)             (None, 1)                 257       
# =================================================================
# Total params: 5,817,633
# Trainable params: 5,817,633
# Non-trainable params: 0
# _________________________________________________________________
# Epoch 1/5
# 625/625 [==============================] - 46s 70ms/step - loss: 0.4845 - accuracy: 0.7732 - val_loss: 0.2747 - val_accuracy: 0.8906
# Epoch 2/5
# 625/625 [==============================] - 46s 73ms/step - loss: 0.2388 - accuracy: 0.9100 - val_loss: 0.2596 - val_accuracy: 0.8880
# Epoch 3/5
# 625/625 [==============================] - 47s 75ms/step - loss: 0.1819 - accuracy: 0.9337 - val_loss: 0.2595 - val_accuracy: 0.8988
# Epoch 4/5
# 625/625 [==============================] - 47s 75ms/step - loss: 0.1508 - accuracy: 0.9442 - val_loss: 0.2965 - val_accuracy: 0.8960
# Epoch 5/5
# 625/625 [==============================] - 47s 75ms/step - loss: 0.1266 - accuracy: 0.9530 - val_loss: 0.3151 - val_accuracy: 0.8796
# 782/782 [==============================] - 21s 27ms/step - loss: 0.2965 - accuracy: 0.8804

# Test acc: 0.880


Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
positional_embedding_2 (Posi (None, None, 256)         5273600   
_________________________________________________________________
transformer_encoder_6 (Trans (None, None, 256)         543776    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 256)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 257       
Total params: 5,817,633
Trainable params: 5,817,633
Non-trainable params: 0
_________________________________________________

We get around 88.3% test accuracy, a solid improvement that clearly demonstrates the value of word order information for text classification. This is our best sequence model so far! 

However, it’s still one notch below the bag-of-words approach.

---
# Up to 11.4.4 When to use sequence models over bag-of-words models?