<a href="https://colab.research.google.com/github/erinijapranckeviciene/MF54609_18981_1_20241/blob/main/FC_Chapter11_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Prepare IMDB data as before

In [None]:
!wget https://github.com/erinijapranckeviciene/MF54609_18981_1_20241/raw/refs/heads/main/datasets/RNN/aclImdb.zip
!unzip -qq aclImdb.zip

In [None]:
import os, pathlib, shutil, random
from tensorflow import keras

batch_size = 32

# One labelled text dataset per split directory, built in the order
# train -> val -> test and sharing the same batch size.
split_dirs = ("aclImdb/train", "aclImdb/val", "aclImdb/test")
train_ds, val_ds, test_ds = [
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in split_dirs
]

### Text-only dataset (reviews without labels)

In [None]:
# Drop the target from every (text, target) pair. The resulting text-only
# dataset is the one used to build the dictionary (vocabulary).
text_only_train_ds = train_ds.map(lambda text, _target: text)

# Inspect a single batch: its shape, its dtype and its first element.
for inputs in text_only_train_ds.take(1):
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("inputs[0]:", inputs[0])

### Prepare the integer-encoded dataset

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# max_length is the length of the integer vector each review is encoded into.
# Notes from earlier experiments:
#   - a length of 250 gave ~0.85 accuracy on the test data
#   - a length of 600 with 4 LSTM units did not train at all
#   - the reviews are about 300 words long
max_length = 300
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Build the vocabulary from the label-free training text.
text_vectorization.adapt(text_only_train_ds)


def _vectorize(text, label):
    """Replace the raw text with its integer encoding; pass the label through."""
    return text_vectorization(text), label


int_train_ds = train_ds.map(_vectorize, num_parallel_calls=4)
int_val_ds = val_ds.map(_vectorize, num_parallel_calls=4)
int_test_ds = test_ds.map(_vectorize, num_parallel_calls=4)

### Verify

In [None]:
# Look at one encoded batch: batch shape, first encoded review, first target.
for inputs, targets in int_train_ds.take(1):
    print(inputs.shape)
    print(inputs[0])
    print(targets[0])
    # Keep the first encoded review in test_input so that the tf.one_hot()
    # transformation can be verified with it.
    test_input = inputs[0]

## Transformers — F. Chollet, Chapter 11

#### 11.4 The Transformer architecture

..."Transformers were introduced in the seminal paper “Attention is all you need” by Vaswani et al. The gist of the paper is right there in the title: as it turned out, a simple mechanism called “neural attention” could be used to build powerful sequence models that didn’t feature any recurrent layers or convolution layers."...

### Listing 11.21 Transformer encoder implemented as a subclassed Layer

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class TransformerEncoder(layers.Layer):
  """Transformer encoder block: self-attention followed by a dense projection.

  Both sub-layers are wrapped in a residual connection (input added to the
  sub-layer output) and followed by layer normalization.

  Args:
    embed_dim: Output width of the dense projection; also passed to the
      attention layer as `key_dim`. The residual additions in `call`
      require it to match the last dimension of the inputs.
    dense_dim: Number of units of the hidden ReLU layer in the projection.
    num_heads: Number of attention heads.
    **kwargs: Forwarded unchanged to the base `Layer` constructor.
  """

  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    # Stored so that get_config() can report the constructor arguments.
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.dense_proj = keras.Sequential(
        [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()

  def call(self, inputs, mask=None):
    """Apply self-attention and the dense projection to `inputs`.

    Args:
      inputs: Tensor used as both query and value of the attention layer.
      mask: Optional mask; when given, an axis is inserted at position 1
        before it is passed on as `attention_mask`.
        NOTE(review): presumably a 2D (batch, sequence) padding mask coming
        from an upstream layer -- confirm against the caller.

    Returns:
      The output of the second layer normalization.
    """
    if mask is not None:
        # Insert an extra axis so the mask has the rank handed to attention_mask.
        mask=mask[:, tf.newaxis, :]
    # Self-attention: the same tensor is query and value.
    attention_output = self.attention(
        inputs, inputs, attention_mask=mask
    )
    # Residual connection + normalization around the attention sub-layer.
    proj_input = self.layernorm_1(inputs + attention_output)
    proj_output = self.dense_proj(proj_input)
    # Residual connection + normalization around the dense sub-layer.
    return self.layernorm_2(proj_input + proj_output)

  def get_config(self):
    """Return the base layer config extended with the constructor arguments.

    This lets the layer be re-created from a saved model when the class is
    supplied through `custom_objects`.
    """
    config = super().get_config()
    config.update({
        "embed_dim": self.embed_dim,
        "num_heads": self.num_heads,
        "dense_dim": self.dense_dim,
    })
    return config


### Listing 11.22 Using the Transformer encoder for text classification

In [None]:
# Hyperparameters of the classifier.
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32

# Functional-API graph: integer token ids -> embedding -> Transformer encoder
# -> max pooling over the sequence axis -> dropout -> sigmoid output.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(vocab_size, embed_dim)(inputs)
encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(embedded)
pooled = layers.GlobalMaxPooling1D()(encoded)
regularized = layers.Dropout(0.5)(pooled)
outputs = layers.Dense(1, activation="sigmoid")(regularized)
model = keras.Model(inputs, outputs)

model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()


### Listing 11.23 Training and evaluating the Transformer encoder based model

In [None]:
# The checkpoint callback writes to this file during training (save_best_only=True);
# the same file is loaded again below for the final evaluation.
checkpoint_path = "transformer_encoder.keras"
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True),
]
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=20,
    callbacks=callbacks,
)

# Reload the checkpointed model; the custom layer class has to be supplied
# explicitly so that it can be deserialized.
model = keras.models.load_model(
    checkpoint_path,
    custom_objects={"TransformerEncoder": TransformerEncoder},
)
test_acc = model.evaluate(int_test_ds)[1]
print(f"Test acc: {test_acc:.3f}")
