# Model 14: Transformers – Step-by-Step (Text Classification)
This notebook builds a **tiny Transformer encoder** for binary text classification.

You will learn:
- Tokenization with TextVectorization
- Self-Attention (MultiHeadAttention)
- Transformer encoder block
- Train and evaluate


In [None]:
# If TensorFlow is missing, uncomment (%pip installs into this kernel's environment):
# %pip install -q tensorflow

import tensorflow as tf
from tensorflow.keras import layers, Model

# Report which TensorFlow build this kernel picked up.
tf_version = tf.__version__
print('TensorFlow version:', tf_version)

## 1) Small example dataset

In [None]:
# Toy sentiment corpus kept as (sentence, label) pairs so each text sits next
# to its target; the upbeat sentences carry label 1 and the negative ones 0.
labelled_examples = [
    ('i love this movie', 1),
    ('this film is amazing', 1),
    ('terrible movie i hate it', 0),
    ('worst film ever', 0),
]
texts = [sentence for sentence, _ in labelled_examples]
labels = [label for _, label in labelled_examples]
y = tf.constant(labels)

## 2) Vectorize text

In [None]:
# Learn the vocabulary from the toy corpus, then encode every sentence as a
# row of integer token ids with a fixed length of 20.
vectorizer = layers.TextVectorization(
    output_sequence_length=20,
    max_tokens=2000,
)
vectorizer.adapt(texts)
X = vectorizer(texts)
print(X.shape)

## 3) Transformer block

In [None]:
class TransformerBlock(layers.Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization (normalization is applied *after* the residual add).

    Args:
        embed_dim: Width of the last axis of the input. It is also used as the
            output width of the feed-forward network and as ``key_dim`` of the
            attention layer (i.e. every head works with ``embed_dim``-sized
            query/key projections, not ``embed_dim // num_heads``).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the output of each sub-layer.
        **kwargs: Standard ``Layer`` keyword arguments such as ``name`` or
            ``dtype``; forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim),  # project back so the residual add lines up
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to ``inputs`` and return a tensor of the same shape.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim`` (required by
                the residual additions below).
            training: Whether dropout should be active.
        """
        # Self-attention: the sequence is both query and value (and key).
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward network with its own residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Needed for saving/cloning a model that contains this layer; when
        loading, pass the class via ``custom_objects``.
        """
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

## 4) Build model

In [None]:
# Hyperparameters for the tiny encoder.
embed_dim = 32     # width of each token embedding
num_heads = 2      # attention heads in the Transformer block
ff_dim = 64        # hidden width of the block's feed-forward network
maxlen = 20        # tokens per example; must match the vectorizer's output length
vocab_size = 2000  # embedding rows; must cover every id the vectorizer can emit

# Fail fast if these constants drift from the vectorizer configured earlier,
# rather than hitting a shape error or an out-of-range embedding lookup later.
assert X.shape[1] == maxlen, 'maxlen must equal the vectorizer output_sequence_length'
assert vectorizer.vocabulary_size() <= vocab_size, 'vocab_size is smaller than the vectorizer vocabulary'

# Seed Python, NumPy and TensorFlow before any weights are created so that
# re-running the notebook gives repeatable results (on the same setup).
tf.keras.utils.set_random_seed(42)

# Token ids are integers, so declare the input dtype explicitly.
inputs = layers.Input(shape=(maxlen,), dtype='int64')
x = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
# Average over the sequence axis to get one vector per example.
x = layers.GlobalAveragePooling1D()(x)
# Single sigmoid unit: a score in (0, 1) for the binary label.
outputs = layers.Dense(1, activation='sigmoid')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

## 5) Train

In [None]:
# Fit silently (verbose=0) on the whole toy dataset for a fixed epoch count.
n_epochs = 30
model.fit(x=X, y=y, epochs=n_epochs, verbose=0)
print('Training complete')

## 6) Test

In [None]:
# Score two unseen sentences with the trained model, reusing the vectorizer
# that was adapted on the training texts.
test_texts = ['i really enjoyed this film', 'this was awful']
X_test = vectorizer(test_texts)
preds = model.predict(X_test)
for t, p in zip(test_texts, preds):
    # Each row of `preds` is a length-1 array (the model ends in Dense(1)).
    # Index the element explicitly: calling float() on the array itself relies
    # on ndim>0 array-to-scalar conversion, which NumPy has deprecated.
    print(t, '->', float(p[0]))