# Transformer recipe generator training

Note: This notebook is designed to run with GPU runtime.

Install Huggingface libraries to use the pretrained tokenizer and the recipe dataset.

**You can ignore error messages like `ERROR: pip's dependency resolver does not currently take into account...`.**

In [1]:
pip install -qU git+https://github.com/huggingface/transformers.git datasets

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-

Import modules and set random seeds.

In [2]:
import os, random
import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models, saving

# One seed shared by Python's `random`, NumPy and TensorFlow.
SEED = 20230629
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Default font size for matplotlib figures.
plt.rcParams['font.size'] = 10

Download the pretrained tokenizer and check the vocabulary size.

In [5]:
from transformers import AutoConfig, AutoTokenizer

# Pretrained checkpoint that supplies both the tokenizer and the vocabulary size.
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# The vocabulary size is read from the checkpoint's configuration.
model_config = AutoConfig.from_pretrained(model_ckpt)
VOCAB_SIZE = model_config.vocab_size

print(f'Vocabulary size: {VOCAB_SIZE}')

Vocabulary size: 30522


Download the recipe dataset and extract directions texts.

In [6]:
from datasets import load_dataset
# Load the 'Shengtao/recipe' dataset through the Hugging Face `datasets`
# library. Only the 'title' and 'directions' columns of its 'train' split
# are read further down in this cell.
recipe = load_dataset('Shengtao/recipe')

def join_title_and_directions(title_directions):
    """Render one recipe as a single training text.

    Args:
        title_directions: An iterable of exactly two items, the recipe
            title followed by its directions.

    Returns:
        The string 'Recipe for <title>: <directions>'.

    Raises:
        ValueError: If `title_directions` does not unpack into two items.
    """
    (title, directions) = title_directions
    recipe_text = f'Recipe for {title}: {directions}'
    return recipe_text

# Pair every title with its directions, render each pair as one text, then
# keep every fourth text (25% of the entire training set).
recipe_texts = [
    join_title_and_directions(title_and_directions)
    for title_and_directions in zip(recipe['train']['title'],
                                    recipe['train']['directions'])
][::4]

Create the training and test datasets, truncating long texts to at most 128 tokens (`MAX_LEN`) and padding shorter ones to the same length.

In [None]:
MAX_LEN = 128


def _tokenize_for_next_token(texts, max_len):
    """Tokenize `texts` and derive next-token-prediction inputs and labels.

    Every text is truncated or padded to exactly `max_len` token ids. The
    model input is each sequence without its last id, and the label is the
    same sequence shifted left by one position.

    Args:
        texts: List of strings to tokenize with the notebook's `tokenizer`.
        max_len: Fixed length of every tokenized sequence.

    Returns:
        A tuple `(encoded, inputs, labels)` where `encoded` is the raw
        tokenizer output and `inputs` / `labels` are integer arrays with
        `max_len - 1` columns (both are views of one shared array).
    """
    encoded = tokenizer(texts, max_length=max_len,
                        padding='max_length', truncation=True)
    # Convert the ids once and slice twice instead of converting twice.
    token_ids = np.array(encoded['input_ids'])
    return encoded, token_ids[:, :-1], token_ids[:, 1:]


# No `random_state` is passed, so the split follows NumPy's global RNG that
# was seeded when the modules were imported; re-running only this cell
# produces a different split.
train_set, test_set = train_test_split(recipe_texts, test_size=0.1)

# `train_set` / `test_set` end up holding the tokenizer output, as before.
train_set, train_text, train_label = _tokenize_for_next_token(train_set, MAX_LEN)
test_set, test_text, test_label = _tokenize_for_next_token(test_set, MAX_LEN)

Define the positional embedding layer.

In [None]:
@saving.register_keras_serializable()
class Embeddings(layers.Layer):
    """Token-plus-position embedding layer.

    Looks up a learned vector for every token id and for every position
    index, adds the two, then applies layer normalization and dropout.

    Args:
        max_len: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embedding tables.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so that get_config() can report them.
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Sublayers
        self.token_emb = layers.Embedding(vocab_size, embed_dim)
        self.pos_emb = layers.Embedding(max_len, embed_dim)
        self.ln = layers.LayerNormalization(epsilon=1e-12)
        self.dropout = layers.Dropout(rate=0.5)

    def call(self, inputs):
        """Embed token ids; the last axis of `inputs` is the sequence axis."""
        # One position index (0, 1, 2, ...) per element of the last axis.
        positions = tf.range(tf.shape(inputs)[-1])
        position_vectors = self.pos_emb(positions)
        token_vectors = self.token_emb(inputs)

        # The position vectors broadcast over the leading axes of the
        # token vectors.
        summed = token_vectors + position_vectors
        return self.dropout(self.ln(summed))

    def build(self, input_shape):
        """No-op: all sublayers are already instantiated in `__init__`."""

    def get_config(self):
        """Return the base layer config plus the constructor arguments."""
        return {
            **super().get_config(),
            'max_len': self.max_len,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
        }

Define the Transformer encoder block. Its self-attention uses a causal mask.

In [None]:
@saving.register_keras_serializable()
class TransformerBlock(layers.Layer):
    """Single Transformer block with causal self-attention.

    Applies multi-head self-attention and a two-layer feed-forward network.
    Each sub-block is followed by dropout, a skip connection and layer
    normalization.

    Args:
        num_heads: Number of attention heads.
        key_dim: `key_dim` passed to `layers.MultiHeadAttention`.
        embed_dim: Width of the attention output and of the block output.
        ff_dim: Width of the hidden feed-forward layer.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so that get_config() can report them.
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        # Attention sub-block
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim)
        self.dropout_1 = layers.Dropout(rate=0.1)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        # Feed-forward sub-block
        self.ffn_1 = layers.Dense(ff_dim, activation='relu')
        self.ffn_2 = layers.Dense(embed_dim)
        self.dropout_2 = layers.Dropout(rate=0.1)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Return `(block_output, attention_scores)` for `inputs`."""
        # Self-attention: the same tensor serves as query, value and key,
        # and the causal mask is enabled.
        attn_out, attention_scores = self.attn(
            inputs, inputs, inputs,
            use_causal_mask=True,
            return_attention_scores=True)
        # Dropout, skip connection, then layer normalization.
        attn_out = self.ln_1(self.dropout_1(attn_out) + inputs)

        # Feed-forward network with the same dropout / skip / norm pattern.
        ffn_out = self.dropout_2(self.ffn_2(self.ffn_1(attn_out)))
        ffn_out = self.ln_2(ffn_out + attn_out)
        return ffn_out, attention_scores

    def build(self, input_shape):
        """No-op: all sublayers are already instantiated in `__init__`."""

    def get_config(self):
        """Return the base layer config plus the constructor arguments."""
        return {
            **super().get_config(),
            "key_dim": self.key_dim,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
        }

Define the Transformer model consisting of a single Transformer encoder block.

In [None]:
# Model hyperparameters.
EMBEDDING_DIM = 512
N_HEADS = 4
# Per-head size chosen so that N_HEADS * KEY_DIM == EMBEDDING_DIM.
KEY_DIM = EMBEDDING_DIM // N_HEADS
FEED_FORWARD_DIM = 2048

# Variable-length sequences of int32 token ids.
text_inputs = layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
# Token + position embeddings.
x = Embeddings(
    MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM, name='text_embedding')(text_inputs)
# The block returns its output together with the attention scores.
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM, name='Transformer')(x)
# Softmax over the vocabulary, applied to the block output.
output = layers.Dense(VOCAB_SIZE, activation='softmax', name='softmax')(x)

# Model for training and prediction
transformer_model = models.Model(
    inputs=text_inputs, outputs=output,
    name='Transformer_next_word_predictor')

# Model for inference including attention scores
# It is built from the same input and output tensors as `transformer_model`,
# so both models share the same layers.
attention_scores_model = models.Model(
    inputs=text_inputs,
    outputs=[output, attention_scores],
    name='Transformer_attention_scores')

transformer_model.summary()

Compile the model using the Adam optimizer and sparse categorical crossentropy as the loss function.

In [None]:
# Labels are integer token ids, hence the sparse variant of the loss.
# NOTE(review): the inputs are padded to a fixed length and no mask or sample
# weights are passed, so padded positions appear to count toward both the
# loss and the 'acc' metric -- confirm that this is intended.
transformer_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

Train the model.

In [None]:
%%time
history = transformer_model.fit(train_text, train_label,
                                validation_data=(test_text, test_label),
                                batch_size=8, epochs=10)

Epoch 1/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 56ms/step - acc: 0.3965 - loss: 3.5751 - val_acc: 0.5369 - val_loss: 2.2784
Epoch 2/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 48ms/step - acc: 0.5330 - loss: 2.2337 - val_acc: 0.5721 - val_loss: 2.0532
Epoch 3/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 47ms/step - acc: 0.5637 - loss: 1.9997 - val_acc: 0.5854 - val_loss: 1.9524
Epoch 4/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 47ms/step - acc: 0.5805 - loss: 1.8706 - val_acc: 0.5959 - val_loss: 1.9001
Epoch 5/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 47ms/step - acc: 0.5919 - loss: 1.7796 - val_acc: 0.6008 - val_loss: 1.8692
Epoch 6/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 47ms/step - acc: 0.6011 - loss: 1.7133 - val_acc: 0.6035 - val_loss: 1.8530
Epoch 7/10
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0

Mount google drive and save the trained model.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

model_file = '/content/gdrive/My Drive/Transformer_recipe_generator.keras'
transformer_model.save(model_file)
!ls -lh '{model_file}'

model_file = '/content/gdrive/My Drive/Transformer_attention_scores.keras'
attention_scores_model.save(model_file)
!ls -lh '{model_file}'

Mounted at /content/gdrive
-rw------- 1 root root 395M Feb  4 23:25 '/content/gdrive/My Drive/Transformer_recipe_generator.keras'
-rw------- 1 root root 132M Feb  4 23:26 '/content/gdrive/My Drive/Transformer_attention_scores.keras'
