<a href="https://colab.research.google.com/github/enakai00/colab_GenAI_lecture/blob/main/17_Transformer_recipe_generator_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q git+https://github.com/huggingface/transformers.git datasets

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m16.9 MB/s[0m eta [3

In [2]:
# Imports and global configuration: fix every random seed used by the notebook
# before any model is built.
import os, random
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes -- confirm this is intended.
os.environ['PYTHONHASHSEED'] = str(20230629)
# NOTE(review): this seed (20240329) differs from the value used for the other
# seeds in this cell (20230629) -- confirm this is intentional.
random.seed(20240329)

import pickle
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, losses, optimizers

# Seed NumPy and TensorFlow's global random generators.
np.random.seed(20230629)
tf.random.set_seed(20230629)

# Default font size for all matplotlib figures.
plt.rcParams.update({'font.size': 10})

In [3]:
from google.colab import drive

# Mount Google Drive; the training data is read from it and the trained
# models are written back to it by later cells.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive


In [4]:
# Load the pickled recipe texts from the mounted Drive.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# you created yourself or otherwise trust.
recipe_pickle_path = '/content/gdrive/My Drive/recipe_texts.pkl'
with open(recipe_pickle_path, mode='rb') as pickle_file:
    recipe_texts = pickle.load(pickle_file)

In [5]:
from transformers import AutoTokenizer, AutoConfig

# Reuse the tokenizer of a pretrained checkpoint; only its vocabulary is
# needed here, and the vocabulary size is read from the checkpoint's config.
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
checkpoint_config = AutoConfig.from_pretrained(model_ckpt)
VOCAB_SIZE = checkpoint_config.vocab_size

# Display the vocabulary size as the cell output.
VOCAB_SIZE

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

30522

In [6]:
MAX_LEN = 512

# Tokenize every recipe, truncating or padding each one to exactly MAX_LEN ids.
train_set = tokenizer(
    recipe_texts,
    max_length=MAX_LEN,
    truncation=True,
    padding='max_length',
)

# Next-token prediction: the input is the sequence without its last id and the
# label is the same sequence shifted left by one position. Padding ids are
# kept in both arrays.
token_ids = np.array(train_set['input_ids'])
train_text = token_ids[:, :-1]
train_label = token_ids[:, 1:]

In [7]:
class Embeddings(layers.Layer):
    """Token embeddings plus learned position embeddings.

    The two embeddings are summed, layer-normalized and passed through
    dropout.

    Args:
        max_len: Size of the position-embedding table. Position ids are
            generated as 0..seq_len-1, so inputs longer than `max_len`
            index outside the table.
        vocab_size: Size of the token-embedding table.
        embed_dim: Dimension of both embedding vectors.
        **kwargs: Standard `Layer` arguments (e.g. `name`, `trainable`,
            `dtype`). Accepting them is required so that a config produced
            by `get_config()` can be passed back to the constructor.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        # Forward the base-layer arguments; without this, rebuilding the layer
        # from get_config() fails because the config contains `name` etc.
        super().__init__(**kwargs)
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size,
                                          output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=max_len,
                                        output_dim=embed_dim)
        self.dropout = layers.Dropout(rate=0.5)
        self.ln = layers.LayerNormalization(epsilon=1e-12)

    def call(self, inputs):
        """Embed a batch of token ids.

        Args:
            inputs: Integer token ids whose last axis is the sequence axis.

        Returns:
            The normalized, dropout-regularized sum of token and position
            embeddings, with a trailing axis of size `embed_dim`.
        """
        # Positions are derived from the runtime sequence length, so the layer
        # works for any length up to max_len.
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(position_ids)

        token_embeddings = self.token_emb(inputs)
        # Position embeddings broadcast over the leading (batch) axes.
        embeddings = token_embeddings + position_embeddings

        embeddings = self.ln(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [8]:
class TransformerBlock(layers.Layer):
    """Causal self-attention followed by a two-layer feed-forward network.

    Both sub-layers are wrapped with dropout, a skip connection and layer
    normalization (applied after the skip connection).

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        embed_dim: Size of the block's input and output feature axis.
        ff_dim: Hidden size of the feed-forward network.
        **kwargs: Standard `Layer` arguments (e.g. `name`, `trainable`,
            `dtype`). Accepting them is required so that a config produced
            by `get_config()` can be passed back to the constructor.
    """

    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, **kwargs):
        # Forward the base-layer arguments; without this, rebuilding the layer
        # from get_config() fails because the config contains `name` etc.
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim
        )
        self.dropout_1 = layers.Dropout(rate=0.1)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation='relu')
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(rate=0.1)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Build a mask that hides future positions from each query position.

        Args:
            batch_size: Scalar int32 tensor, number of sequences in the batch.
            n_dest: Number of query (destination) positions.
            n_src: Number of key (source) positions.
            dtype: dtype the mask is cast to.

        Returns:
            A tensor of shape [batch_size, n_dest, n_src] whose element
            (i, j) is true when `i >= j - n_src + n_dest`; for
            n_dest == n_src this is the lower triangle (j <= i).
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        # Repeat the single [1, n_dest, n_src] mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, inputs):
        """Apply the block to a batch of sequences.

        Args:
            inputs: Tensor whose first axis is the batch and second axis is
                the sequence.

        Returns:
            A tuple `(output, attention_scores)`: the block output and the
            attention scores returned by the attention layer.
        """
        batch_size = tf.shape(inputs)[0]
        seq_len = tf.shape(inputs)[1]
        causal_mask = self.causal_attention_mask(
            batch_size, seq_len, seq_len, tf.bool
        )
        # Self-attention: the inputs serve as both query and value.
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True,
        )
        attention_output = self.dropout_1(attention_output)
        attention_output = attention_output + inputs  # Skip connection
        attention_output = self.ln_1(attention_output)

        ffn_1 = self.ffn_1(attention_output)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)

        ffn_output = ffn_output + attention_output  # Skip connection
        ffn_output = self.ln_2(ffn_output)
        return (ffn_output, attention_scores)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
            }
        )
        return config

In [9]:
# Model hyperparameters.
EMBEDDING_DIM = 512
N_HEADS = 8
KEY_DIM = EMBEDDING_DIM // N_HEADS
FEED_FORWARD_DIM = 2048

# Variable-length sequences of integer token ids.
text_inputs = layers.Input(shape=(None,), dtype=tf.int32)

# Embeddings -> one Transformer block -> per-position distribution over the
# vocabulary. To stack a second block, apply another TransformerBlock to the
# first block's output and append its attention scores to the outputs of
# attention_scores_model.
embedded_tokens = Embeddings(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(text_inputs)
first_block = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM)
block_output, attention_scores1 = first_block(embedded_tokens)
output = layers.Dense(VOCAB_SIZE, activation='softmax')(block_output)

# Model for training and prediction
transformer_model = models.Model(inputs=text_inputs, outputs=output)

# Model for inference including attention scores; it shares its layers (and
# therefore its weights) with transformer_model.
attention_scores_model = models.Model(
    inputs=text_inputs,
    outputs=[output, attention_scores1],
)

transformer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embeddings (Embeddings)     (None, None, 512)         15890432  
                                                                 
 transformer_block (Transfor  ((None, None, 512),      3152384   
 merBlock)                    (None, 8, None, None))             
                                                                 
 dense_2 (Dense)             (None, None, 30522)       15657786  
                                                                 
Total params: 34,700,602
Trainable params: 34,700,602
Non-trainable params: 0
_________________________________________________________________


In [10]:
# The output layer already applies softmax, so the loss is configured with its
# defaults and receives probabilities; labels are integer token ids.
transformer_model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=['acc'],
)

In [11]:
%%time
history = transformer_model.fit(train_text, train_label,
                                batch_size=8, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 12min 52s, sys: 29.6 s, total: 13min 21s
Wall time: 26min 24s


In [12]:
# Persist both models to Google Drive so they can be reloaded in a later session.
generator_save_path = '/content/gdrive/My Drive/Transformer_recipe_generator'
attention_scores_save_path = '/content/gdrive/My Drive/Transformer_recipe_generator_attention_scores'

transformer_model.save(generator_save_path)
attention_scores_model.save(attention_scores_save_path)



In [13]:
# Recursively list both save directories (human-readable sizes) to confirm
# that the model files were written.
!ls -lhR '/content/gdrive/My Drive/Transformer_recipe_generator'
!ls -lhR '/content/gdrive/My Drive/Transformer_recipe_generator_attention_scores'

'/content/gdrive/My Drive/Transformer_recipe_generator':
total 577K
drwx------ 2 root root 4.0K Jul  4 01:35 assets
-rw------- 1 root root   58 Jul  4 02:11 fingerprint.pb
-rw------- 1 root root  22K Jul  4 02:11 keras_metadata.pb
-rw------- 1 root root 547K Jul  4 02:11 saved_model.pb
drwx------ 2 root root 4.0K Jul  4 02:11 variables

'/content/gdrive/My Drive/Transformer_recipe_generator/assets':
total 0

'/content/gdrive/My Drive/Transformer_recipe_generator/variables':
total 398M
-rw------- 1 root root 398M Jul  4 02:11 variables.data-00000-of-00001
-rw------- 1 root root 4.4K Jul  4 02:11 variables.index
'/content/gdrive/My Drive/Transformer_recipe_generator_attention_scores':
total 456K
drwx------ 2 root root 4.0K Jul  4 01:35 assets
-rw------- 1 root root   55 Jul  4 02:11 fingerprint.pb
-rw------- 1 root root  21K Jul  4 02:11 keras_metadata.pb
-rw------- 1 root root 427K Jul  4 02:11 saved_model.pb
drwx------ 2 root root 4.0K Jul  4 02:11 variables

'/content/gdrive/My Drive/