In [55]:
import tensorflow as tf
# tf.config.run_functions_eagerly(True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn
import datetime, pathlib, io, os, time, random, re, string
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense, Flatten, InputLayer, BatchNormalization, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dropout, Conv1D, MultiHeadAttention, LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorboard.plugins import projector
import traceback

# Dataset

In [2]:
# !wget https://www.manythings.org/anki/fra-eng.zip

# Data Processing

In [3]:
text_dataset = tf.data.TextLineDataset("localdata/Section7/fra.txt")
text_dataset

2023-10-26 12:13:13.484468: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-10-26 12:13:13.484496: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-10-26 12:13:13.484500: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-10-26 12:13:13.484540: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-26 12:13:13.484556: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [4]:
for i in text_dataset.take(3):
    print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [5]:
VOCAB_SIZE = 20000
ENGLISH_SEQUENCE_LENGTH = 64
FRENCH_SEQUENCE_LENGTH = 64
EMBEDDING_DIM = 256
BATCH_SIZE = 64

In [6]:
english_vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

In [7]:
french_vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [8]:
def selector(input_text):
    split_text = tf.strings.split(input_text, '\t')
    return { 'input_1': split_text[0:1], 'input_2': 'starttoken ' + split_text[1:2] },  split_text[1:2] + ' endtoken'

In [9]:
split_dataset = text_dataset.map(selector)
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [10]:
for i in split_dataset.take(3):
    print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [11]:
def seperator(input_text):
    split_text = tf.strings.split(input_text, '\t')
    return split_text[0:1], 'starttoken ' + split_text[1:2] + ' endtoken'

In [12]:
init_dataset = text_dataset.map(seperator)
init_dataset

<_MapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [13]:
for i in init_dataset.take(3):
    print(i)

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche. endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route ! endtoken'], dtype=object)>)


### vocabulary

In [14]:
english_training_data = init_dataset.map(lambda x, y: x)

In [15]:
for i in english_training_data.take(1):
    print(i)

tf.Tensor([b'Go.'], shape=(1,), dtype=string)


In [16]:
english_vectorize_layer.adapt(english_training_data)

2023-10-26 12:13:13.788403: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [17]:
french_training_data = init_dataset.map(lambda x, y: y)
french_vectorize_layer.adapt(french_training_data)

### tokenization

In [18]:
def vectorizer(inputs, output):
    return {
        'input_1': english_vectorize_layer(inputs['input_1']),
        'input_2': french_vectorize_layer(inputs['input_2']) }, french_vectorize_layer(output)

In [19]:
dataset = split_dataset.map(vectorizer)
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [20]:
french_vectorize_layer.get_vocabulary()[3]

'endtoken'

In [21]:
for i in dataset.take(3):
    print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[44,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 103,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[103,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

### shuffle and make as batch

In [22]:
dataset = dataset.shuffle(2038)
print(dataset)

<_ShuffleDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>


In [23]:
dataset = dataset.unbatch()
dataset

<_UnbatchDataset element_spec=({'input_1': TensorSpec(shape=(64,), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(64,), dtype=tf.int64, name=None)}, TensorSpec(shape=(64,), dtype=tf.int64, name=None))>

In [24]:
dataset = dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [25]:
NUM_BATCHES = int(200000 / BATCH_SIZE)
NUM_BATCHES

3125

In [26]:
train_dataset = dataset.take(int(0.9 * NUM_BATCHES))
val_dataset = dataset.skip(int(0.9 * NUM_BATCHES))
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [27]:
for i in train_dataset.take(1):
    print(i)

({'input_1': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[ 418,  216,    0, ...,    0,    0,    0],
       [ 823,   15,    0, ...,    0,    0,    0],
       [   2,  144, 1696, ...,    0,    0,    0],
       ...,
       [3121,   11,    0, ...,    0,    0,    0],
       [  93,  361,    0, ...,    0,    0,    0],
       [1535,    0,    0, ...,    0,    0,    0]])>, 'input_2': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[    2,     1,     0, ...,     0,     0,     0],
       [    2, 10875,     0, ...,     0,     0,     0],
       [    2,     4,    25, ...,     0,     0,     0],
       ...,
       [    2,  9080,    17, ...,     0,     0,     0],
       [    2,   667,     9, ...,     0,     0,     0],
       [    2, 15837,     0, ...,     0,     0,     0]])>}, <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[    1,     3,     0, ...,     0,     0,     0],
       [10875,     3,     0, ...,     0,     0,     0],
       [    4,    25,  1673, ...,     0,     0,     0

# Model

### Positional Encoding

In [28]:
def positional_encoding(model_size, SEQUENCE_LENGTH):
    output = []
    for pos in range(SEQUENCE_LENGTH):
        PE = np.zeros((model_size))
        for i in range(model_size):
            if i % 2 == 0:
                PE[i] = np.sin(pos / (10000 ** (i/model_size)))
            else:
                PE[i] = np.cos(pos / (10000 ** ((i - 1)/model_size)))
        output.append(tf.expand_dims(PE, axis=0))
    out = tf.concat(output, axis=0)
    # print("1.out.shape=",out.shape)
    out = tf.expand_dims(out, axis=0)
    # print("2.out.shape=",out.shape) # 1(batch dimenstion) SEQUENCE EMBEDDING
    return tf.cast(out, dtype=tf.float32)

In [29]:
positional_encoding(EMBEDDING_DIM, FRENCH_SEQUENCE_LENGTH)

<tf.Tensor: shape=(1, 64, 256), dtype=float32, numpy=
array([[[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
          1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
        [ 8.41470957e-01,  5.40302277e-01,  8.01961780e-01, ...,
          1.00000000e+00,  1.07460786e-04,  1.00000000e+00],
        [ 9.09297407e-01, -4.16146845e-01,  9.58144367e-01, ...,
          1.00000000e+00,  2.14921558e-04,  1.00000000e+00],
        ...,
        [-9.66117799e-01, -2.58101642e-01,  2.14545757e-01, ...,
          9.99975204e-01,  6.55506086e-03,  9.99978542e-01],
        [-7.39180684e-01,  6.73507154e-01,  9.11451578e-01, ...,
          9.99974370e-01,  6.66251918e-03,  9.99977827e-01],
        [ 1.67355701e-01,  9.85896587e-01,  8.74411643e-01, ...,
          9.99973536e-01,  6.76997751e-03,  9.99977112e-01]]],
      dtype=float32)>

In [30]:
class AddPositionEncoding(tf.keras.layers.Layer):
    def __init__(self, model_size, sequence_length):
        super().__init__()
        self.model_size = model_size
        self.sequence_length = sequence_length
        self.positional_encoding = positional_encoding(model_size, sequence_length)

    def call(self, input):
        return input + self.positional_encoding

In [31]:
AddPositionEncoding(EMBEDDING_DIM, FRENCH_SEQUENCE_LENGTH)(tf.zeros(
    [BATCH_SIZE, FRENCH_SEQUENCE_LENGTH, EMBEDDING_DIM]
))

<tf.Tensor: shape=(64, 64, 256), dtype=float32, numpy=
array([[[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
          1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
        [ 8.41470957e-01,  5.40302277e-01,  8.01961780e-01, ...,
          1.00000000e+00,  1.07460786e-04,  1.00000000e+00],
        [ 9.09297407e-01, -4.16146845e-01,  9.58144367e-01, ...,
          1.00000000e+00,  2.14921558e-04,  1.00000000e+00],
        ...,
        [-9.66117799e-01, -2.58101642e-01,  2.14545757e-01, ...,
          9.99975204e-01,  6.55506086e-03,  9.99978542e-01],
        [-7.39180684e-01,  6.73507154e-01,  9.11451578e-01, ...,
          9.99974370e-01,  6.66251918e-03,  9.99977827e-01],
        [ 1.67355701e-01,  9.85896587e-01,  8.74411643e-01, ...,
          9.99973536e-01,  6.76997751e-03,  9.99977112e-01]],

       [[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
          1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
        [ 8.41470957e-01,  5.40302277e-01,  8.01

### Embeddings

In [41]:
class Embeddings(tf.keras.layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super().__init__()
        self.token_embeddings = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embedded_position = positional_encoding(self.embed_dim, self.sequence_length)

    def call(self, inputs):
        embedded_tokens = self.token_embeddings(inputs)
        return embedded_tokens + self.embedded_position

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

In [63]:
test_input = tf.constant([[2,4,7,21,3,5,0,0]])
emb = Embeddings(8, 20000, 256)
emb_out = emb(test_input)
print(emb_out.shape)

(1, 8, 256)


In [64]:
enc_mask = emb.compute_mask(test_input)
print(enc_mask)

tf.Tensor([[ True  True  True  True  True  True False False]], shape=(1, 8), dtype=bool)


### Encoder

In [65]:
class TransfomerEncoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = tf.keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(1)
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        print("1.mask=", mask)
        padding_mask = None
        if mask is not None:
            mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            # print("2.mask=", mask)
            T = tf.shape(mask)[2]
            padding_mask = tf.repeat(mask, T, axis=1)
            # print("padding_mask=", padding_mask)
        attention_output = self.attention(
            query=inputs, key=inputs, value=inputs,
            attention_mask=padding_mask
        )

        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)    

In [66]:
encoder_output = TransfomerEncoder(256, 2048, 2)(emb_out)
print(encoder_output.shape)

1.mask= tf.Tensor([[ True  True  True  True  True  True False False]], shape=(1, 8), dtype=bool)
(1, 8, 256)


In [74]:
class TransfomerDecoder(Layer):
    def __init__(self, embed_dim, latent_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = tf.keras.Sequential([
            Dense(latent_dim, activation="relu"), Dense(embed_dim)
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()

    def call(self, inputs, encoder_outputs, enc_mask, mask=None):
        print(mask)
        padding_mask = None
        cross_attn_mask = None
        combined_mask = None
        if mask is not None:
            causal_mask = tf.linalg.band_part(
                tf.ones([tf.shape(inputs)[0],
                         tf.shape(inputs)[1],
                         tf.shape(inputs)[1]], dtype=tf.int32), -1, 0)
            mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            enc_mask = tf.cast(enc_mask[:, tf.newaxis, :], dtype="int32")

            T = tf.shape(mask)[2]
            padding_mask = tf.repeat(mask, T, axis=1)
            cross_attn_mask = tf.repeat(enc_mask, T, axis=1)
            combined_mask = tf.minimum(padding_mask, causal_mask)
            print('padding=', padding_mask)
            print('causal=', causal_mask)
            print('cross_attn=', cross_attn_mask)
            print('combined=', combined_mask)
        attention_output_1 = self.attention_1(
            query=inputs, key=inputs, value=inputs,
            attention_mask=combined_mask
        )

        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1, key=encoder_outputs, value=encoder_outputs,
            attention_mask=cross_attn_mask
        )

        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)
        

In [75]:
dec_mask=enc_mask
decoder_output = TransfomerDecoder(256, 2048, 4)(emb_out, encoder_output, enc_mask)

tf.Tensor([[ True  True  True  True  True  True False False]], shape=(1, 8), dtype=bool)
padding= Tensor("transfomer_decoder_11/Repeat/Reshape_1:0", shape=(1, 8, 8), dtype=int32)
causal= Tensor("transfomer_decoder_11/MatrixBandPart:0", shape=(1, 8, 8), dtype=int32)
cross_attn= Tensor("transfomer_decoder_11/Repeat_1/Reshape_1:0", shape=(None, 8, None), dtype=int32)
combined= Tensor("transfomer_decoder_11/Minimum:0", shape=(1, 8, 8), dtype=int32)


# Transformer Model

In [76]:
EMBEDDING_DIM=256
D_FF=2048
NUM_HEADS=8
NUM_LAYERS=2
NUM_EPOCH=2

In [77]:
encoder_inputs = Input(shape=(None,), dtype="int64", name="input_1")
emb = Embeddings(ENGLISH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)
x = emb(encoder_inputs)
enc_mask = emb.compute_mask(encoder_inputs)

for _ in range(NUM_LAYERS):
    x = TransfomerEncoder(EMBEDDING_DIM, D_FF, NUM_HEADS)(x)
encoder_outputs = x

decoder_inputs = Input(shape=(None,), dtype="int64", name="input_2")

x = Embeddings(FRENCH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)(decoder_inputs)
for i in range(NUM_LAYERS):
    x = TransfomerDecoder(EMBEDDING_DIM, D_FF, NUM_HEADS)(x, encoder_outputs, enc_mask)

x = tf.keras.layers.Dropout(0.5)(x)
decoder_outputs = Dense(VOCAB_SIZE, activation="softmax")(x)

transformer = tf.keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)
transformer.summary()

1.mask= Tensor("Placeholder_1:0", shape=(None, None), dtype=bool)
1.mask= Tensor("Placeholder_1:0", shape=(None, None), dtype=bool)
Tensor("Placeholder_3:0", shape=(None, None), dtype=bool)
padding= Tensor("transfomer_decoder_12/Repeat/Reshape_1:0", shape=(None, None, None), dtype=int32)
causal= Tensor("transfomer_decoder_12/MatrixBandPart:0", shape=(None, 64, 64), dtype=int32)
cross_attn= Tensor("transfomer_decoder_12/Repeat_1/Reshape_1:0", shape=(None, None, None), dtype=int32)
combined= Tensor("transfomer_decoder_12/Minimum:0", shape=(None, 64, 64), dtype=int32)
None
Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embed

# Train

In [78]:
class Scheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warming_steps):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float64)
        self.warming_steps = tf.cast(warming_steps, dtype=tf.float64)

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float64)
        return (self.d_model**(-0.5)) * \
            tf.math.minimum(step**(-0.5), step * (self.warming_steps ** (-1.5)))

In [79]:
WARM_UP_STEPS=4000
lr_scheduler = Scheduler(EMBEDDING_DIM, WARM_UP_STEPS)

In [80]:
transformer.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=Adam(lr_scheduler, beta_1=0.9, beta_2=0.98, epsilon=1e9),)



In [81]:
history = transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2)

Epoch 1/2
1.mask= Tensor("transformer/embeddings_15/NotEqual:0", shape=(None, 64), dtype=bool)
1.mask= Tensor("transformer/embeddings_15/NotEqual:0", shape=(None, 64), dtype=bool)
Tensor("transformer/embeddings_16/NotEqual:0", shape=(None, 64), dtype=bool)
padding= Tensor("transformer/transfomer_decoder_12/Repeat/Reshape_1:0", shape=(None, 64, 64), dtype=int32)
causal= Tensor("transformer/transfomer_decoder_12/MatrixBandPart:0", shape=(None, 64, 64), dtype=int32)
cross_attn= Tensor("transformer/transfomer_decoder_12/Repeat_1/Reshape_1:0", shape=(None, 64, 64), dtype=int32)
combined= Tensor("transformer/transfomer_decoder_12/Minimum:0", shape=(None, 64, 64), dtype=int32)
None
1.mask= Tensor("transformer/embeddings_15/NotEqual:0", shape=(None, 64), dtype=bool)
1.mask= Tensor("transformer/embeddings_15/NotEqual:0", shape=(None, 64), dtype=bool)
Tensor("transformer/embeddings_16/NotEqual:0", shape=(None, 64), dtype=bool)
padding= Tensor("transformer/transfomer_decoder_12/Repeat/Reshape_1:0

KeyboardInterrupt: 