<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/NMT_with_attention_google.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [449]:
import os,warnings,sys
if "google.colab" in sys.modules:
    !pip3 install -q -U "tensorflow-text==2.13.0"
    !pip3 install -q -U einops
warnings.filterwarnings("ignore")
from IPython.display import clear_output
os.environ["TF_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
import typing
from zipfile import ZipFile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import einops
pio.templates.default = "plotly_dark"
import numpy as np
np.set_printoptions(precision=2)
tf.get_logger().setLevel("ERROR")
%xmode Minimal
if "google.colab" in sys.modules:
    clear_output()

In [450]:

class ShapeCheck():

    def __init__(self):

        self.shapes = {}

    def __call__(self,tensor,names,broadcast=False):

        parsed = einops.parse_shape(tensor,names)

        for name,new_dim in parsed.items():

            old_dim = self.shapes.get(name,None)

            if broadcast and (new_dim == 1):
                continue

            if old_dim is None:

                self.shapes[name] = new_dim
                continue

            if new_dim != old_dim:

                raise ValueError(f"SHAPE MISTMATCH FOR DIMENSION: '{name}' FOUND: {new_dim} EXPECTED: {old_dim}")

In [451]:
url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

In [452]:
file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url,extract=True)

with ZipFile(file_path,"r") as f:

    f.extractall("spa-eng")

with open("spa-eng/spa-eng/spa.txt","r") as f:

    total_text = f.read()
    total_text = [line.split("\t") for line in total_text.splitlines()]
    en_text,es_text = zip(*total_text)

In [453]:
en_text[-1]

'If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.'

In [454]:
es_text[-1]

'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'

In [455]:
en_array = np.array(en_text)
es_array = np.array(es_text)

In [456]:
is_train = np.random.uniform(size=(len(en_array),)) < 0.8

raw_train = (
    tf.data.Dataset
    .from_tensor_slices((en_array[is_train],es_array[is_train]))
    .shuffle(len(en_text))
    .batch(64)
)
raw_valid = (
    tf.data.Dataset
    .from_tensor_slices((en_array[~is_train],es_array[~is_train]))
    .shuffle(len(en_text))
    .batch(64)
)

In [457]:
for en,es in raw_train.take(1):
    print(en[:4])
    print("translates to latin as ")
    print(es[:4])

tf.Tensor(
[b"Maybe Tom didn't have time."
 b'He came from one of the richest families in America.'
 b'Tom criticized Mary in front of everyone.'
 b"I'm looking forward to seeing you next Sunday."], shape=(4,), dtype=string)
translates to latin as 
tf.Tensor(
[b'Puede que Tom no tuviera tiempo.'
 b'Vino de una de las familias m\xc3\xa1s ricas de Am\xc3\xa9rica.'
 b'Tom critic\xc3\xb3 a Mary delante de todo el mundo.'
 b'Espero con ganas a verte el pr\xc3\xb3ximo domingo.'], shape=(4,), dtype=string)


# Standardize Text

In [458]:
es_text[:10]

('Ve.',
 'Vete.',
 'Vaya.',
 'Váyase.',
 'Hola.',
 '¡Corre!',
 'Corred.',
 '¿Quién?',
 '¡Fuego!',
 '¡Incendio!')

In [459]:
tf.constant(es_text[:10]) # converting to tensor

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'V\xc3\xa1yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQui\xc3\xa9n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [460]:
temp_text = text.normalize_utf8(es_text[:10],"NFKD") # Normalizing text so that it can be used in operations
temp_text

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'Va\xcc\x81yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQuie\xcc\x81n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [461]:
temp_text_1 = tf.strings.lower(temp_text) # Lower casing all the characters
temp_text_1

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'va\xcc\x81yase.', b'hola.',
       b'\xc2\xa1corre!', b'corred.', b'\xc2\xbfquie\xcc\x81n?',
       b'\xc2\xa1fuego!', b'\xc2\xa1incendio!'], dtype=object)>

In [462]:
temp_text_2 = tf.strings.regex_replace(temp_text_1,"[^ a-z.?!,¿]","")  # [^ ...] means exclude..so excluding all the a-z and rest
temp_text_2                                                            # and replacing with noting

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'vayase.', b'hola.', b'corre!',
       b'corred.', b'\xc2\xbfquien?', b'fuego!', b'incendio!'],
      dtype=object)>

In [463]:
temp_text_3 = tf.strings.regex_replace(temp_text_2,"[.¡¿,?!]",r' \0 ') # Placing a null character[raw_string : r'']
temp_text_3                                                            # before and after every punctuation

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've . ', b'vete . ', b'vaya . ', b'vayase . ', b'hola . ',
       b'corre ! ', b'corred . ', b' \xc2\xbf quien ? ', b'fuego ! ',
       b'incendio ! '], dtype=object)>

In [464]:
temp_text_4= tf.strings.strip(temp_text_3) # stripping any extra spaces
temp_text_4

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've .', b'vete .', b'vaya .', b'vayase .', b'hola .', b'corre !',
       b'corred .', b'\xc2\xbf quien ?', b'fuego !', b'incendio !'],
      dtype=object)>

In [465]:
temp_text_5 = tf.strings.join(['[startofsequence]',temp_text_4,'[endofsequence]'],separator=" ")
temp_text_5

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'[startofsequence] ve . [endofsequence]',
       b'[startofsequence] vete . [endofsequence]',
       b'[startofsequence] vaya . [endofsequence]',
       b'[startofsequence] vayase . [endofsequence]',
       b'[startofsequence] hola . [endofsequence]',
       b'[startofsequence] corre ! [endofsequence]',
       b'[startofsequence] corred . [endofsequence]',
       b'[startofsequence] \xc2\xbf quien ? [endofsequence]',
       b'[startofsequence] fuego ! [endofsequence]',
       b'[startofsequence] incendio ! [endofsequence]'], dtype=object)>

In [466]:
def text_preprocessor(input_text):

    input_text = text.normalize_utf8(input_text,"NFKD")
    input_text = tf.strings.lower(input_text)
    input_text = tf.strings.regex_replace(input_text,"[^ a-z?.!¿¡,]","")
    input_text = tf.strings.regex_replace(input_text,"[?.!¿¡,]",r" \0 ")
    input_text = tf.strings.strip(input_text)
    input_text = tf.strings.join(["[startofsequence]",input_text,"[endofsequence]"],separator=" ")
    return input_text

# Text Vectorization of En and Es

In [467]:
vocab_size = 5000

en_vec_layer = keras.layers.TextVectorization(vocab_size,standardize=text_preprocessor,ragged=True)
en_vec_layer.adapt(raw_train.map(lambda en,es:en))
es_vec_layer = keras.layers.TextVectorization(vocab_size,standardize=text_preprocessor,ragged=True)
es_vec_layer.adapt(raw_train.map(lambda en,es:es))

In [468]:
print(en_vec_layer.get_vocabulary()[:10])
print(es_vec_layer.get_vocabulary()[:10])

['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'the', 'i', 'to', 'you', 'tom']
['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'que', 'de', 'el', 'a', 'no']


In [469]:
for english_text,en_vectorized_out in zip(en.numpy()[:4],en_vec_layer(en[:4])):
    print(english_text,"---->",en_vectorized_out)

b"Maybe Tom didn't have time." ----> tf.Tensor([  2 628   9  64  22  53   4   3], shape=(8,), dtype=int64)
b'He came from one of the richest families in America.' ----> tf.Tensor([   2   13  197   69   73   15    5 3222 2186   14  545    4    3], shape=(13,), dtype=int64)
b'Tom criticized Mary in front of everyone.' ----> tf.Tensor([   2    9 3561   31   14  752   15  298    4    3], shape=(10,), dtype=int64)
b"I'm looking forward to seeing you next Sunday." ----> tf.Tensor([  2  38 256 798   7 808   8 210 635   4   3], shape=(11,), dtype=int64)


In [470]:
for latin_text,es_vectorized_out in zip(es.numpy()[:4],es_vec_layer(es[:4])):
    print(latin_text,"---->",es_vectorized_out)

b'Puede que Tom no tuviera tiempo.' ----> tf.Tensor([   2   73    5   10    9 1154   62    4    3], shape=(9,), dtype=int64)
b'Vino de una de las familias m\xc3\xa1s ricas de Am\xc3\xa9rica.' ----> tf.Tensor([   2  232    6   23    6   33 2652   35 4341    6  983    4    3], shape=(13,), dtype=int64)
b'Tom critic\xc3\xb3 a Mary delante de todo el mundo.' ----> tf.Tensor([   2   10 3929    8   32 1601    6   56    7  190    4    3], shape=(12,), dtype=int64)
b'Espero con ganas a verte el pr\xc3\xb3ximo domingo.' ----> tf.Tensor([  2 251  27 439   8 435   7 617 914   4   3], shape=(11,), dtype=int64)


In [471]:
en_vocab = np.array(en_vec_layer.get_vocabulary())
es_vocab = np.array(es_vec_layer.get_vocabulary())

In [472]:
print(" ".join(en_vocab[en_vectorized_out.numpy()]))
print(" ".join(es_vocab[es_vectorized_out.numpy()]))

[startofsequence] im looking forward to seeing you next sunday . [endofsequence]
[startofsequence] espero con ganas a verte el proximo domingo . [endofsequence]


In [473]:
en_vec_out = en_vec_layer(en)
es_vec_out = es_vec_layer(es)

In [474]:
fig = make_subplots(cols=2,subplot_titles=["Unmasked","Masked"])
fig.add_trace(go.Heatmap(z=en_vec_out.to_tensor().numpy()),row=1,col=1)
fig.add_trace(go.Heatmap(z=np.array((en_vec_out.to_tensor() != 0).numpy(),dtype=np.int32)),row=1,col=2)
fig.show()

In [475]:
def preprocess_dataset(en,es):
    X_train = en_vec_layer(en).to_tensor()
    X_dec = es_vec_layer(es)
    X_dec_train = X_dec[:,:-1].to_tensor()
    y_train = X_dec[:,1:].to_tensor()

    return (X_train,X_dec_train),y_train

In [476]:
train_ds = raw_train.map(preprocess_dataset,tf.data.AUTOTUNE)
valid_ds = raw_valid.map(preprocess_dataset,tf.data.AUTOTUNE)

In [477]:
for (en_in,es_in),es_out in train_ds.take(1):
    print(en_in.shape)
    print(es_in.shape)
    print(es_out.shape)

(64, 20)
(64, 18)
(64, 18)


# Encoder Class

- Embedding Layer
- GRU/LSTM Layer

In [478]:
vocab_size = len(en_vec_layer.get_vocabulary())
vocab_size

5000

In [479]:
embed_size = 256

In [480]:
encoder_embed_layer = keras.layers.Embedding(vocab_size,embed_size,mask_zero=True)
encoder = keras.layers.Bidirectional(
    keras.layers.LSTM(256,return_sequences=True,recurrent_initializer="glorot_uniform"),
    merge_mode="sum"
)

In [481]:
shape_checker = ShapeCheck()
shape_checker(en_in,"batch s")

In [482]:
shape_checker.shapes

{'batch': 64, 's': 20}

In [483]:
enc_embed_output = encoder_embed_layer(en_in)
enc_embed_output.shape

TensorShape([64, 20, 256])

In [484]:
shape_checker(enc_embed_output,"batch s units")
shape_checker.shapes

{'batch': 64, 's': 20, 'units': 256}

In [485]:
encoder_outputs = encoder(enc_embed_output)
encoder_outputs.shape

TensorShape([64, 20, 256])

In [486]:
shape_checker(encoder_outputs,"batch s units")

In [487]:
mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=256)

In [488]:
print(en_in.shape)
print(encoder_outputs.shape)

(64, 20)
(64, 20, 256)


In [489]:
len(en_vec_layer.get_vocabulary())

5000

In [490]:
class Encoder(keras.layers.Layer):

    def __init__(self,units=256,vec_layer=en_vec_layer,**kwargs):

        super(Encoder,self).__init__(**kwargs)
        self.vec_layer = vec_layer
        self.embed = keras.layers.Embedding(vec_layer.vocabulary_size(),units,mask_zero=True)
        self.Rnn = keras.layers.Bidirectional(
            layer=keras.layers.LSTM(units,return_sequences=True,return_state=True,recurrent_initializer="glorot_uniform"),
            merge_mode="sum"
        )

    def call(self,inputs):

        shape_checker = ShapeCheck()
        shape_checker(inputs,"batch s")
        z = self.embed(inputs)
        shape_checker(z,"batch s units")
        z,*encoder_state = self.Rnn(z)
        self.encoder_state = encoder_state
        shape_checker(z,"batch s units")
        return z


# CrossAttention

In [491]:
decoder_embed_layer = keras.layers.Embedding(es_vec_layer.vocabulary_size(),256,mask_zero=True)
decoder_embed_out = decoder_embed_layer(es_in)
decoder_embed_out.shape

TensorShape([64, 18, 256])

In [492]:
mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=256)
attention_output,attention_scores = mha(query=decoder_embed_out,value=encoder_outputs,return_attention_scores=True)
print(attention_output.shape)
print(attention_scores.shape)

(64, 18, 256)
(64, 1, 18, 20)


In [493]:
shape_checker = ShapeCheck()
shape_checker(decoder_embed_out,"batch t units")
shape_checker(encoder_outputs,"batch s units")

In [494]:
attention_scores = tf.reduce_mean(attention_scores,axis=1)
attention_scores.shape

TensorShape([64, 18, 20])

In [495]:
adding_layer = keras.layers.Add()
add_out = adding_layer([decoder_embed_out,attention_output])
add_out.shape

TensorShape([64, 18, 256])

In [496]:
layer_norm = keras.layers.LayerNormalization()
layer_out = layer_norm(add_out)
layer_out.shape

TensorShape([64, 18, 256])

In [497]:
class CrossAttention(keras.layers.Layer):

    def __init__(self,units=256,**kwargs):

        super(CrossAttention,self).__init__(**kwargs)
        self.mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=units)
        self.add = keras.layers.Add()
        self.layer_norm = keras.layers.LayerNormalization()

    def call(self,decoder_out,encoder_out):

        shape_checker = ShapeCheck()
        shape_checker(decoder_out,"batch t units")
        shape_checker(encoder_out,"batch s units")

        attention_output,attention_scores = self.mha(query=decoder_out,value=encoder_out,return_attention_scores=True)
        shape_checker(attention_output,"batch t units")
        shape_checker(attention_scores,"batch heads t s")

        add_and_layer_norm = self.layer_norm(self.add([decoder_out,attention_output]))
        self.attention_scores = tf.reduce_mean(attention_scores,axis=1)

        return add_and_layer_norm

In [498]:
attention_layer = CrossAttention()

attention_out = attention_layer(decoder_embed_out,encoder_outputs)
attention_out.shape

TensorShape([64, 18, 256])

In [499]:
attention_layer.attention_scores.shape

TensorShape([64, 18, 20])

In [500]:
np.sum(attention_layer.attention_scores,axis=-1)[:5,:]

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.]], dtype=float32)

In [501]:
fig = make_subplots(cols=2,subplot_titles=["Attention Output","Masked Output"])
fig.add_trace(go.Heatmap(z=attention_layer.attention_scores[:,0,:]),row=1,col=1)
fig.add_trace(go.Heatmap(z=np.array((en_vec_out.to_tensor() != 0).numpy(),dtype=np.int32)),row=1,col=2)
fig.show()

# Decoder

In [502]:
class Decoder(keras.layers.Layer):

    def __init__(self,units=256,vec_layer=es_vec_layer,**kwargs):

        super(Decoder,self).__init__(**kwargs)

        '''Preprocessing Section'''
        self.vec_layer = vec_layer
        self.vocab_size = vec_layer.vocabulary_size()
        self.word_to_id = keras.layers.StringLookup(
            vocabulary=vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token=""
        )
        self.id_to_word = keras.layers.StringLookup(
            vocabulary=vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token="",
            invert=True
        )
        self.start_token = self.word_to_id('[startofsequence]')
        self.end_token = self.word_to_id('[endofsequence]')
        self.units = units


        '''Model Layers section'''
        self.es_embed = keras.layers.Embedding(vec_layer.vocabulary_size(),units,mask_zero=True)
        self.decoder_cell = keras.layers.LSTM(units,return_sequences=True,return_state=True,recurrent_initializer="glorot_uniform")
        self.attention = CrossAttention()
        self.out = keras.layers.Dense(vec_layer.vocabulary_size())


    def call(self,encoder_outputs,decoder_inputs,encoder_state=None,return_state=False):

        shape_checker = ShapeCheck()
        shape_checker(encoder_outputs,"batch s units")
        shape_checker(decoder_inputs,"batch t")
        if encoder_state is not None:
            shape_checker(encoder_state[0],"batch units")
            shape_checker(encoder_state[1],"batch units")

        es_embed_out = self.es_embed(decoder_inputs)
        shape_checker(es_embed_out,"batch t units")



        decoder_outputs,*decoder_state = self.decoder_cell(es_embed_out,initial_state=encoder_state)
        shape_checker(decoder_outputs,"batch t units")
        shape_checker(decoder_state[0],"batch units")
        shape_checker(decoder_state[1],"batch units")

        attention_out = self.attention(decoder_outputs,encoder_outputs)
        shape_checker(attention_out,"batch t units")
        shape_checker(self.attention.attention_scores,"batch t s")

        total_out = self.out(attention_out)

        if return_state:
            return total_out,decoder_state
        else:
            return total_out


    def get_initial_state(self,encoder_outputs):
        batch_size = tf.shape(encoder_outputs)[0]
        start_tokens = tf.fill(dims=[batch_size,1],value=self.start_token)
        done = tf.zeros(shape=[batch_size,1],dtype=tf.bool)
        embedding = self.es_embed(start_tokens)
        return start_tokens,done,self.decoder_cell.get_initial_state(embedding)


    def tokens_to_text(self,tokens):
        text_ = self.id_to_word(tokens)
        text_ = tf.strings.reduce_join(text_,axis=-1,separator=" ")
        text_ = tf.strings.regex_replace(text_,"^ *\[startofsequence\] *","")
        text_ = tf.strings.regex_replace(text_," *\[endofsequence\] *$","")
        return text_

    def get_next_tokens(self,encoder_outputs,next_token,done,state,temperature=0.0):
        total_output,state = self(encoder_outputs,next_token,encoder_state=state,return_state=True)

        if temperature:
            next_token = tf.argmax(total_output,axis=-1)
        else:
            scaled_out = total_output/temperature
            next_token = tf.random.categorical(scaled_out[:,-1,:],num_samples=1,seed=42)

        done = done | (next_token == self.end_token)
        next_token = tf.where(done,tf.constant(0,dtype=tf.int64),next_token)
        return next_token,done,state

In [503]:
decoder = Decoder()

In [504]:
next_token,done,state = decoder.get_initial_state(encoder_outputs)
tokens_list = []

for i in range(10):

    next_token,done,state = decoder.get_next_tokens(encoder_outputs,next_token,done,state,temperature=1)
    tokens_list.append(next_token)

In [505]:
decoder.tokens_to_text(tf.concat(tokens_list,axis=-1))[:3]

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'opinas medianoche pierdas pierdas ataque ataque estudiantes estudiantes ! podias',
       b'opinas medianoche pierdas pierdas ataque ataque estudiantes estudiantes espectaculo camisa',
       b'opinas medianoche pierdas pierdas ataque ataque estudiantes estudiantes ! podias'],
      dtype=object)>

# Translator Total Model

In [506]:
%xmode Context

Exception reporting mode: Context


In [507]:
class Translator(keras.models.Model):

    @classmethod
    def add_method(cls,func):
        setattr(cls,func.__name__,func)
        return func

    def __init__(self,units=256,en_layer=en_vec_layer,es_layer=es_vec_layer,**kwargs):

        super(Translator,self).__init__(**kwargs)

        self.encoder = Encoder(units,en_layer)
        self.decoder = Decoder(units,es_layer)


    def call(self,inputs):

        encoder_inputs,decoder_inputs = inputs

        encoder_outputs = self.encoder(encoder_inputs)
        total_out = self.decoder(encoder_outputs,decoder_inputs)

        try:
            del total_out._keras_mask
        except AssertionError as error:
            pass


        return total_out

In [508]:
translator = Translator()
out = translator((en_in,es_in))
print("english inputs (batch s)",en_in.shape)
print("spanish inputs (batch t)",es_in.shape)
print("logits outputs (batch t vocab_size)",out.shape)

english inputs (batch s) (64, 20)
spanish inputs (batch t) (64, 18)
logits outputs (batch t vocab_size) (64, 18, 5000)


In [509]:
for (en_in,es_in),y_in in train_ds.take(1):
    print(en_in.shape)
    print(es_in.shape)
    print(y_in.shape)

(64, 17)
(64, 15)
(64, 15)


In [510]:
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="none")
y_pred = translator((en_in,es_in))
y_true = y_in
calc_loss = loss_fn(y_true,y_pred)
calc_loss

<tf.Tensor: shape=(64, 15), dtype=float32, numpy=
array([[8.51, 8.4 , 8.46, 8.61, 8.5 , 8.43, 8.42, 8.44, 8.53, 8.53, 8.53,
        8.53, 8.53, 8.53, 8.53],
       [8.59, 8.57, 8.55, 8.46, 8.51, 8.52, 8.52, 8.52, 8.52, 8.52, 8.52,
        8.52, 8.52, 8.52, 8.52],
       [8.57, 8.56, 8.47, 8.61, 8.52, 8.5 , 8.44, 8.49, 8.49, 8.49, 8.49,
        8.49, 8.49, 8.49, 8.49],
       [8.6 , 8.61, 8.5 , 8.56, 8.5 , 8.53, 8.5 , 8.5 , 8.5 , 8.5 , 8.5 ,
        8.5 , 8.5 , 8.5 , 8.5 ],
       [8.51, 8.45, 8.52, 8.52, 8.45, 8.54, 8.51, 8.48, 8.51, 8.46, 8.46,
        8.46, 8.46, 8.46, 8.46],
       [8.67, 8.57, 8.57, 8.64, 8.53, 8.67, 8.58, 8.56, 8.53, 8.52, 8.46,
        8.46, 8.46, 8.46, 8.46],
       [8.5 , 8.59, 8.5 , 8.55, 8.37, 8.48, 8.53, 8.46, 8.49, 8.44, 8.57,
        8.57, 8.57, 8.57, 8.57],
       [8.52, 8.49, 8.46, 8.51, 8.67, 8.55, 8.56, 8.48, 8.52, 8.51, 8.44,
        8.6 , 8.58, 8.67, 8.59],
       [8.55, 8.51, 8.57, 8.57, 8.44, 8.5 , 8.5 , 8.58, 8.58, 8.58, 8.58,
        8.58, 8.58, 

In [511]:
mask = tf.cast(y_true != 0,calc_loss.dtype)
mask

<tf.Tensor: shape=(64, 15), dtype=float32, numpy=
array([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0

In [512]:
calc_loss *= mask
calc_loss

<tf.Tensor: shape=(64, 15), dtype=float32, numpy=
array([[8.51, 8.4 , 8.46, 8.61, 8.5 , 8.43, 8.42, 8.44, 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.59, 8.57, 8.55, 8.46, 8.51, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.57, 8.56, 8.47, 8.61, 8.52, 8.5 , 8.44, 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.6 , 8.61, 8.5 , 8.56, 8.5 , 8.53, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.51, 8.45, 8.52, 8.52, 8.45, 8.54, 8.51, 8.48, 8.51, 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.67, 8.57, 8.57, 8.64, 8.53, 8.67, 8.58, 8.56, 8.53, 8.52, 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.5 , 8.59, 8.5 , 8.55, 8.37, 8.48, 8.53, 8.46, 8.49, 8.44, 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [8.52, 8.49, 8.46, 8.51, 8.67, 8.55, 8.56, 8.48, 8.52, 8.51, 8.44,
        8.6 , 8.58, 8.67, 8.59],
       [8.55, 8.51, 8.57, 8.57, 8.44, 8.5 , 8.5 , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 

In [513]:
reduced_loss = tf.reduce_sum(calc_loss)
reduced_loss

<tf.Tensor: shape=(), dtype=float32, numpy=4318.413>

In [514]:
reduced_mask = tf.reduce_sum(mask)
reduced_mask

<tf.Tensor: shape=(), dtype=float32, numpy=507.0>

In [515]:
reduced_loss/reduced_mask

<tf.Tensor: shape=(), dtype=float32, numpy=8.51758>

In [516]:
loss_fn_reduced = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn_reduced(y_true,y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=8.511817>

In [517]:
y_pred = tf.argmax(y_true,axis=-1)
y_pred

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([ 5,  1,  2,  0,  0,  1,  1, 12,  4,  5,  2,  2,  2,  2,  5,  5,  3,
        2,  2,  4,  2,  1,  0,  4,  1,  2,  6,  4,  1,  1,  1,  1,  3, 11,
        3,  2,  1,  1,  3,  3,  2,  8,  1,  4,  5,  2,  3,  4,  3,  2,  2,
       10,  2,  4,  2,  3,  9,  5,  3,  6,  6,  6,  3,  0])>

In [518]:
y_pred = tf.cast(y_pred,y_true.dtype)
y_pred

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([ 5,  1,  2,  0,  0,  1,  1, 12,  4,  5,  2,  2,  2,  2,  5,  5,  3,
        2,  2,  4,  2,  1,  0,  4,  1,  2,  6,  4,  1,  1,  1,  1,  3, 11,
        3,  2,  1,  1,  3,  3,  2,  8,  1,  4,  5,  2,  3,  4,  3,  2,  2,
       10,  2,  4,  2,  3,  9,  5,  3,  6,  6,  6,  3,  0])>

In [519]:
accuracy_tensor = tf.cast(y_true == y_pred, tf.float32)
accuracy_tensor

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [520]:
mask = tf.cast(y_true != 0,tf.float32)
mask

<tf.Tensor: shape=(64, 15), dtype=float32, numpy=
array([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0

In [521]:
tf.reduce_sum(accuracy_tensor)/tf.reduce_sum(mask)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [522]:
def masked_loss(y_true,y_pred):
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="none")
    loss = loss_fn(y_true,y_pred)
    mask = tf.cast(y_true != 0,y_true.dtype)
    loss *= mask
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def masked_accuracy(y_true,y_pred):
    y_pred = tf.argmax(y_pred,axis=-1,output_type=y_true.dtype)
    accuracy = tf.cast(y_true == y_pred,tf.float32)
    mask = tf.cast(y_true != 0,tf.float32)
    return tf.reduce_sum(accuracy)/tf.reduce_sum(mask)

In [523]:
masked_accuracy(y_in,translator((en_in,es_in)))

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [524]:
translator.compile(optimizer="adam",loss=masked_loss,metrics=[masked_accuracy,masked_loss])

In [525]:
for i,y in valid_ds:
    print(i[0].shape)
    print(i[1].shape)
    print(y.shape)

(64, 19)
(64, 16)
(64, 16)
(64, 17)
(64, 17)
(64, 17)
(64, 22)
(64, 18)
(64, 18)
(64, 16)
(64, 17)
(64, 17)
(64, 17)
(64, 16)
(64, 16)
(64, 29)
(64, 25)
(64, 25)
(64, 16)
(64, 17)
(64, 17)
(64, 17)
(64, 19)
(64, 19)
(64, 20)
(64, 15)
(64, 15)
(64, 17)
(64, 17)
(64, 17)
(64, 17)
(64, 17)
(64, 17)
(64, 17)
(64, 16)
(64, 16)
(64, 19)
(64, 20)
(64, 20)
(64, 19)
(64, 17)
(64, 17)
(64, 25)
(64, 27)
(64, 27)
(64, 17)
(64, 15)
(64, 15)
(64, 17)
(64, 16)
(64, 16)
(64, 20)
(64, 16)
(64, 16)
(64, 19)
(64, 17)
(64, 17)
(64, 20)
(64, 20)
(64, 20)
(64, 17)
(64, 16)
(64, 16)
(64, 17)
(64, 17)
(64, 17)
(64, 17)
(64, 15)
(64, 15)
(64, 21)
(64, 19)
(64, 19)
(64, 17)
(64, 15)
(64, 15)
(64, 15)
(64, 15)
(64, 15)
(64, 16)
(64, 17)
(64, 17)
(64, 20)
(64, 19)
(64, 19)
(64, 15)
(64, 16)
(64, 16)
(64, 17)
(64, 18)
(64, 18)
(64, 18)
(64, 14)
(64, 14)
(64, 26)
(64, 27)
(64, 27)
(64, 14)
(64, 13)
(64, 13)
(64, 22)
(64, 19)
(64, 19)
(64, 14)
(64, 13)
(64, 13)
(64, 16)
(64, 14)
(64, 14)
(64, 19)
(64, 18)
(64, 18)
(

In [526]:
translator.evaluate(valid_ds,steps=20,return_dict=True)

ValueError: ignored