<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/NMT_with_attention_google.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os,warnings,sys
if "google.colab" in sys.modules:
    !pip3 install -q -U "tensorflow-text==2.13.0"
    !pip3 install -q -U einops
warnings.filterwarnings("ignore")
from IPython.display import clear_output
os.environ["TF_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
import typing
from zipfile import ZipFile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import einops
pio.templates.default = "plotly_dark"
import numpy as np
np.set_printoptions(precision=2)
tf.get_logger().setLevel("ERROR")
%xmode Minimal
if "google.colab" in sys.modules:
    clear_output()

In [2]:

class ShapeCheck():

    def __init__(self):

        self.shapes = {}

    def __call__(self,tensor,names,broadcast=False):

        parsed = einops.parse_shape(tensor,names)

        for name,new_dim in parsed.items():

            old_dim = self.shapes.get(name,None)

            if broadcast and (new_dim == 1):
                continue

            if old_dim is None:

                self.shapes[name] = new_dim
                continue

            if new_dim != old_dim:

                raise ValueError(f"SHAPE MISTMATCH FOR DIMENSION: '{name}' FOUND: {new_dim} EXPECTED: {old_dim}")

In [3]:
url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

In [4]:
file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url,extract=True)

with ZipFile(file_path,"r") as f:

    f.extractall("spa-eng")

with open("spa-eng/spa-eng/spa.txt","r") as f:

    total_text = f.read()
    total_text = [line.split("\t") for line in total_text.splitlines()]
    en_text,es_text = zip(*total_text)

In [5]:
en_text[-1]

'If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.'

In [6]:
es_text[-1]

'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'

In [7]:
en_array = np.array(en_text)
es_array = np.array(es_text)

In [8]:
is_train = np.random.uniform(size=(len(en_array),)) < 0.8

raw_train = (
    tf.data.Dataset
    .from_tensor_slices((en_array[is_train],es_array[is_train]))
    .shuffle(len(en_text))
    .batch(64)
)
raw_valid = (
    tf.data.Dataset
    .from_tensor_slices((en_array[~is_train],es_array[~is_train]))
    .shuffle(len(en_text))
    .batch(64)
)

In [9]:
for en,es in raw_train.take(1):
    print(en[:4])
    print("translates to latin as ")
    print(es[:4])

tf.Tensor(
[b'Every dog has his day.' b'Is somebody there?' b'We watch TV every day.'
 b'Tom accidentally set fire to the curtain.'], shape=(4,), dtype=string)
translates to latin as 
tf.Tensor(
[b'A cada santo le llega su d\xc3\xada.' b'\xc2\xbfHay alguien?'
 b'Nosotros vemos televisi\xc3\xb3n todos los d\xc3\xadas.'
 b'Tom le prendi\xc3\xb3 fuego accidentalmente a la cortina.'], shape=(4,), dtype=string)


# Standardize Text

In [10]:
es_text[:10]

('Ve.',
 'Vete.',
 'Vaya.',
 'Váyase.',
 'Hola.',
 '¡Corre!',
 'Corred.',
 '¿Quién?',
 '¡Fuego!',
 '¡Incendio!')

In [11]:
tf.constant(es_text[:10]) # converting to tensor

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'V\xc3\xa1yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQui\xc3\xa9n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [12]:
temp_text = text.normalize_utf8(es_text[:10],"NFKD") # Normalizing text so that it can be used in operations
temp_text

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'Va\xcc\x81yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQuie\xcc\x81n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [13]:
temp_text_1 = tf.strings.lower(temp_text) # Lower casing all the characters
temp_text_1

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'va\xcc\x81yase.', b'hola.',
       b'\xc2\xa1corre!', b'corred.', b'\xc2\xbfquie\xcc\x81n?',
       b'\xc2\xa1fuego!', b'\xc2\xa1incendio!'], dtype=object)>

In [14]:
temp_text_2 = tf.strings.regex_replace(temp_text_1,"[^ a-z.?!,¿]","")  # [^ ...] means exclude..so excluding all the a-z and rest
temp_text_2                                                            # and replacing with noting

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'vayase.', b'hola.', b'corre!',
       b'corred.', b'\xc2\xbfquien?', b'fuego!', b'incendio!'],
      dtype=object)>

In [15]:
temp_text_3 = tf.strings.regex_replace(temp_text_2,"[.¡¿,?!]",r' \0 ') # Placing a null character[raw_string : r'']
temp_text_3                                                            # before and after every punctuation

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've . ', b'vete . ', b'vaya . ', b'vayase . ', b'hola . ',
       b'corre ! ', b'corred . ', b' \xc2\xbf quien ? ', b'fuego ! ',
       b'incendio ! '], dtype=object)>

In [16]:
temp_text_4= tf.strings.strip(temp_text_3) # stripping any extra spaces
temp_text_4

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've .', b'vete .', b'vaya .', b'vayase .', b'hola .', b'corre !',
       b'corred .', b'\xc2\xbf quien ?', b'fuego !', b'incendio !'],
      dtype=object)>

In [17]:
temp_text_5 = tf.strings.join(['[startofsequence]',temp_text_4,'[endofsequence]'],separator=" ")
temp_text_5

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'[startofsequence] ve . [endofsequence]',
       b'[startofsequence] vete . [endofsequence]',
       b'[startofsequence] vaya . [endofsequence]',
       b'[startofsequence] vayase . [endofsequence]',
       b'[startofsequence] hola . [endofsequence]',
       b'[startofsequence] corre ! [endofsequence]',
       b'[startofsequence] corred . [endofsequence]',
       b'[startofsequence] \xc2\xbf quien ? [endofsequence]',
       b'[startofsequence] fuego ! [endofsequence]',
       b'[startofsequence] incendio ! [endofsequence]'], dtype=object)>

In [18]:
def text_preprocessor(input_text):

    input_text = text.normalize_utf8(input_text,"NFKD")
    input_text = tf.strings.lower(input_text)
    input_text = tf.strings.regex_replace(input_text,"[^ a-z?.!¿¡,]","")
    input_text = tf.strings.regex_replace(input_text,"[?.!¿¡,]",r" \0 ")
    input_text = tf.strings.strip(input_text)
    input_text = tf.strings.join(["[startofsequence]",input_text,"[endofsequence]"],separator=" ")
    return input_text

# Text Vectorization of En and Es

In [19]:
vocab_size = 5000

en_vec_layer = keras.layers.TextVectorization(vocab_size,standardize=text_preprocessor,ragged=True)
en_vec_layer.adapt(raw_train.map(lambda en,es:en))
es_vec_layer = keras.layers.TextVectorization(vocab_size,standardize=text_preprocessor,ragged=True)
es_vec_layer.adapt(raw_train.map(lambda en,es:es))

In [20]:
print(en_vec_layer.get_vocabulary()[:10])
print(es_vec_layer.get_vocabulary()[:10])

['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'the', 'i', 'to', 'you', 'tom']
['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'que', 'de', 'el', 'a', 'no']


In [21]:
for english_text,en_vectorized_out in zip(en.numpy()[:4],en_vec_layer(en[:4])):
    print(english_text,"---->",en_vectorized_out)

b'Every dog has his day.' ----> tf.Tensor([  2 154 175  48  30 109   4   3], shape=(8,), dtype=int64)
b'Is somebody there?' ----> tf.Tensor([  2  12 723  53  11   3], shape=(6,), dtype=int64)
b'We watch TV every day.' ----> tf.Tensor([  2  31 288 366 154 109   4   3], shape=(8,), dtype=int64)
b'Tom accidentally set fire to the curtain.' ----> tf.Tensor([   2    9 3942  832  430    7    5 2753    4    3], shape=(10,), dtype=int64)


In [22]:
for latin_text,es_vectorized_out in zip(es.numpy()[:4],es_vec_layer(es[:4])):
    print(latin_text,"---->",es_vectorized_out)

b'A cada santo le llega su d\xc3\xada.' ----> tf.Tensor([   2    8  225    1   28 1047   25  101    4    3], shape=(10,), dtype=int64)
b'\xc2\xbfHay alguien?' ----> tf.Tensor([  2  13  62 155  12   3], shape=(6,), dtype=int64)
b'Nosotros vemos televisi\xc3\xb3n todos los d\xc3\xadas.' ----> tf.Tensor([   2  164 1607  322   66   26  153    4    3], shape=(9,), dtype=int64)
b'Tom le prendi\xc3\xb3 fuego accidentalmente a la cortina.' ----> tf.Tensor([   2   10   28 2616  649    1    8   11 3691    4    3], shape=(11,), dtype=int64)


In [23]:
en_vocab = np.array(en_vec_layer.get_vocabulary())
es_vocab = np.array(es_vec_layer.get_vocabulary())

In [24]:
print(" ".join(en_vocab[en_vectorized_out.numpy()]))
print(" ".join(es_vocab[es_vectorized_out.numpy()]))

[startofsequence] tom accidentally set fire to the curtain . [endofsequence]
[startofsequence] tom le prendio fuego [UNK] a la cortina . [endofsequence]


In [25]:
en_vec_out = en_vec_layer(en)
es_vec_out = es_vec_layer(es)

In [26]:
fig = make_subplots(cols=2,subplot_titles=["Unmasked","Masked"])
fig.add_trace(go.Heatmap(z=en_vec_out.to_tensor().numpy()),row=1,col=1)
fig.add_trace(go.Heatmap(z=np.array((en_vec_out.to_tensor() != 0).numpy(),dtype=np.int32)),row=1,col=2)
fig.show()

In [27]:
def preprocess_dataset(en,es):
    X_train = en_vec_layer(en).to_tensor()
    X_dec = es_vec_layer(es)
    X_dec_train = X_dec[:,:-1].to_tensor()
    y_train = X_dec[:,1:].to_tensor()

    return (X_train,X_dec_train),y_train

In [28]:
train_ds = raw_train.map(preprocess_dataset,tf.data.AUTOTUNE)
valid_ds = raw_valid.map(preprocess_dataset,tf.data.AUTOTUNE)

In [29]:
train_shapes = []
valid_shapes = []
for i,y in train_ds:
    train_shapes.append(i[0].shape[0])

for i,y in valid_ds:
    valid_shapes.append(i[0].shape[0])

In [30]:
print(train_shapes[-3:])
print(valid_shapes[-3:])

[64, 64, 25]
[64, 64, 27]


In [31]:
for (en_in,es_in),es_out in train_ds.take(1):
    print(en_in.shape)
    print(es_in.shape)
    print(es_out.shape)

(64, 18)
(64, 21)
(64, 21)


# Encoder Class

- Embedding Layer
- GRU/LSTM Layer

In [32]:
vocab_size = len(en_vec_layer.get_vocabulary())
vocab_size

5000

In [33]:
embed_size = 256

In [34]:
encoder_embed_layer = keras.layers.Embedding(vocab_size,embed_size,mask_zero=True)
encoder = keras.layers.Bidirectional(
    keras.layers.LSTM(256,return_sequences=True,recurrent_initializer="glorot_uniform"),
    merge_mode="sum"
)

In [35]:
shape_checker = ShapeCheck()
shape_checker(en_in,"batch s")

In [36]:
shape_checker.shapes

{'batch': 64, 's': 18}

In [37]:
enc_embed_output = encoder_embed_layer(en_in)
enc_embed_output.shape

TensorShape([64, 18, 256])

In [38]:
shape_checker(enc_embed_output,"batch s units")
shape_checker.shapes

{'batch': 64, 's': 18, 'units': 256}

In [39]:
encoder_outputs = encoder(enc_embed_output)
encoder_outputs.shape

TensorShape([64, 18, 256])

In [40]:
shape_checker(encoder_outputs,"batch s units")

In [41]:
mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=256)

In [42]:
print(en_in.shape)
print(encoder_outputs.shape)

(64, 18)
(64, 18, 256)


In [43]:
len(en_vec_layer.get_vocabulary())

5000

In [44]:
class Encoder(keras.layers.Layer):

    def __init__(self,units=256,vec_layer=en_vec_layer,**kwargs):

        super(Encoder,self).__init__(**kwargs)
        self.vec_layer = vec_layer
        self.embed = keras.layers.Embedding(vec_layer.vocabulary_size(),units,mask_zero=True)
        self.encoder = keras.layers.Bidirectional(
            layer=keras.layers.LSTM(units,return_sequences=True,return_state=True,recurrent_initializer="glorot_uniform"),
            merge_mode="sum"
        )

    def call(self,inputs):

        shape_checker = ShapeCheck()
        shape_checker(inputs,"batch s")
        en_embed_out = self.embed(inputs)
        shape_checker(en_embed_out,"batch s units")
        encoder_outputs,*encoder_state = self.encoder(en_embed_out)
        self.encoder_state = encoder_state
        shape_checker(encoder_outputs,"batch s units")
        return encoder_outputs


    def convert_encoder_inputs_to_encoder_outputs(self,texts):
        texts = tf.convert_to_tensor(texts)
        if len(texts.shape) == 0:
            texts = tf.convert_to_tensor(texts)[tf.newaxis]
        encoder_inputs = self.vec_layer(texts).to_tensor()
        return self(encoder_inputs)


# CrossAttention

In [45]:
decoder_embed_layer = keras.layers.Embedding(es_vec_layer.vocabulary_size(),256,mask_zero=True)
decoder_embed_out = decoder_embed_layer(es_in)
decoder_embed_out.shape

TensorShape([64, 21, 256])

In [46]:
mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=256)
attention_output,attention_scores = mha(query=decoder_embed_out,value=encoder_outputs,return_attention_scores=True)
print(attention_output.shape)
print(attention_scores.shape)

(64, 21, 256)
(64, 1, 21, 18)


In [47]:
shape_checker = ShapeCheck()
shape_checker(decoder_embed_out,"batch t units")
shape_checker(encoder_outputs,"batch s units")

In [48]:
attention_scores = tf.reduce_mean(attention_scores,axis=1)
attention_scores.shape

TensorShape([64, 21, 18])

In [49]:
adding_layer = keras.layers.Add()
add_out = adding_layer([decoder_embed_out,attention_output])
add_out.shape

TensorShape([64, 21, 256])

In [50]:
layer_norm = keras.layers.LayerNormalization()
layer_out = layer_norm(add_out)
layer_out.shape

TensorShape([64, 21, 256])

In [51]:
class CrossAttention(keras.layers.Layer):

    def __init__(self,units=256,**kwargs):

        super(CrossAttention,self).__init__(**kwargs)
        self.mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=units)
        self.add = keras.layers.Add()
        self.layer_norm = keras.layers.LayerNormalization()

    def call(self,decoder_out,encoder_out):

        shape_checker = ShapeCheck()
        shape_checker(decoder_out,"batch t units")
        shape_checker(encoder_out,"batch s units")

        attention_output,attention_scores = self.mha(query=decoder_out,value=encoder_out,return_attention_scores=True)
        shape_checker(attention_output,"batch t units")
        shape_checker(attention_scores,"batch heads t s")

        add_and_layer_norm = self.layer_norm(self.add([decoder_out,attention_output]))
        self.attention_scores = tf.reduce_mean(attention_scores,axis=1)

        return add_and_layer_norm

In [52]:
attention_layer = CrossAttention()

attention_out = attention_layer(decoder_embed_out,encoder_outputs)
attention_out.shape

TensorShape([64, 21, 256])

In [53]:
attention_layer.attention_scores.shape

TensorShape([64, 21, 18])

In [54]:
np.sum(attention_layer.attention_scores,axis=-1)[:5,:]

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.]], dtype=float32)

In [55]:
fig = make_subplots(cols=2,subplot_titles=["Attention Output","Masked Output"])
fig.add_trace(go.Heatmap(z=attention_layer.attention_scores[:,0,:]),row=1,col=1)
fig.add_trace(go.Heatmap(z=np.array((en_vec_out.to_tensor() != 0).numpy(),dtype=np.int32)),row=1,col=2)
fig.show()

# Decoder

In [56]:
class Decoder(keras.layers.Layer):

    def __init__(self,units=256,vec_layer=es_vec_layer,**kwargs):

        super(Decoder,self).__init__(**kwargs)

        '''Preprocessing Section'''
        self.vec_layer = vec_layer
        self.vocab_size = vec_layer.vocabulary_size()
        self.word_to_id = keras.layers.StringLookup(
            vocabulary=vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token=""
        )
        self.id_to_word = keras.layers.StringLookup(
            vocabulary=vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token="",
            invert=True
        )
        self.start_token = self.word_to_id('[startofsequence]')
        self.end_token = self.word_to_id('[endofsequence]')
        self.units = units


        '''Model Layers section'''
        self.es_embed = keras.layers.Embedding(vec_layer.vocabulary_size(),units,mask_zero=True)
        self.decoder_cell = keras.layers.LSTM(units,return_sequences=True,return_state=True,recurrent_initializer="glorot_uniform")
        self.attention = CrossAttention()
        self.out = keras.layers.Dense(vec_layer.vocabulary_size())


    def call(self,encoder_outputs,decoder_inputs,encoder_state=None,return_state=False):

        shape_checker = ShapeCheck()
        shape_checker(encoder_outputs,"batch s units")
        shape_checker(decoder_inputs,"batch t")
        if encoder_state is not None:
            shape_checker(encoder_state[0],"batch units")
            shape_checker(encoder_state[1],"batch units")

        es_embed_out = self.es_embed(decoder_inputs)
        shape_checker(es_embed_out,"batch t units")



        decoder_outputs,*decoder_state = self.decoder_cell(es_embed_out,initial_state=encoder_state)
        shape_checker(decoder_outputs,"batch t units")
        shape_checker(decoder_state[0],"batch units")
        shape_checker(decoder_state[1],"batch units")

        attention_out = self.attention(decoder_outputs,encoder_outputs)
        shape_checker(attention_out,"batch t units")
        shape_checker(self.attention.attention_scores,"batch t s")

        total_out = self.out(attention_out)

        if return_state:
            return total_out,decoder_state
        else:
            return total_out


    def get_initial_state(self,encoder_outputs):
        batch_size = tf.shape(encoder_outputs)[0]
        start_tokens = tf.fill(dims=[batch_size,1],value=self.start_token)
        done = tf.zeros(shape=[batch_size,1],dtype=tf.bool)
        embedding = self.es_embed(start_tokens)
        return start_tokens,done,self.decoder_cell.get_initial_state(embedding)


    def tokens_to_text(self,tokens):
        text_ = self.id_to_word(tokens)
        text_ = tf.strings.reduce_join(text_,axis=-1,separator=" ")
        text_ = tf.strings.regex_replace(text_,"^ *\[startofsequence\] *","")
        text_ = tf.strings.regex_replace(text_," *\[endofsequence\] *$","")
        return text_

    def get_next_tokens(self,encoder_outputs,next_token,done,state,temperature=0.0):
        total_output,state = self(encoder_outputs,next_token,encoder_state=state,return_state=True)

        if temperature:
            next_token = tf.argmax(total_output,axis=-1)
        else:
            scaled_out = total_output/temperature
            next_token = tf.random.categorical(scaled_out[:,-1,:],num_samples=1,seed=42)

        done = done | (next_token == self.end_token)
        next_token = tf.where(done,tf.constant(0,dtype=tf.int64),next_token)
        return next_token,done,state

In [57]:
decoder = Decoder()

In [58]:
next_token,done,state = decoder.get_initial_state(encoder_outputs)
tokens_list = []

for i in range(10):

    next_token,done,state = decoder.get_next_tokens(encoder_outputs,next_token,done,state,temperature=1)
    tokens_list.append(next_token)

In [59]:
decoder.tokens_to_text(tf.concat(tokens_list,axis=-1))[:3]

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'noche economico enviado enviado primer favorito envio ello terrible terrible',
       b'noche criado planeta construyeron frente prestar prestar quedese quedese marcho',
       b'noche naranjas pequena oportunidad salvado saliera cuadro separado piscina aprendio'],
      dtype=object)>

# Translator Total Model

In [60]:
%xmode Context

Exception reporting mode: Context


In [61]:
class Translator(keras.models.Model):

    @classmethod
    def add_method(cls,func):
        setattr(cls,func.__name__,func)
        return func

    def __init__(self,units=256,en_layer=en_vec_layer,es_layer=es_vec_layer,**kwargs):

        super(Translator,self).__init__(**kwargs)

        self.encoder = Encoder(units,en_layer)
        self.decoder = Decoder(units,es_layer)


    def call(self,inputs):

        encoder_inputs,decoder_inputs = inputs

        encoder_outputs = self.encoder(encoder_inputs)
        total_out = self.decoder(encoder_outputs,decoder_inputs)


        return total_out

In [62]:
translator = Translator()
out = translator((en_in,es_in))
print("english inputs (batch s)",en_in.shape)
print("spanish inputs (batch t)",es_in.shape)
print("logits outputs (batch t vocab_size)",out.shape)

english inputs (batch s) (64, 18)
spanish inputs (batch t) (64, 21)
logits outputs (batch t vocab_size) (64, 21, 5000)


In [63]:
for (en_in,es_in),y_in in train_ds.take(1):
    print(en_in.shape)
    print(es_in.shape)
    print(y_in.shape)

(64, 19)
(64, 17)
(64, 17)


In [64]:
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="none")
y_pred = translator((en_in,es_in))
y_true = y_in
calc_loss = loss_fn(y_true,y_pred)
calc_loss

<tf.Tensor: shape=(64, 17), dtype=float32, numpy=
array([[8.57, 8.44, 8.58, ..., 0.  , 0.  , 0.  ],
       [8.62, 8.47, 8.62, ..., 0.  , 0.  , 0.  ],
       [8.52, 8.49, 8.55, ..., 8.4 , 8.47, 0.  ],
       ...,
       [8.5 , 8.54, 8.5 , ..., 0.  , 0.  , 0.  ],
       [8.55, 8.51, 8.5 , ..., 0.  , 0.  , 0.  ],
       [8.52, 8.49, 8.52, ..., 0.  , 0.  , 0.  ]], dtype=float32)>

In [65]:
mask = tf.cast(y_true != 0,calc_loss.dtype)
mask

<tf.Tensor: shape=(64, 17), dtype=float32, numpy=
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>

In [66]:
calc_loss *= mask
calc_loss

<tf.Tensor: shape=(64, 17), dtype=float32, numpy=
array([[8.57, 8.44, 8.58, ..., 0.  , 0.  , 0.  ],
       [8.62, 8.47, 8.62, ..., 0.  , 0.  , 0.  ],
       [8.52, 8.49, 8.55, ..., 8.4 , 8.47, 0.  ],
       ...,
       [8.5 , 8.54, 8.5 , ..., 0.  , 0.  , 0.  ],
       [8.55, 8.51, 8.5 , ..., 0.  , 0.  , 0.  ],
       [8.52, 8.49, 8.52, ..., 0.  , 0.  , 0.  ]], dtype=float32)>

In [67]:
reduced_loss = tf.reduce_sum(calc_loss)
reduced_loss

<tf.Tensor: shape=(), dtype=float32, numpy=4392.4985>

In [68]:
reduced_mask = tf.reduce_sum(mask)
reduced_mask

<tf.Tensor: shape=(), dtype=float32, numpy=515.0>

In [69]:
reduced_loss/reduced_mask

<tf.Tensor: shape=(), dtype=float32, numpy=8.529123>

In [70]:
loss_fn_reduced = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn_reduced(y_true,y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=8.529122>

In [71]:
y_pred = tf.argmax(y_true,axis=-1)
y_pred

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([ 1,  5, 11,  4,  0,  0,  4,  1,  5,  3,  1, 14,  4,  3,  2,  7,  0,
        4,  2,  0,  0,  0,  2,  1,  9,  7,  2,  6,  7,  4, 10,  2,  2,  2,
        3,  2,  2,  0,  2,  7,  2,  4,  3,  6,  6,  3,  4,  3,  5,  4,  1,
        3,  4,  4,  0,  1,  4,  0,  3,  1,  1,  1,  4,  1])>

In [72]:
y_pred = tf.cast(y_pred,y_true.dtype)
y_pred

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([ 1,  5, 11,  4,  0,  0,  4,  1,  5,  3,  1, 14,  4,  3,  2,  7,  0,
        4,  2,  0,  0,  0,  2,  1,  9,  7,  2,  6,  7,  4, 10,  2,  2,  2,
        3,  2,  2,  0,  2,  7,  2,  4,  3,  6,  6,  3,  4,  3,  5,  4,  1,
        3,  4,  4,  0,  1,  4,  0,  3,  1,  1,  1,  4,  1])>

In [73]:
accuracy_tensor = tf.cast(y_true == y_pred, tf.float32)
accuracy_tensor

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [74]:
mask = tf.cast(y_true != 0,tf.float32)
mask

<tf.Tensor: shape=(64, 17), dtype=float32, numpy=
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>

In [75]:
tf.reduce_sum(accuracy_tensor)/tf.reduce_sum(mask)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [76]:
def masked_loss(y_true,y_pred):
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="none")
    loss = loss_fn(y_true,y_pred)
    mask = tf.cast(y_true != 0,loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def masked_accuracy(y_true,y_pred):
    y_pred = tf.argmax(y_pred,axis=-1)
    y_pred = tf.cast(y_pred,y_true.dtype)
    accuracy = tf.cast(y_true == y_pred,tf.float32)
    mask = tf.cast(y_true != 0,tf.float32)
    return tf.reduce_sum(accuracy)/tf.reduce_sum(mask)

In [77]:
print(masked_loss(y_in,translator((en_in,es_in))))
print(masked_accuracy(y_in,translator((en_in,es_in))))

tf.Tensor(8.529123, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)


In [78]:
translator.compile(optimizer="adam",loss=masked_loss,metrics=[masked_accuracy,masked_loss])

In [79]:
translator.evaluate(valid_ds,steps=20,return_dict=True)

ValueError: ignored