<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/NMT_with_tpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os,warnings
from IPython.display import clear_output
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
!pip3 install -q -U "tensorflow-text==2.13.0"
!pip3 install -q -U einops
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tf_text
np.printoptions(precision=2)
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"
import einops
from zipfile import ZipFile
from typing import Any
# %xmode Minimal
tf.get_logger().setLevel("ERROR")
clear_output()

In [2]:

origin = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
file_path = keras.utils.get_file(fname="spa-eng.zip",origin=origin,extract=True)
with ZipFile(file_path,"r") as f:
    f.extractall("spa-eng")
with open("spa-eng/spa-eng/spa.txt","r") as f:
    text = f.read()
en_text,es_text = zip(*[line.split("\t") for line in text.splitlines()])
for en,es in zip(en_text[:10],es_text[:10]):
    print(en,"---->",es)

Go. ----> Ve.
Go. ----> Vete.
Go. ----> Vaya.
Go. ----> Váyase.
Hi. ----> Hola.
Run! ----> ¡Corre!
Run. ----> Corred.
Who? ----> ¿Quién?
Fire! ----> ¡Fuego!
Fire! ----> ¡Incendio!


In [3]:
def text_preprocess(sentence:str):
    sentence = tf_text.normalize_utf8(sentence,"NFKD")
    sentence = tf.strings.lower(sentence)
    sentence = tf.strings.regex_replace(sentence,r"[^ a-z.,!?¿]","")
    sentence = tf.strings.regex_replace(sentence,r"[.,!?¿]",r" \0 ")
    sentence = tf.strings.strip(sentence)
    sentence = tf.strings.join(["[START]",sentence,"[END]"],separator=" ")
    return sentence

In [4]:
def get_layers(vocab_size=5000):
    en_vec_layer = keras.layers.TextVectorization(max_tokens=vocab_size,standardize=text_preprocess,ragged=True)
    es_vec_layer = keras.layers.TextVectorization(max_tokens=vocab_size,standardize=text_preprocess,ragged=True)
    en_vec_layer.adapt(en_text)
    es_vec_layer.adapt(es_text)
    return en_vec_layer,es_vec_layer

In [5]:
en_vec_layer,es_vec_layer = get_layers(vocab_size=5000)

In [6]:
def preprocess(en_inputs,es_inputs):
    en_inputs = en_vec_layer(en_inputs).to_tensor()
    es_inputs = es_vec_layer(es_inputs).to_tensor()
    return (en_inputs,es_inputs[:,:-1]),es_inputs[:,1:]

In [7]:
AUTO = tf.data.AUTOTUNE

all_indices = np.random.uniform(size=len(en_text))
train_indices = all_indices <= 0.8
valid_indices = (all_indices > 0.8) & (all_indices <= 0.97)
test_indices = all_indices > 0.97
en_text = np.array(en_text)
es_text = np.array(es_text)
batch_size = 64
train_size = len(train_indices)
valid_size = len(valid_indices)
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((en_text[train_indices],es_text[train_indices]))
    .shuffle(len(en_text))
    .batch(batch_size)
    .map(preprocess)
    .prefetch(AUTO)
)
valid_ds = (
    tf.data.Dataset
    .from_tensor_slices((en_text[valid_indices],es_text[valid_indices]))
    .shuffle(len(en_text))
    .batch(batch_size)
    .map(preprocess)
    .prefetch(AUTO)
)
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((en_text[test_indices],es_text[test_indices]))
    .shuffle(len(en_text))
    .batch(batch_size)
    .map(preprocess)
    .prefetch(AUTO)
)

In [8]:
class Encoder(keras.layers.Layer):

    def __init__(self,vec_layer: keras.layers.TextVectorization=en_vec_layer,units: int=256,**kwargs):

        super(Encoder,self).__init__(**kwargs)
        self.units = units
        self.vec_layer = vec_layer
        self.vocab_size = vec_layer.vocabulary_size()

        self.embedder = keras.layers.Embedding(self.vocab_size,units,mask_zero=True)
        self.encoder_unit = keras.layers.Bidirectional(keras.layers.LSTM(units,return_state=True,return_sequences=True,recurrent_initializer="glorot_uniform"),merge_mode="sum")

    def call(self,encoder_inputs):

        encoder_embedded_outputs = self.embedder(encoder_inputs)

        encoder_outputs,*self.encoder_state = self.encoder_unit(encoder_embedded_outputs)

        return encoder_outputs

In [9]:
encoder = Encoder()

for en_in in train_ds.map(lambda x,y:x[0]).take(1):
    print(encoder(en_in).shape)

for en_in in valid_ds.map(lambda x,y:x[0]).take(1):
    print(encoder(en_in).shape)

(64, 22, 256)
(64, 15, 256)


In [10]:
class Decoder(keras.layers.Layer):

    def __init__(self,vec_layer:keras.layers.TextVectorization=es_vec_layer,units:int=256,**kwargs):

        super(Decoder,self).__init__(**kwargs)
        self.units = units
        self.vec_layer = vec_layer
        self.vocab_size = vec_layer.vocabulary_size()

        self.embedder = keras.layers.Embedding(self.vocab_size,units,mask_zero=True)
        self.decoder_unit = keras.layers.LSTM(units,return_state=True,return_sequences=True,recurrent_initializer="glorot_uniform")


    def call(self,decoder_inputs,decoder_initial_state=None):


        decoder_embedded_outputs = self.embedder(decoder_inputs)

        decoder_outputs,*self.decoder_state = self.decoder_unit(decoder_embedded_outputs,initial_state=decoder_initial_state)

        return decoder_outputs

In [11]:
class CrossAttention(keras.layers.Layer):

    def __init__(self,units=256,**kwargs):

        super(CrossAttention,self).__init__(**kwargs)

        self.mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=units)
        self.add = keras.layers.Add()
        self.layer_norm = keras.layers.LayerNormalization()

    def call(self,encoder_outputs,decoder_outputs):


        attention_outputs,attention_scores = self.mha(query=decoder_outputs,value=encoder_outputs,return_attention_scores=True)
        self.attention_scores = tf.reduce_mean(attention_scores,axis=1)
        normalized_attention_outputs = self.layer_norm(self.add([attention_outputs,decoder_outputs]))

        return normalized_attention_outputs

In [12]:
class Translator(keras.Model):
    @classmethod
    def add_method(cls,func):
        setattr(cls,func.__name__,func)
        return func

    def __init__(self,input_vec_layer=en_vec_layer,output_vec_layer=es_vec_layer,units=256,**kwargs):

        super(Translator,self).__init__(**kwargs)

        self.encoder_layer = Encoder(units=units,vec_layer=input_vec_layer)
        self.decoder_layer = Decoder(units=units,vec_layer=output_vec_layer)
        self.attention_layer = CrossAttention(units=units)

        self.words_to_ids = keras.layers.StringLookup(
            vocabulary=self.decoder_layer.vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token=""
        )
        self.ids_to_words = keras.layers.StringLookup(
            vocabulary=self.decoder_layer.vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token="",
            invert=True
        )
        self.start_token = self.words_to_ids(["[START]"])
        self.end_token = self.words_to_ids(["[END]"])

        self.out = keras.layers.Dense(self.decoder_layer.vec_layer.vocabulary_size())


    def call(self,inputs,decoder_initial_state=None):



        encoder_inputs,decoder_inputs = inputs

        encoder_outputs = self.encoder_layer(encoder_inputs)

        decoder_outputs = self.decoder_layer(decoder_inputs,decoder_initial_state)

        attention_outputs = self.attention_layer(encoder_outputs,decoder_outputs)

        total_outputs = self.out(attention_outputs)

        try:
            del total_outputs._keras_mask

        except AttributeError:

            pass

        return total_outputs


In [13]:
model = Translator()

In [14]:
BATCH_SIZE = 64
UNITS = 256
train_steps = train_size//BATCH_SIZE
valid_steps = valid_size//BATCH_SIZE

In [15]:
def custom_loss(y_true,y_pred):

    '''
        y_pred will be [batch sequence vocab_size]
        y_true will be [batch sequence]
        as the sequence contains zeros we only use the non-zero part of the sequence so we will mask it
    '''
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="none")
    loss = loss_fn(y_true,y_pred) # tf.float32
    mask = tf.cast(y_true != 0,loss.dtype) # tf.float32
    loss *= mask # reducing the effective output scale
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)


def custom_metric(y_true,y_pred):

    '''
        y_pred will be [batch sequence vocab_size]             with dtype = tf.float32
        y_true will be [batch sequence]                        with dtype = tf.int64
        as the sequence also has zeros we use masked accuracy
    '''

    y_pred = tf.cast(tf.argmax(y_pred,-1),y_true.dtype) # tf.int64
    mask = tf.cast(y_true != 0,tf.float32) # tf.float32
    accuracy = tf.cast(y_pred == y_true,tf.float32) # tf.float32
    return tf.reduce_sum(accuracy)/tf.reduce_sum(mask) # tf.float32


In [16]:
model.compile(loss=custom_loss,optimizer=keras.optimizers.Adam(),metrics=[custom_metric,custom_loss],steps_per_execution=20)

In [17]:
early_stop = keras.callbacks.EarlyStopping(patience=15,monitor='val_custom_loss',restore_best_weights=True)
history = model.fit(
    train_ds,
    epochs=100,
    validation_data=valid_ds,
    callbacks=[early_stop]
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [18]:
history.history.keys()

dict_keys(['loss', 'custom_metric', 'custom_loss', 'val_loss', 'val_custom_metric', 'val_custom_loss'])

In [19]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=model.history.history['custom_loss'],mode="lines"))
fig.add_trace(go.Scatter(y=model.history.history['val_custom_loss'],mode="lines"))
fig.update_layout(title="Loss Train v/s Validation")
fig.show()

In [20]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=model.history.history['custom_metric'],mode="lines"))
fig.add_trace(go.Scatter(y=model.history.history['val_custom_metric'],mode="lines"))
fig.update_layout(title="MaskedAccuracy Train v/s Validation")
fig.update_yaxes(range=[0,1])
fig.show()

In [21]:
@Translator.add_method
def text_to_encoder_outputs(self,texts):
    texts = tf.convert_to_tensor(texts)
    en_vec_outputs = self.encoder_layer.vec_layer(texts).to_tensor()
    return self.encoder_layer(en_vec_outputs)

In [22]:
@Translator.add_method
def get_decoder_initial_state(self,encoder_outputs):
    batch_size = encoder_outputs.shape[0]
    start_tokens = tf.fill(dims=[batch_size,1],value=self.start_token)
    done = tf.zeros(shape=[batch_size,1],dtype=tf.bool)
    embedding = self.decoder_layer.embedder(start_tokens)
    return start_tokens,done,self.decoder_layer.decoder_unit.get_initial_state(embedding)

In [23]:
@Translator.add_method
def get_next_token(self,encoder_inputs,next_token,done,state,temperature=0.0):
    total_out = self((encoder_inputs,next_token),state)

    if temperature:
        scaled_total_out = total_out[:,-1,:]/temperature
        next_token = tf.random.categorical(scaled_total_out,num_samples=1)
    else:
        next_token = tf.argmax(total_out,axis=-1)

    done = done | (next_token == self.end_token)
    next_token = tf.where(done,tf.constant(0,tf.int64),next_token)
    return next_token,done,self.decoder_layer.decoder_state

In [24]:
@Translator.add_method
def tokens_to_text(self,tokens):
    texts = self.ids_to_words(tokens)
    texts = tf.strings.reduce_join(texts,separator=" ",axis=-1)
    texts = tf.strings.regex_replace(texts,r"^ *\[START\]* ","")
    texts = tf.strings.regex_replace(texts,r" *\[END]\ *$","")
    texts = tf.strings.strip(texts)
    return texts

In [38]:
@Translator.add_method
def translate(self,text,temperature=1):
    preprocessed_text = self.encoder_layer.vec_layer(text).to_tensor()
    encoder_outputs = self.encoder_layer(preprocessed_text)
    next_token,done,state = self.get_decoder_initial_state(encoder_outputs)
    tokens_list = []

    for i in range(10):
        next_token,done,state = self.get_next_token(preprocessed_text,next_token,done,state,temperature)
        tokens_list.append(next_token)

    string_obj = self.tokens_to_text(tf.concat(tokens_list,axis=-1))
    return tf.strings.regex_replace(string_obj,r" *\[UNK]\ *$","").numpy()[0].decode()

In [26]:
'''Testing on a random inputs'''
model.translate(["Hey! are you still there?"])

'eh ! ¿ estas todavia ?'

In [27]:
train_result = model.evaluate(train_ds,return_dict=True)
train_result



{'loss': 0.5834096670150757,
 'custom_metric': 0.844276487827301,
 'custom_loss': 0.5832920074462891}

In [28]:
valid_result = model.evaluate(valid_ds,return_dict=True)
valid_result



{'loss': 1.1259492635726929,
 'custom_metric': 0.7554615139961243,
 'custom_loss': 1.126450538635254}

In [29]:
test_result = model.evaluate(test_ds,return_dict=True)
test_result



{'loss': 1.1146509647369385,
 'custom_metric': 0.7564138174057007,
 'custom_loss': 1.1154146194458008}

In [33]:
en_text[test_indices][:10]

array(['Call me.', 'Get out.', 'Get out.', "It's me!", 'Go on in.',
       'I agreed.', 'I can go.', "I'm fine.", "I'm poor.", 'Leave it.'],
      dtype='<U247')

In [39]:
for en,es in zip(en_text[test_indices][:10],es_text[test_indices][:10]):
    print(f"Original: {es}     Translated: {model.translate([en])}")

Original: Llámame.     Translated: me [UNK] .
Original: Salid.     Translated: sal .
Original: Salgan.     Translated: baje .
Original: Soy yo.     Translated: es mio !
Original: Entrad.     Translated: vaya adentro .
Original: Accedí.     Translated: estoy de acuerdo .
Original: Puedo ir.     Translated: puedo ir .
Original: Estoy perfectamente.     Translated: estoy perfectamente .
Original: Soy pobre.     Translated: soy pobre .
Original: Déjalo.     Translated: dejalo .


In [40]:
model.translate(["Hello, How are you doing"])

'hola como [UNK] .'