<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/NMT_with_attention_google.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install -q -U "tensorflow-text==2.12.0"
!pip3 install einops
import os,warnings
from IPython.display import clear_output
# Silence TensorFlow's native (C++) log output. The variable TensorFlow reads is
# TF_CPP_MIN_LOG_LEVEL, and it has to be set before `import tensorflow` runs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
import typing
from zipfile import ZipFile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import einops
pio.templates.default = "plotly_dark"
import numpy as np
clear_output()

In [None]:
class ShapeCheck():
    """Remember the size first seen for every named axis and reject later disagreements.

    Call the instance with a tensor and an einops-style axis-name string; the
    sizes are read with ``einops.parse_shape`` and compared against the sizes
    stored in ``self.shapes`` by earlier calls.

    NOTE(review): sizes are compared with ``!=`` and tested with ``if``; whether
    that works for symbolic dimensions inside a traced function is not
    established here — confirm before calling it outside eager code.
    """

    def __init__(self):
        # axis name -> size recorded the first time that name was checked
        self.shapes = {}

    def __call__(self,tensor,names,broadcast=False):
        """Validate ``tensor`` against the axis sizes recorded so far.

        Args:
            tensor: object whose shape is read by ``einops.parse_shape``.
            names: axis-name pattern handed to ``einops.parse_shape``.
            broadcast: when true, an axis of size 1 is accepted as-is and is
                neither compared nor recorded.

        Raises:
            ValueError: an axis has a different size than the one recorded.
        """
        for axis_name,axis_size in einops.parse_shape(tensor,names).items():

            if broadcast and axis_size == 1:
                continue

            recorded_size = self.shapes.get(axis_name)

            if recorded_size is None:
                # First sighting of this axis name: remember its size.
                self.shapes[axis_name] = axis_size
            elif axis_size != recorded_size:
                raise ValueError(f"shape mismatch for dimension: '{axis_name}' found: {axis_size} expected: {recorded_size}")

In [None]:
# Location of the spa-eng sentence-pair archive. Fetched over HTTPS because the
# downloaded zip is unpacked locally, so it should not be modifiable in transit.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

In [None]:
# Download (or reuse the cached copy of) the archive. Extraction is done
# explicitly below, so get_file is not asked to extract as well: that would
# unpack the archive twice, and ZipFile needs the path of the zip itself.
file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url)

with ZipFile(file_path,"r") as f:

    f.extractall("spa-eng")

# The corpus contains accented characters; decode it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open("spa-eng/spa-eng/spa.txt","r",encoding="utf-8") as f:

    # One sentence pair per line, the two columns separated by a tab.
    total_text = [line.split("\t") for line in f.read().splitlines()]

# Transpose the list of pairs into two parallel tuples.
en_text,es_text = zip(*total_text)

In [None]:
# Show the last English sentence in the file.
en_text[-1]

'If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.'

In [None]:
# Show the sentence paired with it in the second column of the file.
es_text[-1]

'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'

In [None]:
# NumPy copies of both sentence tuples, so they can be indexed with a boolean mask.
en_array,es_array = (np.array(sentences) for sentences in (en_text,es_text))

In [None]:
# A fixed seed keeps the train/validation membership (and hence the vocabularies
# later adapted on raw_train) identical on every Restart & Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
BATCH_SIZE = 16*8

split_rng = np.random.default_rng(SPLIT_SEED)

# Boolean mask: True marks a sentence pair that goes to the training split.
is_train = split_rng.uniform(size=(len(en_array),)) < TRAIN_FRACTION

raw_train = (
    tf.data.Dataset
    .from_tensor_slices((en_array[is_train],es_array[is_train]))
    .shuffle(len(en_text),seed=SPLIT_SEED)
    .batch(BATCH_SIZE)
)
raw_valid = (
    tf.data.Dataset
    .from_tensor_slices((en_array[~is_train],es_array[~is_train]))
    .shuffle(len(en_text),seed=SPLIT_SEED)
    .batch(BATCH_SIZE)
)

In [None]:
# Pull a single batch and print its first four sentence pairs.
# `en` and `es` stay bound after the loop; later cells reuse this batch.
# NOTE(review): the printed label says "latin", but the pairs are read from
# spa.txt in the spa-eng archive — presumably Spanish is meant.
for en,es in raw_train.take(1):
    print(en[:4])
    print("translates to latin as ")
    print(es[:4])

tf.Tensor(
[b'He knows many folk dances.' b'I accept your apologies.'
 b"I'm good at mathematics." b"They're with me."], shape=(4,), dtype=string)
translates to latin as 
tf.Tensor(
[b'\xc3\x89l conoce muchos bailes folcl\xc3\xb3ricos.'
 b'Acepto sus disculpas.' b'Se me dan bien las matem\xc3\xa1ticas.'
 b'Ellos est\xc3\xa1n conmigo.'], shape=(4,), dtype=string)


# Standardize Text

In [None]:
# First ten raw sentences of the second column; used as the running example below.
es_text[:10]

('Ve.',
 'Vete.',
 'Vaya.',
 'Váyase.',
 'Hola.',
 '¡Corre!',
 'Corred.',
 '¿Quién?',
 '¡Fuego!',
 '¡Incendio!')

In [None]:
# The same ten sentences as a string tensor (shown as UTF-8 encoded bytes).
tf.constant(es_text[:10])

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'V\xc3\xa1yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQui\xc3\xa9n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [None]:
# NFKD normalization splits an accented character into its base letter followed
# by a separate combining mark, so a later step can strip the mark and keep the letter.
temp_text = text.normalize_utf8(es_text[:10],"NFKD")
temp_text

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'Va\xcc\x81yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQuie\xcc\x81n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [None]:
# Lower-case the normalized text so the a-z filter in the next step keeps every letter.
temp_text_1 = tf.strings.lower(temp_text)
temp_text_1

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'va\xcc\x81yase.', b'hola.',
       b'\xc2\xa1corre!', b'corred.', b'\xc2\xbfquie\xcc\x81n?',
       b'\xc2\xa1fuego!', b'\xc2\xa1incendio!'], dtype=object)>

In [None]:
# [^...] is a negated character class: every character that is NOT a space, a
# letter a-z or one of . ? ! , ¿ is replaced with the empty string. This also
# removes the combining accent marks produced by the NFKD step.
# NOTE(review): '¡' is not in this keep-list, whereas the pattern used in
# text_preprocessor keeps it.
temp_text_2 = tf.strings.regex_replace(temp_text_1,"[^ a-z.?!,¿]","")
temp_text_2

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'vayase.', b'hola.', b'corre!',
       b'corred.', b'\xc2\xbfquien?', b'fuego!', b'incendio!'],
      dtype=object)>

In [None]:
# Put a space on both sides of every punctuation mark so that it becomes a
# separate token. In the raw-string replacement r' \0 ', \0 stands for the
# matched text itself (it is not a null character).
temp_text_3 = tf.strings.regex_replace(temp_text_2,"[.¡¿,?!]",r' \0 ')
temp_text_3

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've . ', b'vete . ', b'vaya . ', b'vayase . ', b'hola . ',
       b'corre ! ', b'corred . ', b' \xc2\xbf quien ? ', b'fuego ! ',
       b'incendio ! '], dtype=object)>

In [None]:
# Trim the leading/trailing spaces introduced by the punctuation padding.
temp_text_4= tf.strings.strip(temp_text_3)
temp_text_4

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've .', b'vete .', b'vaya .', b'vayase .', b'hola .', b'corre !',
       b'corred .', b'\xc2\xbf quien ?', b'fuego !', b'incendio !'],
      dtype=object)>

In [None]:
# Wrap every sentence in explicit start and end markers, separated by single spaces.
temp_text_5 = tf.strings.join(['[startofsequence]',temp_text_4,'[endofsequence]'],separator=" ")
temp_text_5

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'[startofsequence] ve . [endofsequence]',
       b'[startofsequence] vete . [endofsequence]',
       b'[startofsequence] vaya . [endofsequence]',
       b'[startofsequence] vayase . [endofsequence]',
       b'[startofsequence] hola . [endofsequence]',
       b'[startofsequence] corre ! [endofsequence]',
       b'[startofsequence] corred . [endofsequence]',
       b'[startofsequence] \xc2\xbf quien ? [endofsequence]',
       b'[startofsequence] fuego ! [endofsequence]',
       b'[startofsequence] incendio ! [endofsequence]'], dtype=object)>

In [None]:
def text_preprocessor(input_text):
    """Standardize raw sentences before they are split into tokens.

    Steps, in order: NFKD-normalize, lower-case, delete every character that is
    not a space, a-z or one of ``? . ! ¿ ¡ ,``, surround each of those
    punctuation marks with spaces, strip outer whitespace, and finally wrap the
    result in ``[startofsequence]`` / ``[endofsequence]`` markers.

    Args:
        input_text: string tensor of sentences.

    Returns:
        String tensor holding the standardized sentences.
    """
    lowered = tf.strings.lower(text.normalize_utf8(input_text,"NFKD"))
    # Whatever is outside the keep-list (digits, combining accents, ...) is dropped.
    kept_chars = tf.strings.regex_replace(lowered,"[^ a-z?.!¿¡,]","")
    # \0 is the matched punctuation mark itself, re-emitted with a space on each side.
    spaced_punct = tf.strings.regex_replace(kept_chars,"[?.!¿¡,]",r" \0 ")
    trimmed = tf.strings.strip(spaced_punct)
    return tf.strings.join(["[startofsequence]",trimmed,"[endofsequence]"],separator=" ")

# Text Vectorization of En and Es

In [None]:
vocab_size = 5000

# One vectorizer per language; both share the custom standardizer and emit
# ragged (unpadded) id sequences.
en_vec_layer = keras.layers.TextVectorization(max_tokens=vocab_size,standardize=text_preprocessor,ragged=True)
es_vec_layer = keras.layers.TextVectorization(max_tokens=vocab_size,standardize=text_preprocessor,ragged=True)

# Each vocabulary is built from its own column of the training split only.
en_vec_layer.adapt(raw_train.map(lambda en_batch,es_batch:en_batch))
es_vec_layer.adapt(raw_train.map(lambda en_batch,es_batch:es_batch))

In [None]:
# First ten vocabulary entries of each vectorizer, English first.
for vec_layer in (en_vec_layer,es_vec_layer):
    print(vec_layer.get_vocabulary()[:10])

['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'the', 'i', 'to', 'you', 'tom']
['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'que', 'de', 'el', 'a', 'no']


In [None]:
# Show the first four English sentences of the sample batch next to their token ids.
# `en_vectorized_out` stays bound to the last (4th) row after the loop and is
# read again by a later cell.
for english_text,en_vectorized_out in zip(en.numpy()[:4],en_vec_layer(en[:4])):
    print(english_text,"---->",en_vectorized_out)

b'He knows many folk dances.' ----> tf.Tensor([   2   13  254  123    1 4647    4    3], shape=(8,), dtype=int64)
b'I accept your apologies.' ----> tf.Tensor([  2   6 874  33   1   4   3], shape=(7,), dtype=int64)
b"I'm good at mathematics." ----> tf.Tensor([   2   38   83   42 1719    4    3], shape=(7,), dtype=int64)
b"They're with me." ----> tf.Tensor([  2 287  37  21   4   3], shape=(6,), dtype=int64)


In [None]:
# Same display for the `es` half of the sample batch.
# `es_vectorized_out` stays bound to the last (4th) row after the loop and is
# read again by a later cell.
for latin_text,es_vectorized_out in zip(es.numpy()[:4],es_vec_layer(es[:4])):
    print(latin_text,"---->",es_vectorized_out)

b'\xc3\x89l conoce muchos bailes folcl\xc3\xb3ricos.' ----> tf.Tensor([  2   7 583 212   1   1   4   3], shape=(8,), dtype=int64)
b'Acepto sus disculpas.' ----> tf.Tensor([   2 1463   87 2069    4    3], shape=(6,), dtype=int64)
b'Se me dan bien las matem\xc3\xa1ticas.' ----> tf.Tensor([   2   17   18 1332   74   33 1067    4    3], shape=(9,), dtype=int64)
b'Ellos est\xc3\xa1n conmigo.' ----> tf.Tensor([  2  79 101 221   4   3], shape=(6,), dtype=int64)


In [None]:
# Vocabularies as NumPy arrays so an array of token ids can index them directly.
en_vocab,es_vocab = (
    np.array(vec_layer.get_vocabulary())
    for vec_layer in (en_vec_layer,es_vec_layer)
)

In [None]:
# Map token ids back to words by indexing the vocabulary arrays.
# `en_vectorized_out` / `es_vectorized_out` are the loop variables left over from
# the two display loops above, i.e. the last sentence pair those loops printed.
print(" ".join(en_vocab[en_vectorized_out.numpy()]))
print(" ".join(es_vocab[es_vectorized_out.numpy()]))

[startofsequence] theyre with me . [endofsequence]
[startofsequence] ellos estan conmigo . [endofsequence]


In [None]:
# Vectorize the whole sample batch (ragged output, one row per sentence).
en_vec_out,es_vec_out = en_vec_layer(en),es_vec_layer(es)

In [None]:
# Dense, zero-padded token ids of the English batch and the matching 0/1 mask
# (1 wherever the id is non-zero).
en_padded = en_vec_out.to_tensor().numpy()
en_mask = (en_padded != 0).astype(np.int32)

fig = make_subplots(cols=2,subplot_titles=["Unmasked","Masked"])
for col_idx,grid in enumerate((en_padded,en_mask),start=1):
    fig.add_trace(go.Heatmap(z=grid),row=1,col=col_idx)
fig.show()

In [None]:
def preprocess_dataset(en,es):
    """Turn a batch of raw sentence pairs into teacher-forcing training tensors.

    Args:
        en: batch of source sentences, vectorized with ``en_vec_layer``.
        es: batch of target sentences, vectorized with ``es_vec_layer``.

    Returns:
        ``((X_train, X_dec_train), y_train)`` where ``X_train`` holds the padded
        source token ids, ``X_dec_train`` is the target sequence without its
        last token (the decoder input) and ``y_train`` is the target sequence
        without its first token (the decoder input shifted left by one step).
    """
    X_train = en_vec_layer(en).to_tensor()
    X_dec = es_vec_layer(es)
    # Slice while the batch is still ragged, so the token removed is each
    # sentence's own last/first token rather than a padding position.
    X_dec_train = X_dec[:,:-1].to_tensor()
    y_train = X_dec[:,1:].to_tensor()

    # Hand back the shifted, padded decoder input. Returning the full ragged
    # X_dec would feed the end token to the decoder and would not line up with y_train.
    return (X_train,X_dec_train),y_train

In [None]:
# Vectorize both splits lazily; tf.data chooses the parallelism of the map.
train_ds = raw_train.map(preprocess_dataset,num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = raw_valid.map(preprocess_dataset,num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Inspect the first example of one preprocessed batch: the encoder input, the
# decoder input and the target sequence.
# The loop variables (X_train, X_dec_train, y_train) stay bound at module level
# after the loop finishes.
for (X_train,X_dec_train),y_train in train_ds.take(1):
    print("inputs:")
    print(X_train[0].numpy())
    print(X_dec_train[0].numpy())
    print("outputs")
    print(y_train[0].numpy())

inputs:
[  2  20   8  43   5 673   6 199  61  11   3   0   0   0   0   0   0   0
   0   0   0   0   0]
[   2   13  190   21    5 1529   51   12    3]
outputs
[  13  190   21    5 1529   51   12    3    0    0    0    0    0    0
    0    0    0    0]


In [None]:
class Encoder(keras.Model):

    def __init__(self,embed_size)