<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/NMT_with_attention_google.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [115]:
import os,warnings
warnings.filterwarnings("ignore")
from IPython.display import clear_output
# TensorFlow reads its native (C++) log verbosity from TF_CPP_MIN_LOG_LEVEL, not
# TF_MIN_LOG_LEVEL; it is set here, ahead of the `import tensorflow` on the next line.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
import typing
from zipfile import ZipFile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import einops
pio.templates.default = "plotly_dark"
import numpy as np
np.set_printoptions(precision=2)
tf.get_logger().setLevel("ERROR")
%xmode Minimal

Exception reporting mode: Minimal


In [116]:

class ShapeCheck():
    """Remembers the size of every named axis and checks later tensors against it.

    The first time an axis name is seen its size is cached in `self.shapes`;
    every later call that uses the same name must report the same size.
    """

    def __init__(self):

        # axis name -> size recorded the first time that name was seen
        self.shapes = {}

    def __call__(self,tensor,names,broadcast=False):
        """Check `tensor` against the cached axis sizes.

        Args:
            tensor: tensor whose shape is parsed with `einops.parse_shape`.
            names: space-separated axis names, one per dimension (e.g. "batch s units").
            broadcast: when True, axes of size 1 are skipped (neither cached nor compared).

        Raises:
            ValueError: if a named axis has a different size than previously recorded.
        """

        # Sizes are compared with plain Python `!=`, which needs concrete values.
        # Outside eager execution (e.g. while a layer's call is being traced into a
        # graph) dimensions may be unknown or symbolic, so checking is skipped.
        if not tf.executing_eagerly():
            return

        parsed = einops.parse_shape(tensor,names)

        for name,new_dim in parsed.items():

            old_dim = self.shapes.get(name,None)

            if broadcast and (new_dim == 1):
                continue

            if old_dim is None:

                # first sighting of this axis name: remember its size
                self.shapes[name] = new_dim
                continue

            if new_dim != old_dim:

                raise ValueError(f"shape mismatch for dimension: '{name}' found: {new_dim} expected: {old_dim}")

In [117]:
url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

In [118]:
# Download (and cache) the archive only. `extract=True` is intentionally not passed:
# the archive is unpacked explicitly below, so a second extraction is redundant, and
# `file_path` has to be the .zip itself for ZipFile to open it.
file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url)

with ZipFile(file_path,"r") as f:

    f.extractall("spa-eng")

# The corpus contains accented Spanish characters; read it as UTF-8 explicitly instead
# of relying on the platform's default encoding.
with open("spa-eng/spa-eng/spa.txt","r",encoding="utf-8") as f:

    total_text = f.read()

# Every line is "<english>\t<spanish>"; split into pairs, then transpose into two tuples.
total_text = [line.split("\t") for line in total_text.splitlines()]
en_text,es_text = zip(*total_text)

In [119]:
en_text[-1]

'If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.'

In [120]:
es_text[-1]

'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'

In [121]:
en_array = np.array(en_text)
es_array = np.array(es_text)

In [122]:
# Draw the split mask from a seeded generator so the same sentences land in train/valid
# on every run (the vocabularies adapted later depend on which rows are in training).
split_rng = np.random.default_rng(seed=42)
is_train = split_rng.uniform(size=(len(en_array),)) < 0.8

# Rows where the mask is True (drawn value < 0.8) form the training set, the rest validation.
raw_train = (
    tf.data.Dataset
    .from_tensor_slices((en_array[is_train],es_array[is_train]))
    .shuffle(len(en_text))
    .batch(64)
)
raw_valid = (
    tf.data.Dataset
    .from_tensor_slices((en_array[~is_train],es_array[~is_train]))
    .shuffle(len(en_text))
    .batch(64)
)

In [123]:
# Peek at one batch of raw sentence pairs. `en` and `es` stay bound to this batch
# after the loop and are reused by later cells.
for en,es in raw_train.take(1):
    print(en[:4])
    # the target language of this corpus is Spanish, not Latin
    print("translates to spanish as ")
    print(es[:4])

tf.Tensor(
[b'Tom put on a tie.' b'Due to bad weather, the plane was late.'
 b'Do you have your laptop with you?' b'You must be patient.'], shape=(4,), dtype=string)
translates to latin as 
tf.Tensor(
[b'Tom se puso una corbata.'
 b'El avi\xc3\xb3n se retras\xc3\xb3 a causa del mal clima.'
 b'\xc2\xbfLlevas tu port\xc3\xa1til?' b'Tienes que ser paciente.'], shape=(4,), dtype=string)


# Standardize Text

In [124]:
es_text[:10]

('Ve.',
 'Vete.',
 'Vaya.',
 'Váyase.',
 'Hola.',
 '¡Corre!',
 'Corred.',
 '¿Quién?',
 '¡Fuego!',
 '¡Incendio!')

In [125]:
tf.constant(es_text[:10]) # converting to tensor

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'V\xc3\xa1yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQui\xc3\xa9n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [126]:
# NFKD decomposes each accented letter into its base letter followed by a combining
# mark (e.g. "á" becomes "a" + U+0301), so a later step can drop the mark on its own.
temp_text = text.normalize_utf8(es_text[:10],"NFKD")
temp_text

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Ve.', b'Vete.', b'Vaya.', b'Va\xcc\x81yase.', b'Hola.',
       b'\xc2\xa1Corre!', b'Corred.', b'\xc2\xbfQuie\xcc\x81n?',
       b'\xc2\xa1Fuego!', b'\xc2\xa1Incendio!'], dtype=object)>

In [127]:
temp_text_1 = tf.strings.lower(temp_text) # Lower casing all the characters
temp_text_1

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'va\xcc\x81yase.', b'hola.',
       b'\xc2\xa1corre!', b'corred.', b'\xc2\xbfquie\xcc\x81n?',
       b'\xc2\xa1fuego!', b'\xc2\xa1incendio!'], dtype=object)>

In [128]:
# [^ ...] is a negated character class: every character that is NOT a space, a-z or
# one of . ? ! , ¿ is replaced with the empty string (deleted). "¡" is not in the
# keep-list here, so it is removed as well.
temp_text_2 = tf.strings.regex_replace(temp_text_1,"[^ a-z.?!,¿]","")
temp_text_2

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've.', b'vete.', b'vaya.', b'vayase.', b'hola.', b'corre!',
       b'corred.', b'\xc2\xbfquien?', b'fuego!', b'incendio!'],
      dtype=object)>

In [129]:
# In the rewrite string \0 stands for the whole match (the raw string r'' keeps the
# backslash literal), so every punctuation mark is re-inserted with a space on each side.
temp_text_3 = tf.strings.regex_replace(temp_text_2,"[.¡¿,?!]",r' \0 ')
temp_text_3

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've . ', b'vete . ', b'vaya . ', b'vayase . ', b'hola . ',
       b'corre ! ', b'corred . ', b' \xc2\xbf quien ? ', b'fuego ! ',
       b'incendio ! '], dtype=object)>

In [130]:
temp_text_4= tf.strings.strip(temp_text_3) # stripping any extra spaces
temp_text_4

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b've .', b'vete .', b'vaya .', b'vayase .', b'hola .', b'corre !',
       b'corred .', b'\xc2\xbf quien ?', b'fuego !', b'incendio !'],
      dtype=object)>

In [131]:
temp_text_5 = tf.strings.join(['[startofsequence]',temp_text_4,'[endofsequence]'],separator=" ")
temp_text_5

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'[startofsequence] ve . [endofsequence]',
       b'[startofsequence] vete . [endofsequence]',
       b'[startofsequence] vaya . [endofsequence]',
       b'[startofsequence] vayase . [endofsequence]',
       b'[startofsequence] hola . [endofsequence]',
       b'[startofsequence] corre ! [endofsequence]',
       b'[startofsequence] corred . [endofsequence]',
       b'[startofsequence] \xc2\xbf quien ? [endofsequence]',
       b'[startofsequence] fuego ! [endofsequence]',
       b'[startofsequence] incendio ! [endofsequence]'], dtype=object)>

In [132]:
def text_preprocessor(input_text):
    """Standardize raw sentences before they are tokenized.

    Steps, in order: NFKD-normalize, lower-case, delete every character that is
    not a space, a-z or one of ? . ! ¿ ¡ , then surround each of those punctuation
    marks with spaces, strip the ends and wrap the result in the
    [startofsequence] / [endofsequence] markers.

    Args:
        input_text: string tensor of sentences.

    Returns:
        String tensor with the standardized sentences.
    """
    decomposed = text.normalize_utf8(input_text,"NFKD")
    lowered = tf.strings.lower(decomposed)

    # drop everything outside the keep-list (this also removes the combining accents)
    kept = tf.strings.regex_replace(lowered,"[^ a-z?.!¿¡,]","")

    # \0 is the whole match: each punctuation mark becomes its own space-separated token
    spaced = tf.strings.regex_replace(kept,"[?.!¿¡,]",r" \0 ")
    trimmed = tf.strings.strip(spaced)

    return tf.strings.join(["[startofsequence]",trimmed,"[endofsequence]"],separator=" ")

# Text Vectorization of En and Es

In [133]:
vocab_size = 5000

# One vectorizer per language; both use the custom standardizer and return ragged
# (unpadded) token-id rows.
en_vec_layer = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=text_preprocessor,
    ragged=True,
)
es_vec_layer = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=text_preprocessor,
    ragged=True,
)

# Vocabularies are learned from the training split only: English is the first
# element of each pair, Spanish the second.
en_vec_layer.adapt(raw_train.map(lambda en_batch,es_batch:en_batch))
es_vec_layer.adapt(raw_train.map(lambda en_batch,es_batch:es_batch))

In [134]:
print(en_vec_layer.get_vocabulary()[:10])
print(es_vec_layer.get_vocabulary()[:10])

['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'the', 'i', 'to', 'you', 'tom']
['', '[UNK]', '[startofsequence]', '[endofsequence]', '.', 'que', 'de', 'el', 'a', 'no']


In [135]:
for english_text,en_vectorized_out in zip(en.numpy()[:4],en_vec_layer(en[:4])):
    print(english_text,"---->",en_vectorized_out)

b'Tom put on a tie.' ----> tf.Tensor([  2   9 183  34  10 857   4   3], shape=(8,), dtype=int64)
b'Due to bad weather, the plane was late.' ----> tf.Tensor([   2 1605    7  252  539   19    5  606   18  230    4    3], shape=(12,), dtype=int64)
b'Do you have your laptop with you?' ----> tf.Tensor([   2   20    8   21   33 2273   36    8   11    3], shape=(10,), dtype=int64)
b'You must be patient.' ----> tf.Tensor([  2   8 148  35 773   4   3], shape=(7,), dtype=int64)


In [136]:
# Show the first four raw Spanish sentences of the batch next to their token ids.
# NOTE: despite its name, `latin_text` holds Spanish text (it iterates over `es`).
for latin_text,es_vectorized_out in zip(es.numpy()[:4],es_vec_layer(es[:4])):
    print(latin_text,"---->",es_vectorized_out)

b'Tom se puso una corbata.' ----> tf.Tensor([   2   10   17  297   23 1076    4    3], shape=(8,), dtype=int64)
b'El avi\xc3\xb3n se retras\xc3\xb3 a causa del mal clima.' ----> tf.Tensor([   2    7  451   17 1423    8  872   46  230  871    4    3], shape=(12,), dtype=int64)
b'\xc2\xbfLlevas tu port\xc3\xa1til?' ----> tf.Tensor([   2   13 1611   36 3598   12    3], shape=(7,), dtype=int64)
b'Tienes que ser paciente.' ----> tf.Tensor([  2  93   5  83 818   4   3], shape=(7,), dtype=int64)


In [137]:
en_vocab = np.array(en_vec_layer.get_vocabulary())
es_vocab = np.array(es_vec_layer.get_vocabulary())

In [138]:
print(" ".join(en_vocab[en_vectorized_out.numpy()]))
print(" ".join(es_vocab[es_vectorized_out.numpy()]))

[startofsequence] you must be patient . [endofsequence]
[startofsequence] tienes que ser paciente . [endofsequence]


In [139]:
en_vec_out = en_vec_layer(en)
es_vec_out = es_vec_layer(es)

In [140]:
fig = make_subplots(cols=2,subplot_titles=["Unmasked","Masked"])
fig.add_trace(go.Heatmap(z=en_vec_out.to_tensor().numpy()),row=1,col=1)
fig.add_trace(go.Heatmap(z=np.array((en_vec_out.to_tensor() != 0).numpy(),dtype=np.int32)),row=1,col=2)
fig.show()

In [141]:
def preprocess_dataset(en,es):
    """Convert a batch of raw sentence pairs into dense token-id tensors.

    The Spanish tokens are used twice, offset by one position: without the last
    token as the decoder input, and without the first token as the target.

    Args:
        en: batch of raw English sentences.
        es: batch of raw Spanish sentences.

    Returns:
        ((encoder_input, decoder_input), target), each converted from ragged
        rows to a dense tensor with `to_tensor()`.
    """
    encoder_input = en_vec_layer(en).to_tensor()

    es_tokens = es_vec_layer(es)
    decoder_input = es_tokens[:,:-1].to_tensor()   # everything except the last token
    target = es_tokens[:,1:].to_tensor()           # everything except the first token

    return (encoder_input,decoder_input),target

In [142]:
train_ds = raw_train.map(preprocess_dataset,tf.data.AUTOTUNE)
valid_ds = raw_valid.map(preprocess_dataset,tf.data.AUTOTUNE)

In [143]:
for (en_in,es_in),es_out in train_ds.take(1):
    print(en_in.shape)
    print(es_in.shape)
    print(es_out.shape)

(64, 15)
(64, 14)
(64, 14)


# Encoder Class

- Embedding Layer
- GRU/LSTM Layer

In [144]:
vocab_size = len(en_vec_layer.get_vocabulary())
vocab_size

5000

In [145]:
embed_size = 256

In [146]:
encoder_embed_layer = keras.layers.Embedding(vocab_size,embed_size,mask_zero=True)
encoder = keras.layers.Bidirectional(
    keras.layers.LSTM(256,return_sequences=True,recurrent_initializer="glorot_uniform"),
    merge_mode="sum"
)

In [147]:
shape_checker = ShapeCheck()
shape_checker(en_in,"batch s")

In [148]:
shape_checker.shapes

{'batch': 64, 's': 15}

In [149]:
enc_embed_output = encoder_embed_layer(en_in)
enc_embed_output.shape

TensorShape([64, 15, 256])

In [150]:
shape_checker(enc_embed_output,"batch s units")
shape_checker.shapes

{'batch': 64, 's': 15, 'units': 256}

In [151]:
encoder_outputs = encoder(enc_embed_output)
encoder_outputs.shape

TensorShape([64, 15, 256])

In [152]:
shape_checker(encoder_outputs,"batch s units")

In [153]:
mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=256)

In [154]:
print(en_in.shape)
print(encoder_outputs.shape)

(64, 15)
(64, 15, 256)


In [155]:
len(en_vec_layer.get_vocabulary())

5000

In [156]:
class Encoder(keras.Model):
    """Embeds source token ids and runs them through a bidirectional LSTM.

    Args:
        units: size of the embedding vectors and of each LSTM direction.
        text_process_layer: vectorization layer whose `vocabulary_size()` sets the
            number of rows in the embedding table.
        **kwargs: forwarded to `keras.Model`.
    """

    def __init__(self,units=256,text_process_layer=en_vec_layer,**kwargs):

        super().__init__(**kwargs)

        source_vocab_size = text_process_layer.vocabulary_size()
        self.embed = keras.layers.Embedding(
            input_dim=source_vocab_size,
            output_dim=units,
            mask_zero=True,
        )

        # forward and backward outputs are summed, so the output keeps `units` features
        recurrent = keras.layers.LSTM(units,return_sequences=True,recurrent_initializer="glorot_uniform")
        self.Rnn = keras.layers.Bidirectional(layer=recurrent,merge_mode="sum")

    def call(self,inputs):
        """Encode a (batch, s) tensor of token ids into a (batch, s, units) sequence."""

        checker = ShapeCheck()
        checker(inputs,"batch s")

        embedded = self.embed(inputs)
        checker(embedded,"batch s units")

        encoded = self.Rnn(embedded)
        checker(encoded,"batch s units")

        return encoded


# CrossAttention

In [157]:
decoder_embed_layer = keras.layers.Embedding(es_vec_layer.vocabulary_size(),256,mask_zero=True)
decoder_embed_out = decoder_embed_layer(es_in)
decoder_embed_out.shape

TensorShape([64, 14, 256])

In [158]:
mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=256)
attention_output,attention_scores = mha(query=decoder_embed_out,value=encoder_outputs,return_attention_scores=True)
print(attention_output.shape)
print(attention_scores.shape)

(64, 14, 256)
(64, 1, 14, 15)


In [159]:
shape_checker = ShapeCheck()
shape_checker(decoder_embed_out,"batch t units")
shape_checker(encoder_outputs,"batch s units")

In [160]:
attention_scores = tf.reduce_mean(attention_scores,axis=1)
attention_scores.shape

TensorShape([64, 14, 15])

In [161]:
adding_layer = keras.layers.Add()
add_out = adding_layer([decoder_embed_out,attention_output])
add_out.shape

TensorShape([64, 14, 256])

In [162]:
layer_norm = keras.layers.LayerNormalization()
layer_out = layer_norm(add_out)
layer_out.shape

TensorShape([64, 14, 256])

In [163]:
class CrossAttention(keras.layers.Layer):
    """Single-head attention from the decoder sequence over the encoder sequence,
    followed by a residual add and layer normalization.

    After each call the attention weights, averaged over the heads axis, are
    available as `self.attention_scores` with axes (batch, t, s).

    Args:
        units: `key_dim` of the attention layer.
        **kwargs: forwarded to `keras.layers.Layer`.
    """

    def __init__(self,units=256,**kwargs):

        super().__init__(**kwargs)
        self.mha = keras.layers.MultiHeadAttention(num_heads=1,key_dim=units)
        self.add = keras.layers.Add()
        self.layer_norm = keras.layers.LayerNormalization()

    def call(self,decoder_out,encoder_out):
        """Attend from `decoder_out` (batch, t, units) to `encoder_out` (batch, s, units)."""

        checker = ShapeCheck()
        checker(decoder_out,"batch t units")
        checker(encoder_out,"batch s units")

        attended,per_head_scores = self.mha(
            query=decoder_out,
            value=encoder_out,
            return_attention_scores=True,
        )
        checker(attended,"batch t units")
        checker(per_head_scores,"batch heads t s")

        # keep the head-averaged weights around so they can be inspected / plotted
        self.attention_scores = tf.reduce_mean(per_head_scores,axis=1)

        # residual connection around the attention, then normalization
        residual = self.add([decoder_out,attended])
        return self.layer_norm(residual)

In [164]:
attention_layer = CrossAttention()

attention_out = attention_layer(decoder_embed_out,encoder_outputs)
attention_out.shape

TensorShape([64, 14, 256])

In [165]:
attention_layer.attention_scores.shape

TensorShape([64, 14, 15])

In [168]:
np.sum(attention_layer.attention_scores,axis=-1)[:5,:]

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]],
      dtype=float32)

In [183]:
# Left: attention weights of the first target position for every sentence in the batch.
# Right: padding mask of the SAME encoder batch (`en_in`) those weights were computed
# from. `en_vec_out` comes from a different shuffled batch, so its mask would not line
# up with the attention scores.
fig = make_subplots(cols=2,subplot_titles=["Attention Output","Masked Output"])
fig.add_trace(go.Heatmap(z=attention_layer.attention_scores[:,0,:]),row=1,col=1)
fig.add_trace(go.Heatmap(z=(en_in.numpy() != 0).astype(np.int32)),row=1,col=2)
fig.show()