In [1]:
import os,sys,warnings
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
from zipfile import ZipFile
from IPython.display import clear_output
# if "google.colab" in sys.modules:
#     !pip3 install -q -U "tensorflow-text==2.13.0"
#     !pip3 install -q -U einops
import numpy as np
import tensorflow as tf
from tensorflow import keras
import einops
import tensorflow_text as tftext
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
tf.get_logger().setLevel("ERROR")
pio.templates.default = "plotly_dark"
clear_output()

In [2]:
# Download location of the English/Spanish sentence-pair archive.
# Fetched over HTTPS so the archive cannot be tampered with in transit.
origin = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

In [3]:
from typing import Any


class ShapeCheck():
    """Remember the size of every named axis and verify later tensors agree.

    Each call parses ``tensor``'s shape against the space-separated axis
    ``names`` with ``einops.parse_shape``. The first time an axis name is
    seen its size is cached in ``self.shapes``; afterwards any tensor that
    uses the same name with a different size triggers an error.
    """

    def __init__(self):
        # Maps axis name -> size recorded the first time the name was seen.
        self.shapes = {}

    def __call__(self,tensor,names,**kwargs) -> None:
        """Check ``tensor`` against the cached axis sizes.

        Args:
            tensor: Object whose shape ``einops.parse_shape`` can parse.
            names: Space-separated axis names, one per dimension.
            **kwargs: Accepted for call compatibility; unused.

        Raises:
            AttributeError: If a named axis has a different size than the
                one cached for that name (type kept for existing callers).
        """
        # While a graph is being traced the sizes may be symbolic rather than
        # plain integers, so the comparison below is only meaningful eagerly.
        if not tf.executing_eagerly():
            return

        parsed_tensor = einops.parse_shape(tensor,names)
        for name,new_dim in parsed_tensor.items():
            # setdefault caches unseen axes and returns the cached size otherwise.
            old_dim = self.shapes.setdefault(name,new_dim)
            if new_dim != old_dim:
                raise AttributeError(f"\n\033[1mDIMENSION MISMATCH FOR AXIS '{name}' OF {tensor.shape}\033[0m\n\033[1mFOUND DIMENSION\033[0m ----->{new_dim}\n\033[1mINCOMPATIBLE WITH OLD DIMENSION\033[0m ---->{old_dim}")

In [4]:
# Demo: register three random tensors whose shared axis names agree in size.
shape_checker = ShapeCheck()
for demo_shape,axis_names in (
    ([64,17,256],"batch t units"),
    ([64,16,256],"batch s units"),
    ([64,17,16],"batch t s"),
):
    shape_checker(tf.random.normal(shape=demo_shape),axis_names)
# Uncomment to see a mismatch: here "s" would be 256 but 16 is already cached.
# shape_checker(tf.random.normal(shape=[64,17,256]),"batch t s")
shape_checker.shapes

{'batch': 64, 't': 17, 'units': 256, 's': 16}

In [5]:
# Location of the sentence-pair file once the archive is unpacked under ./spa-eng.
data_path = os.path.join("spa-eng","spa-eng","spa.txt")

# Download and unpack only when the file is missing, so the cell behaves the
# same on Colab and on a local kernel and does not redo work on re-runs.
if not os.path.exists(data_path):
    # get_file only downloads (and caches) the archive; extraction is done
    # explicitly below so the files always land in the directory read from.
    file_path = keras.utils.get_file(fname="spa-eng.zip",origin=origin)
    with ZipFile(file_path,"r") as f:
        f.extractall("spa-eng")

# Read explicitly as UTF-8: the Spanish side contains accented characters and
# the platform default encoding is not UTF-8 everywhere.
with open(data_path,"r",encoding="utf-8") as f:
    text = f.read()

# Each line holds an English sentence and its Spanish translation separated by a tab.
en_text,es_text = zip(*[line.split("\t") for line in text.splitlines()])
for en_in,es_in in zip(en_text[:10],es_text[:10]):
    print(en_in,"---->",es_in)

Go. ----> Ve.
Go. ----> Vete.
Go. ----> Vaya.
Go. ----> Váyase.
Hi. ----> Hola.
Run! ----> ¡Corre!
Run. ----> Corred.
Who? ----> ¿Quién?
Fire! ----> ¡Fuego!
Fire! ----> ¡Incendio!


In [6]:
# Convert both sentence tuples to NumPy arrays so they can be indexed with boolean masks.
en_text,es_text = np.array(en_text),np.array(es_text)

In [7]:
# Draw one uniform sample per sentence pair; pairs below 0.8 go to training.
indices = np.random.uniform(0,1,len(en_text))
train_indices = indices < 0.8
# Use the complement of the training mask so every pair lands in exactly one
# split (two separate `<` / `>` comparisons would drop a sample equal to 0.8).
test_indices = ~train_indices

# Training pairs: shuffled over the whole corpus, then grouped in batches of 64.
train_raw = (
    tf.data.Dataset
    .from_tensor_slices((en_text[train_indices],es_text[train_indices]))
    .shuffle(len(en_text))
    .batch(64)
)
# Held-out pairs, built the same way.
valid_raw = (
    tf.data.Dataset
    .from_tensor_slices((en_text[test_indices],es_text[test_indices]))
    .shuffle(len(en_text))
    .batch(64)
)

In [8]:
def text_preprocessing(sentence):
    """Standardize raw text and wrap it in start/end markers.

    Steps: NFKD-normalize, lowercase, drop every character outside
    ``[ a-z¿?!.,]``, surround the kept punctuation with spaces, strip the
    ends, then join ``[startofsequence]`` / ``[endofsequence]`` around it.
    """
    normalized = tf.strings.lower(tftext.normalize_utf8(sentence,"NFKD"))
    # Keep only spaces, ASCII letters and the listed punctuation marks.
    filtered = tf.strings.regex_replace(normalized,"[^ a-z¿?!.,]","")
    # Put a space on both sides of each punctuation mark (\0 is the whole match).
    spaced = tf.strings.regex_replace(filtered,"[.?!,¿]",r" \0 ")
    stripped = tf.strings.strip(spaced)
    return tf.strings.join(["[startofsequence]",stripped,"[endofsequence]"],separator=" ")

In [9]:
# Upper bound on the vocabulary kept by each vectorizer.
vocab_size = 5000

# Both languages share the same vectorizer settings; only the fitted text differs.
vectorizer_options = dict(
    max_tokens=vocab_size,
    standardize=text_preprocessing,
    ragged=True,
)
en_vec_layer = keras.layers.TextVectorization(**vectorizer_options)
es_vec_layer = keras.layers.TextVectorization(**vectorizer_options)

# Fit each vocabulary on its own language.
en_vec_layer.adapt(en_text)
es_vec_layer.adapt(es_text)

In [11]:
def preprocess(en,es):
    """Vectorize a batch of sentence pairs into ``((en_ids, es_in), es_out)``.

    ``es_in`` is the Spanish id sequence without its last token and
    ``es_out`` is the same sequence without its first token, so the two are
    offset by one position. All three results are dense (padded) tensors.
    """
    source_ids = en_vec_layer(en)
    target_ids = es_vec_layer(es)
    decoder_inputs = target_ids[:,:-1].to_tensor()
    decoder_targets = target_ids[:,1:].to_tensor()
    return (source_ids.to_tensor(),decoder_inputs),decoder_targets

In [12]:
# Show the first two rows of each tensor in one preprocessed batch.
sample_batches = train_raw.map(preprocess).take(1)
for i,y in sample_batches:
    for preview in (i[0],i[1],y):
        print(preview[:2])

tf.Tensor(
[[   2   27  165   22    1 1071  189  953  540    4    3    0    0    0
     0    0]
 [   2    6   70  204   44 1077    4    3    0    0    0    0    0    0
     0    0]], shape=(2, 16), dtype=int64)
tf.Tensor(
[[  2   9 830   5  40 105   6   1   1  14 510   1   4   0   0   0   0   0
    0]
 [  2   9  22  55 342  21 503 328   4   0   0   0   0   0   0   0   0   0
    0]], shape=(2, 19), dtype=int64)
tf.Tensor(
[[  9 830   5  40 105   6   1   1  14 510   1   4   3   0   0   0   0   0
    0]
 [  9  22  55 342  21 503 328   4   3   0   0   0   0   0   0   0   0   0
    0]], shape=(2, 19), dtype=int64)


In [13]:
# Apply the same vectorizing map to the raw training and validation batches.
train_ds,valid_ds = (raw_split.map(preprocess) for raw_split in (train_raw,valid_raw))

In [14]:
# Print the shapes of the encoder input, decoder input and target of one batch.
for (en_inputs,es_inputs),es_targets in train_ds.take(1):
    for batch in (en_inputs,es_inputs,es_targets):
        print(batch.shape)

(64, 16)
(64, 14)
(64, 14)


In [15]:
class Encoder(keras.layers.Layer):
    """Embeds token ids and runs them through a bidirectional LSTM.

    Args:
        units: Embedding width and LSTM size.
        vec_layer: Fitted text vectorizer; supplies the vocabulary size and
            is used by ``text_to_encoder_outputs`` to tokenize raw strings.
    """

    def __init__(self,units=256,vec_layer=en_vec_layer,**kwargs):
        super().__init__(**kwargs)
        self.vec_layer = vec_layer
        self.vocab_size = vec_layer.vocabulary_size()
        # mask_zero=True makes id 0 a masked (padding) position.
        self.Embedding = keras.layers.Embedding(self.vocab_size,output_dim=units,mask_zero=True)
        recurrent_layer = keras.layers.LSTM(units,return_state=True,return_sequences=True,recurrent_initializer="glorot_uniform")
        # merge_mode="sum" keeps the merged output at `units` features.
        self.encoder_unit = keras.layers.Bidirectional(recurrent_layer,merge_mode="sum")

    def call(self,inputs,return_state=False):
        """Encode a batch of token ids.

        Returns the per-step outputs, or ``(outputs, states)`` when
        ``return_state`` is true, where ``states`` is the list of every
        remaining value returned by the bidirectional layer.
        """
        check = ShapeCheck()
        check(inputs,"batch encoder_sequence")

        embedded = self.Embedding(inputs)
        check(embedded,"batch encoder_sequence units")

        sequence_outputs,*final_states = self.encoder_unit(embedded)
        check(sequence_outputs,"batch encoder_sequence units")

        return (sequence_outputs,final_states) if return_state else sequence_outputs

    def text_to_encoder_outputs(self,texts):
        """Tokenize raw string(s) with ``vec_layer`` and return the encoder outputs."""
        text_batch = tf.convert_to_tensor(texts)
        # A single string is promoted to a batch of one.
        if len(text_batch.shape) == 0:
            text_batch = text_batch[tf.newaxis]
        token_ids = self.vec_layer(text_batch).to_tensor()
        return self(token_ids)


In [16]:
# Smoke test: the encoder must run on every training and validation batch.

encoder = Encoder()

for (en_inputs,es_inputs),es_targets in train_ds:
    encoder(en_inputs)

for (en_inputs,es_inputs),es_targets in valid_ds:
    # The encoder is built on the English vocabulary, so it must be fed the
    # English ids here as well (the Spanish ids would run but test nothing).
    encoder(en_inputs)

# Also exercise the raw-text entry point.
_ = encoder.text_to_encoder_outputs(en_text[:10])

In [17]:
class CrossAttention(keras.layers.Layer):
    """Single-head attention from decoder steps over encoder outputs.

    The attention result is added back to the decoder outputs and
    layer-normalized. The scores of the most recent call are kept on
    ``self.attention_scores``.
    """

    def __init__(self,units=256,**kwargs):
        super().__init__(**kwargs)
        self.attention = keras.layers.MultiHeadAttention(num_heads=1,key_dim=units)
        self.add = keras.layers.Add()
        self.layer_norm = keras.layers.LayerNormalization(axis=-1)

    def call(self,encoder_outputs,decoder_outputs):
        """Attend with ``decoder_outputs`` as query and ``encoder_outputs`` as value."""
        check = ShapeCheck()
        check(encoder_outputs,"batch encoder_sequence units")
        check(decoder_outputs,"batch decoder_sequence units")

        attended,scores = self.attention(query=decoder_outputs,value=encoder_outputs,return_attention_scores=True)
        check(attended,"batch decoder_sequence units")

        # Average over axis 1 and keep the result for later inspection.
        self.attention_scores = tf.reduce_mean(scores,axis=1)
        check(self.attention_scores,"batch decoder_sequence encoder_sequence")

        residual = self.add([decoder_outputs,attended])
        return self.layer_norm(residual)

In [18]:
''' Testing Attention layer'''
attention_layer = CrossAttention()

# Random stand-ins for encoder outputs (17 steps) and decoder outputs (15 steps).
dummy_encoder_outputs = tf.random.normal(shape=[64,17,256])
dummy_decoder_outputs = tf.random.normal(shape=[64,15,256])
out = attention_layer(dummy_encoder_outputs,dummy_decoder_outputs)
out.shape

TensorShape([64, 15, 256])

In [19]:
# Scratch check: map the start marker to its id in the Spanish vocabulary and
# tile that id into a [64,1] column, one start token per batch row.
lookup = keras.layers.StringLookup(vocabulary=es_vec_layer.get_vocabulary(),mask_token="",oov_token="[UNK]")
start_token = lookup(["[startofsequence]"])
start_tokens = tf.fill([64,1],start_token)
start_tokens.shape

TensorShape([64, 1])

In [20]:
# Scratch check: embed the column of start tokens with a fresh 256-wide
# embedding. Despite its name, `initial_state` holds the embedded start tokens.
embedding = keras.layers.Embedding(es_vec_layer.vocabulary_size(),256,mask_zero=True)
initial_state = embedding(start_tokens)
initial_state.shape

TensorShape([64, 1, 256])

In [124]:
class Decoder(keras.layers.Layer):
    """LSTM decoder with cross-attention that emits per-step vocabulary logits.

    Args:
        units: Embedding width, LSTM size and attention key size.
        vec_layer: Fitted text vectorizer for the target language; supplies
            the vocabulary used by the id<->word lookups and the output size.
    """

    def __init__(self,units=256,vec_layer=es_vec_layer,**kwargs):

        super(Decoder,self).__init__(**kwargs)
        # Preprocessing layers: word<->id lookups sharing the vectorizer vocabulary.
        self.vec_layer = vec_layer
        self.words_to_ids = keras.layers.StringLookup(
            vocabulary=vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token=""
        )
        self.ids_to_words = keras.layers.StringLookup(
            vocabulary=vec_layer.get_vocabulary(),
            oov_token="[UNK]",
            mask_token="",
            invert=True
        )
        # Both are length-1 id vectors because the lookup is given a one-item list.
        self.start_token = self.words_to_ids(["[startofsequence]"])
        self.end_token = self.words_to_ids(["[endofsequence]"])

        # Decoder layers.
        self.embedding = keras.layers.Embedding(vec_layer.vocabulary_size(),units,mask_zero=True)
        self.decoder_unit = keras.layers.LSTM(units,return_state=True,return_sequences=True,recurrent_initializer="glorot_uniform")
        # Forward `units` so the attention size follows the decoder size
        # instead of silently staying at CrossAttention's default.
        self.attention = CrossAttention(units=units)
        self.out = keras.layers.Dense(vec_layer.vocabulary_size())

    def call(self,encoder_outputs,decoder_inputs,decoder_state=None,return_state=False):
        """Compute logits for every decoder position.

        Args:
            encoder_outputs: Encoder sequence, checked as (batch, encoder_sequence, units).
            decoder_inputs: Target token ids, checked as (batch, decoder_sequence).
            decoder_state: Optional LSTM initial state.
            return_state: When true, also return the LSTM's final states.

        Returns:
            The Dense-layer logits, or ``(logits, [state_0, state_1])`` when
            ``return_state`` is true; each state is checked as (batch, units).
        """
        shape_checker = ShapeCheck()
        shape_checker(encoder_outputs,"batch encoder_sequence units")
        shape_checker(decoder_inputs,"batch decoder_sequence")

        embedding_outputs = self.embedding(decoder_inputs)
        shape_checker(embedding_outputs,"batch decoder_sequence units")

        decoder_outputs,*decoder_state = self.decoder_unit(embedding_outputs,initial_state=decoder_state)
        shape_checker(decoder_outputs,"batch decoder_sequence units")
        shape_checker(decoder_state[0],"batch units")
        shape_checker(decoder_state[1],"batch units")

        attention_output = self.attention(encoder_outputs,decoder_outputs)
        total_out = self.out(attention_output)

        if return_state:
            return total_out,decoder_state
        return total_out

    def get_initial_state(self,encoder_inputs):
        """Build the values needed to start generation.

        Returns:
            ``(start_tokens, done, state)``: a (batch, 1) column of start ids,
            a (batch, 1) all-False ``done`` mask, and the LSTM's initial state
            for the embedded start tokens. The batch size is read from the
            first axis of ``encoder_inputs``.
        """
        batch_size = tf.shape(encoder_inputs)[0]
        done = tf.zeros(shape=[batch_size,1],dtype=tf.bool)
        # tf.fill takes `dims` (not `shape`) and a scalar fill value, so
        # collapse the length-1 start id vector to a scalar first.
        start_id = tf.reshape(self.start_token,[])
        start_tokens = tf.fill([batch_size,1],start_id)
        return start_tokens,done,self.decoder_unit.get_initial_state(self.embedding(start_tokens))

    def get_next_token(self,encoder_inputs,prev_token,done,decoder_state,temperature=0,return_done=False):
        """Run one decoding step and pick the next token for every sequence.

        Args:
            encoder_inputs: Passed to ``call`` as its ``encoder_outputs`` argument.
            prev_token: Token ids fed to the decoder for this step.
            done: Boolean (batch, 1) mask of sequences that already finished.
            decoder_state: LSTM state from the previous step.
            temperature: 0 selects the highest-scoring token (greedy); any
                other value samples from the logits divided by it.
            return_done: When true, also return the updated ``done`` mask.

        Returns:
            ``(next_token, decoder_state)`` by default, or
            ``(next_token, done, decoder_state)`` when ``return_done`` is
            true. ``next_token`` is (batch, 1) with finished sequences set to 0.
        """
        total_out,decoder_state = self(encoder_inputs,prev_token,decoder_state,return_state=True)

        # Only the logits of the last position decide the next token.
        last_logits = total_out[:,-1,:]

        if temperature == 0:
            # Greedy choice; restore the step axis so both branches give (batch, 1).
            next_token = tf.argmax(last_logits,axis=-1)[:,tf.newaxis]
        else:
            scaled_out = last_logits/temperature
            next_token = tf.random.categorical(scaled_out,num_samples=1,dtype=tf.int64)

        # A sequence is finished once it has produced the end token.
        done = done | (next_token == self.end_token)
        # Finished sequences only emit id 0 from here on.
        next_token = tf.where(done,tf.zeros_like(next_token),next_token)

        if return_done:
            return next_token,done,decoder_state
        return next_token,decoder_state

    def tokens_to_text(self,tokens):
        """Convert token ids back to strings with the start/end markers removed."""
        texts = self.ids_to_words(tokens)
        texts = tf.strings.reduce_join(texts,separator=" ",axis=-1)
        # Strip a leading start marker and a trailing end marker, with any spaces around them.
        texts = tf.strings.regex_replace(texts,r"^ *\[startofsequence\] *","")
        texts = tf.strings.regex_replace(texts,r" *\[endofsequence\] *$","")
        texts = tf.strings.strip(texts)
        return texts

In [125]:
# Stand-alone decoder with default arguments, used by the smoke tests below.
decoder = Decoder()

In [130]:
# Smoke test: encoder -> decoder must run on every training batch.
for (en_in,es_in),tar_in in train_ds:
    context = encoder(en_in)
    decoder(context,es_in)

In [131]:
# Smoke test: encoder -> decoder must run on every validation batch.
for (en_in,es_in),tar_in in valid_ds:
    context = encoder(en_in)
    decoder(context,es_in)

In [134]:
class Translator(keras.Model):
    """Model that feeds the Encoder's outputs into the Decoder.

    Args:
        units: Size passed to both the Encoder and the Decoder.
    """

    def __init__(self,units=256,**kwargs):
        super().__init__(**kwargs)
        self.encoder = Encoder(units=units)
        self.decoder = Decoder(units=units)

    def call(self,inputs):
        """Return decoder logits for an ``(encoder_inputs, decoder_inputs)`` pair."""
        source_ids,target_ids = inputs
        logits = self.decoder(self.encoder(source_ids),target_ids)

        # Remove the Keras mask from the result if one is attached; a missing
        # attribute is expected and ignored.
        try:
            del logits._keras_mask
        except AttributeError:
            pass

        return logits


In [136]:
# Full encoder-decoder model with default arguments.
model = Translator()

In [138]:
# Smoke test: the full model must run on every training batch.
for (en_in,es_in),tar_in in train_ds:
    batch_inputs = (en_in,es_in)
    model(batch_inputs)

In [139]:
# Smoke test: the full model must run on every validation batch.
for (en_in,es_in),tar_in in valid_ds:
    batch_inputs = (en_in,es_in)
    model(batch_inputs)