<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/transformers_XLA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# from google.colab import auth
# auth.authenticate_user()
import sys,os,warnings
if "google.colab" in sys.modules:
    %pip install "tensorflow-text==2.13.0"
    %pip install kaleido
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
import numpy as np
import re
from typing import Literal
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tftext
import tensorflow_text.tools.wordpiece_vocab.bert_vocab_from_dataset as bert_vocab
from zipfile import ZipFile
from IPython.display import clear_output
from shutil import copytree,copy2
import requests
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"
if "google.colab" not in sys.modules:
    gpus = tf.config.list_physical_devices("GPU")
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=9000)]
        )
tf.get_logger().setLevel("ERROR")
%xmode Context
clear_output()

In [2]:
# tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu_resolver)
# tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
# strategy = tf.distribute.TPUStrategy(tpu_resolver)
strategy = tf.distribute.experimental.CentralStorageStrategy()

In [3]:
with tf.device("/job:localhost"):
    url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
    file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url,extract=True)
    with ZipFile(file_path,"r") as f:
        f.extractall("spa-eng")
    with open("spa-eng/spa-eng/spa.txt","r") as f:
        text = f.read()

    en_text,es_text = zip(*[line.split("\t") for line in text.splitlines()])
    for en,es in zip(en_text[:10],es_text[:10]):
        print(f"{en} ----> {es}")
    en_url = "https://github.com/h4ck4l1/datasets/raw/main/NLP_with_RNN_and_Attention/en_vocab.txt"
    es_url = "https://github.com/h4ck4l1/datasets/raw/main/NLP_with_RNN_and_Attention/spa_vocab.txt"

    en_content = requests.get(en_url).content
    es_content = requests.get(es_url).content

    with open("en_vocab.txt","wb") as f:
        f.write(en_content)

    with open("spa_vocab.txt","wb") as f:
        f.write(es_content)
    en_tokenizer = tftext.BertTokenizer(
        "en_vocab.txt",
        normalization_form="NFKD"
    )
    es_tokenizer = tftext.BertTokenizer(
        "spa_vocab.txt",
        normalization_form="NFKD"
    )
    with open("en_vocab.txt","r") as f:
        en_vocab = f.read()

    with open("spa_vocab.txt","r") as f:
        es_vocab = f.read()

    en_vocab = np.array(en_vocab.splitlines())
    es_vocab = np.array(es_vocab.splitlines())
    en_text = np.array(en_text)
    es_text = np.array(es_text)
    start_token = tf.argmax(en_vocab == "[START]",output_type=tf.int64)
    end_token = tf.argmax(es_vocab== "[END]",output_type=tf.int64)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Go. ----> Ve.
Go. ----> Vete.
Go. ----> Vaya.
Go. ----> Váyase.
Hi. ----> Hola.
Run! ----> ¡Corre!
Run. ----> Corred.
Who? ----> ¿Quién?
Fire! ----> ¡Fuego!
Fire! ----> ¡Incendio!


In [4]:
def upstream(sentence:str,lang:Literal["en","es"]):
    assert lang in ["en","es"],f"The provided argument for lang is not in ['en','es']"
    bsize = tf.shape(sentence)[0]
    sentence = tf.convert_to_tensor(sentence)
    sentence = tftext.normalize_utf8(sentence,"NFKD")
    sentence = tf.strings.lower(sentence)
    sentence = tf.strings.regex_replace(sentence,r"[^ a-z,.?!¿]","")
    sentence = tf.strings.regex_replace(sentence,r"[,.?!¿]",r" \0 ")
    sentence = tf.strings.strip(sentence)
    if lang == "en":
        tokens = en_tokenizer.tokenize(sentence).merge_dims(-2,-1).to_tensor()
    else:
        tokens = es_tokenizer.tokenize(sentence).merge_dims(-2,-1).to_tensor()
    return tf.concat([tf.fill(dims=[bsize,1],value=start_token),tokens,tf.fill(dims=[bsize,1],value=end_token)],axis=-1)

In [5]:
def downstream(tokens:tf.Tensor,lang:Literal["en","es"]):
    assert lang in ["en","es"],f"The provided argument for lang is not in ['en','es']"
    if lang == "en":
        words = en_tokenizer.detokenize(tokens)
    else:
        words = es_tokenizer.detokenize(tokens)
    bad_tokens = "|".join([re.escape(_) for _ in ["[START]","[END]","[PAD]"]])
    mask = tf.strings.regex_full_match(words,bad_tokens)
    re_words = tf.ragged.boolean_mask(words,~mask)
    return tf.strings.reduce_join(re_words,separator=" ",axis=1)


# escape strings ^\[START\](?:.*?\[PAD\])*?(.*?)\[END\]$

In [6]:
def preprocess(context,target):
    context = upstream(context,"en")
    target = upstream(target,"es")
    return (context,target[:,:-1]),target[:,1:]

In [7]:
def get_data(batch_size:int):
    all_indices = np.random.uniform(size=len(en_text))
    train_indices = all_indices <= 0.8
    valid_indices = all_indices > 0.8
    train_size = len(train_indices)
    valid_size = len(valid_indices)
    train_ds = (
        tf.data.Dataset
        .from_tensor_slices((en_text[train_indices],es_text[train_indices]))
        .batch(batch_size)
        .shuffle(len(en_text))
        .map(preprocess)
        # .repeat()
        .prefetch(tf.data.AUTOTUNE)
    )
    valid_ds = (
        tf.data.Dataset
        .from_tensor_slices((en_text[valid_indices],es_text[valid_indices]))
        .batch(batch_size)
        .shuffle(len(en_text))
        .map(preprocess)
        # .repeat()
        .prefetch(tf.data.AUTOTUNE)
    )
    return train_ds,valid_ds,train_size,valid_size

In [8]:
class PositionEncoding(keras.layers.Layer):

    def __init__(
        self,
        vocab_size:int,
        length:int,
        d_model:int,
        casting:Literal["concat","interleave"],
        **kwargs
        ):

        super(PositionEncoding,self).__init__(**kwargs)
        assert d_model%2==0,f"The provided d_model is not even it should be even"
        assert casting in ["concat","interleave"],f"The provided casting is not in the given values"
        self.depth = d_model//2
        angle_rads = np.arange(length)[:,np.newaxis] * 1/(10000**(np.arange(self.depth)[np.newaxis,:]/self.depth))
        if casting == "concat":
            embed = tf.concat([tf.sin(angle_rads),tf.cos(angle_rads)],axis=-1)
        else:
            embed = np.zeros(shape=angle_rads.shape)
            embed[:,::2] = tf.sin(angle_rads)
            embed[:,1::2] = tf.cos(angle_rads)

        self.embed = tf.cast(embed,tf.float32)
        self.embedding = keras.layers.Embedding(vocab_size,d_model,mask_zero=True)

    def compute_mask(self,*args,**kwargs):
        return self.embedding.compute_mask(*args,**kwargs)

    def call(self,inputs):
        seq_l = tf.shape(inputs)[1]
        embed_out = self.embedding(inputs)
        embed_out *= tf.math.sqrt(tf.cast(self.depth*2,tf.float32))
        return embed_out + self.embed[tf.newaxis,:seq_l,:]


In [9]:
class Encoder(keras.layers.Layer):

    def __init__(
        self,
        num_layers:int,
        num_heads:int,
        d_model:int,
        feed_forward_dense_units:int,
        dropout_rate:float,
        **kwargs
    ):

        super(Encoder,self).__init__(**kwargs)
        self.num_layers = num_layers

        self.self_attention = keras.layers.MultiHeadAttention(num_heads=num_heads,key_dim=d_model)
        self.ff_dense = keras.layers.Dense(feed_forward_dense_units,"relu")
        self.scale_dense = keras.layers.Dense(d_model)
        self.dropout = keras.layers.Dropout(dropout_rate)
        self.layer_norm = keras.layers.LayerNormalization()
        self.add = keras.layers.Add()


    def call(self,z_enc):

        for _ in range(self.num_layers):

            z_enc_copy = z_enc
            z_enc = self.self_attention(query=z_enc,key=z_enc,value=z_enc)
            z_enc = self.layer_norm(self.add([z_enc_copy,z_enc]))
            z_enc_copy = z_enc
            z_enc = self.ff_dense(z_enc)
            z_enc = self.scale_dense(z_enc)
            z_enc = self.dropout(z_enc)
            z_enc = self.layer_norm(self.add([z_enc_copy,z_enc]))

        return z_enc


In [10]:
class Decoder(keras.layers.Layer):

    def __init__(
        self,
        num_layers:int,
        num_self_heads:int,
        num_cross_heads:int,
        d_model:int,
        feed_forward_dense_units:int,
        dropout_rate:float,
        **kwargs
    ):

        super(Decoder,self).__init__(**kwargs)
        self.num_layers = num_layers

        self.self_attention = keras.layers.MultiHeadAttention(num_heads=num_self_heads,key_dim=d_model)
        self.cross_attention = keras.layers.MultiHeadAttention(num_heads=num_cross_heads,key_dim=d_model)
        self.ff_dense = keras.layers.Dense(feed_forward_dense_units,"relu")
        self.scale_dense = keras.layers.Dense(d_model)
        self.dropout = keras.layers.Dropout(dropout_rate)
        self.layer_norm = keras.layers.LayerNormalization()
        self.add = keras.layers.Add()

    def call(self,z_enc,z):

        for _ in range(self.num_layers):

            z_copy = z
            z = self.self_attention(query=z,key=z,value=z,use_causal_mask=True)
            z = self.layer_norm(self.add([z_copy,z]))
            z_copy = z
            z = self.cross_attention(query=z,key=z_enc,value=z_enc)
            z = self.layer_norm(self.add([z_copy,z]))
            z_copy = z
            z = self.ff_dense(z)
            z = self.scale_dense(z)
            z = self.dropout(z)
            z = self.layer_norm(self.add([z_copy,z]))

        return z


In [11]:
class Transformer(keras.Model):

    def __init__(
        self,
        encoder_vocab_size:int,
        decoder_vocab_size:int,
        encoder_length:int=2048,
        decoder_length:int=2048,
        d_model:int=512,
        encoder_casting:Literal["concat","interleave"]="concat",
        decoder_casting:Literal["concat","interleave"]="concat",
        encoder_num_layers:int=8,
        encoder_num_heads:int=3,
        encoder_feed_forward_dense_units:int=2048,
        encoder_dropout_rate:float=.1,
        decoder_num_layers:int=12,
        decoder_num_self_heads:int=3,
        decoder_num_cross_heads:int=3,
        decoder_feed_forward_dense_units:int=2048,
        decoder_dropout_rate:float=.1,
        **kwargs
    ):


        super(Transformer,self).__init__(**kwargs)
        self.encoder_embed = PositionEncoding(
            encoder_vocab_size,
            encoder_length,
            d_model,
            encoder_casting
            )
        self.decoder_embed = PositionEncoding(
            decoder_vocab_size,
            decoder_length,
            d_model,
            decoder_casting
            )
        self.encoder_layer = Encoder(
            encoder_num_layers,
            encoder_num_heads,
            d_model,
            encoder_feed_forward_dense_units,
            encoder_dropout_rate
            )
        self.decoder_layer = Decoder(
            decoder_num_layers,
            decoder_num_self_heads,
            decoder_num_cross_heads,
            d_model,
            decoder_feed_forward_dense_units,
            decoder_dropout_rate
            )
        self.total_out = keras.layers.Dense(decoder_vocab_size)


    def call(self,inputs):

        z_enc,z_dec = inputs

        z_enc = self.encoder_embed(z_enc)
        z_enc = self.encoder_layer(z_enc)
        z_dec = self.decoder_embed(z_dec)
        z = self.decoder_layer(z_enc,z_dec)
        z = self.total_out(z)

        try:
            del z._keras_mask
        except AttributeError:
            pass

        return z


In [12]:
def custom_loss(y_true,y_pred):
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="none")
    loss = loss_fn(y_true,y_pred)
    mask = tf.cast(y_true != 0,loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def custom_metric(y_true,y_pred):
    y_pred = tf.cast(tf.argmax(y_pred,axis=-1),y_true.dtype)
    mask = tf.cast(y_true != 0,tf.int64)
    acc = tf.cast(y_true == y_pred,tf.int64)
    acc = acc & mask
    return tf.reduce_sum(acc)/tf.reduce_sum(mask)


In [13]:
class CustomLR(keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self,d_model:int=512,warmup:int=4000,**kwargs):

        self.factor = tf.math.rsqrt(tf.cast(d_model,tf.float32)) # d_model^-0.5
        self.warmup_factor = tf.math.pow(tf.cast(warmup,tf.float32),tf.cast(-1.5,tf.float32))

    def __call__(self,step):
        step = tf.cast(step,tf.float32)
        return self.factor * tf.math.minimum(tf.math.rsqrt(step),step*self.warmup_factor)


In [14]:
with strategy.scope():
    BATCH_SIZE = 64
    train_ds,valid_ds,train_size,valid_size = get_data(BATCH_SIZE)
    train_steps = train_size//BATCH_SIZE
    valid_steps = valid_size//BATCH_SIZE
    model = Transformer(
        encoder_vocab_size=len(en_vocab),
        decoder_vocab_size=len(es_vocab)
        )
    cust_lr = CustomLR(d_model=512,warmpu=4000)
    model.compile(
        loss=custom_loss,
        metrics=[custom_metric,custom_loss],
        optimizer=keras.optimizers.Adam(
            learning_rate=cust_lr,
            beta_1=0.9,
            beta_2=0.98
        ),
        steps_per_execution=25
    )

In [None]:
model.fit(train_ds,validation_data=valid_ds,epochs=10,steps_per_epoch=train_steps,validation_steps=valid_steps)

Epoch 1/10
 250/1858 [===>..........................] - ETA: 17:58 - loss: 6.6554 - custom_metric: 0.1933 - custom_loss: 6.6554