In [10]:
# from google.colab import auth
# auth.authenticate_user()
import os,sys,warnings
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"
warnings.filterwarnings("ignore")
if "google.colab" in sys.modules:
    %pip install -q "tensorflow-text==2.13.0"
    %pip install -q kaleido

from typing import Literal
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tftext
import tensorflow_datasets as tfds
if "google.colab" not in sys.modules:
    gpus = tf.config.list_physical_devices("GPU")
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=9216)]
    )
import requests
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
tf.get_logger().setLevel("ERROR")
pio.templates.default = "plotly_dark"

In [11]:
# tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu_resolver)
# tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
# strategy = tf.distribute.TPUStrategy(tpu_resolver)
strategy = tf.distribute.OneDeviceStrategy(device="/device:GPU:0")

In [12]:
with tf.device("/job:localhost"):
    dataset_name = "ted_hrlr_translate/pt_to_en"
    total_dataset = tfds.load(name=dataset_name,batch_size=-1,shuffle_files=True)
    ds_info = tfds.builder(dataset_name).info
    pt_tokenizer = tftext.BertTokenizer("pt_en_vocab.txt",lower_case=True)
    en_tokenizer = tftext.BertTokenizer("en_pt_vocab.txt",lower_case=True)

In [16]:
total_dataset['train']['en'][:5]

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
       b'but what if it were active ?',
       b"but they did n't test for curiosity .",
       b'and this conscious defiance is why i , as an agnostic , can still have faith .',
       b'you can use everything on the table on me .'], dtype=object)>

In [17]:
total_dataset['train']['pt'][:5]

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'e quando melhoramos a procura , tiramos a \xc3\xbanica vantagem da impress\xc3\xa3o , que \xc3\xa9 a serendipidade .',
       b'mas e se estes fatores fossem ativos ?',
       b'mas eles n\xc3\xa3o tinham a curiosidade de me testar .',
       b'e esta rebeldia consciente \xc3\xa9 a raz\xc3\xa3o pela qual eu , como agn\xc3\xb3stica , posso ainda ter f\xc3\xa9 .',
       b"`` `` '' podem usar tudo sobre a mesa no meu corpo . ''"],
      dtype=object)>

In [4]:
AUTO = tf.data.AUTOTUNE
MAX_TOKENS = 128
def get_dataset(total_dataset,split,ds_info,batch_size,preprocess_fn):
    ds = tf.data.Dataset.from_tensor_slices(total_dataset[split])
    ds_size = ds_info.splits[split].num_examples
    if "train" in split:
        ds = ds.shuffle(ds_size//2)
        ds = ds.repeat()
    ds = ds.batch(batch_size,drop_remainder=True,num_parallel_calls=AUTO)
    ds = ds.map(preprocess_fn,AUTO)
    ds = ds.cache()
    ds = ds.prefetch(AUTO)
    return ds,ds_size

def preprocess_fn(pt_text,en_text):
    pt_tokens = pt_tokenizer.tokenize(pt_text).merge_dims(-2,-1)
    en_tokens = en_tokenizer.tokenize(en_text).merge_dims(-2,-1)
    pt_tokens = pt_tokens[:,:MAX_TOKENS+1]
    en_tokens = en_tokens[:,:MAX_TOKENS]
    return (en_tokens.to_tensor(),pt_tokens[:,:-1].to_tensor()),pt_tokens[:,1:].to_tensor()

In [5]:
class PositionEmbedding(keras.layers.Layer):

    def __init__(self,casting:Literal["concat","interleave"],length:int=2048,d_model:int=512,**kwargs):


        super(PositionEmbedding,self).__init__(**kwargs)
        assert d_model%2==0,f"The depth_of_model {d_model} should be even number"
        d_model = d_model//2
        positions = np.arange(length)[:,np.newaxis]
        angles = np.arange(d_model)[np.newaxis,:]/d_model
        angles = 1/(10000**angles)
        angle_rads = positions * angles
        if casting == "concat":
            self.embed = tf.concat([tf.sin(angle_rads),tf.cos(angle_rads)],axis=-1)
        else:
            self.embed = np.zeros(shape=[length,d_model])
            self.embed[:,::2] = tf.sin(angle_rads)
            self.embed[:,1::2] = tf.cos(angle_rads)



    def call(self,inputs):

        seq_l = tf.shape(inputs)[1]
        return tf.cast(inputs,self.embed.dtype) + self.embed[tf.newaxis,:seq_l,:]

In [6]:
def get_model(
    context_vocab_size:int,
    target_vocab_size:int,
    d_model:int=128,
    length:int=2048,
    enc_heads:int=8,
    dec_heads:int=8,
    num_encoder_layers:int=4,
    num_decoder_layers:int=4,
    encoder_feed_forward_units:int=512,
    decoder_feed_forward_units:int=512,
    encoder_dropout_rate:float=0.1,
    decoder_dropout_rate:float=0.1
    ):

    encoder_inputs = keras.layers.Input(shape=[],dtype=tf.int64)
    decoder_inputs = keras.layers.Input(shape=[],dtype=tf.int64)
    encoder_embedding = keras.layers.Embedding(context_vocab_size,d_model,mask_zero=True)(encoder_inputs)
    decoder_embedding = keras.layers.Embedding(target_vocab_size,d_model,mask_zero=True)(decoder_inputs)
    z_enc = PositionEmbedding("concat",length,d_model)(encoder_embedding)
    z_dec = PositionEmbedding("concat",length,d_model)(decoder_embedding)

    for _ in range(num_encoder_layers):

        '''Self Attention Part'''
        z_copy = z_enc
        self_attention = keras.layers.MultiHeadAttention(enc_heads,d_model)
        z_enc = self_attention(query=z_enc,key=z_enc,value=z_enc)
        z_enc = keras.layers.Add()([z_copy,z_enc])
        z_enc = keras.layers.LayerNormalization()(z_enc)

        '''Feed Forward Part'''
        z_copy = z_enc
        z_enc = keras.layers.Dense(encoder_feed_forward_units,"relu")(z_enc)
        z_enc = keras.layers.Dense(d_model)(z_enc)
        z_enc = keras.layers.Dropout(encoder_dropout_rate)(z_enc)
        z_enc = keras.layers.Add()([z_enc,z_copy])
        z_enc = keras.layers.LayerNormalization()(z_enc)


    for _ in range(num_decoder_layers):

        '''Masked Self Attention Part'''
        z_copy = z_dec
        masked_self_attention = keras.layers.MultiHeadAttention(dec_heads,d_model)
        z_dec = masked_self_attention(query=z_dec,key=z_dec,value=z_dec,use_causal_mask=True)
        z_dec = keras.layers.Add()([z_copy,z_dec])
        z_dec = keras.layers.LayerNormalization()(z_dec)

        '''Cross Attention Part'''
        z_copy = z_dec
        cross_attention = keras.layers.MultiHeadAttention(dec_heads,d_model)
        z_dec = cross_attention(query=z_dec,key=z_enc,value=z_enc)
        z_dec = keras.layers.Add()([z_copy,z_dec])
        z_dec = keras.layers.LayerNormalization()(z_dec)

        '''Feed Forward Part'''
        z_copy = z_dec
        z_dec = keras.layers.Dense(decoder_feed_forward_units,"relu")(z_dec)
        z_dec = keras.layers.Dense(d_model)(z_dec)
        z_dec = keras.layers.Dropout(decoder_dropout_rate)(z_dec)
        z_dec = keras.layers.Add()([z_copy,z_dec])
        z_dec = keras.layers.LayerNormalization()(z_dec)

    out = keras.layers.Dense(target_vocab_size)(z_dec)

    return keras.Model(inputs=[encoder_inputs,decoder_inputs],outputs=[out])

In [7]:
def custom_loss(y_true,y_pred):

    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction=keras.losses.Reduction.NONE)
    loss = loss_fn(y_true,y_pred)
    mask = tf.cast(y_true != 0,loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)


def custom_accuracy(y_true,y_pred):

    y_pred = tf.cast(tf.argmax(y_pred,axis=-1),y_true.dtype)
    mask = tf.cast(y_true != 0,tf.int32)
    acc = tf.cast(y_true == y_pred,tf.int32)
    acc = acc & mask
    return tf.reduce_sum(acc)/tf.reduce_sum(mask)


class CustomLR(keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self,d_model:int=512,warmup:int=4000,**kwargs):

        self.factor = tf.math.rsqrt(tf.cast(d_model,tf.float32))
        self.warmup_factor = tf.math.pow(tf.cast(warmup,tf.float32),tf.cast(-1.5,tf.float32))

    def __call__(self,step):
        step = tf.cast(step,tf.float32)
        return self.factor * tf.math.minimum(tf.math.rsqrt(step),step*self.warmup_factor)

In [8]:
cont_vocab = 7010
targ_vocab = 7765
BATCH_SIZE = 16*strategy.num_replicas_in_sync
EPOCH = 20

with strategy.scope():
    train_ds,train_size = get_dataset(total_dataset,"train",ds_info,BATCH_SIZE,preprocess_fn)
    valid_ds,valid_size = get_dataset(total_dataset,"validation",ds_info,BATCH_SIZE,preprocess_fn)
    train_steps = train_size//BATCH_SIZE
    valid_steps = valid_size//BATCH_SIZE
    total_steps = train_steps*BATCH_SIZE
    model = get_model(cont_vocab,targ_vocab)
    cust_lr = CustomLR(d_model=128,warmup=total_steps//10)
    model.compile(
        loss=custom_loss,
        metrics=[custom_accuracy,custom_loss],
        optimizer=keras.optimizers.Adam(
            learning_rate=cust_lr,
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-9
        ),
        steps_per_execution=24,
        jit_compile=True
    )

TypeError: in user code:


    TypeError: tf__preprocess_fn() missing 1 required positional argument: 'en_text'
