In [1]:
# from google.colab import auth
# auth.authenticate_user()
import sys,os,warnings
if "google.colab" in sys.modules:
    %pip install "tensorflow-text==2.13.0"
    %pip install kaleido
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
import numpy as np
import re
from typing import Literal
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tftext
import tensorflow_text.tools.wordpiece_vocab.bert_vocab_from_dataset as bert_vocab
from zipfile import ZipFile
from IPython.display import clear_output
from shutil import copytree,copy2
import requests
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"
if "google.colab" not in sys.modules:
    gpus = tf.config.list_physical_devices("GPU")
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=9000)]
        )
tf.get_logger().setLevel("ERROR")
%xmode Context
clear_output()

In [2]:
with tf.device("/job:localhost"):
    url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
    file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url,extract=True)
    with ZipFile(file_path,"r") as f:
        f.extractall("spa-eng")
    with open("spa-eng/spa-eng/spa.txt","r") as f:
        text = f.read()
clear_output()
en_text,es_text = zip(*[line.split("\t") for line in text.splitlines()])
for en,es in zip(en_text[:10],es_text[:10]):
    print(f"{en} ----> {es}")

Go. ----> Ve.
Go. ----> Vete.
Go. ----> Vaya.
Go. ----> Váyase.
Hi. ----> Hola.
Run! ----> ¡Corre!
Run. ----> Corred.
Who? ----> ¿Quién?
Fire! ----> ¡Fuego!
Fire! ----> ¡Incendio!


In [3]:
en_tokenizer = tftext.BertTokenizer(
    "en_vocab.txt",
    normalization_form="NFKD"
)
es_tokenizer = tftext.BertTokenizer(
    "spa_vocab.txt",
    normalization_form="NFKD"
)

In [4]:
with open("en_vocab.txt","r") as f:
    en_vocab = f.read()

with open("spa_vocab.txt","r") as f:
    es_vocab = f.read()

en_vocab = np.array(en_vocab.splitlines())
es_vocab = np.array(es_vocab.splitlines())
en_text = np.array(en_text)
es_text = np.array(es_text)

In [5]:
start_token = tf.argmax(en_vocab == "[START]",output_type=tf.int64)
end_token = tf.argmax(es_vocab== "[END]",output_type=tf.int64)

In [6]:
def upstream(sentence:str,lang:Literal["en","es"]):
    assert lang in ["en","es"],f"The provided argument for lang is not in ['en','es']"
    bsize = tf.shape(sentence)[0]
    sentence = tf.convert_to_tensor(sentence)
    sentence = tftext.normalize_utf8(sentence,"NFKD")
    sentence = tf.strings.lower(sentence)
    sentence = tf.strings.regex_replace(sentence,r"[^ a-z,.?!¿]","")
    sentence = tf.strings.regex_replace(sentence,r"[,.?!¿]",r" \0 ")
    sentence = tf.strings.strip(sentence)
    if lang == "en":
        tokens = en_tokenizer.tokenize(sentence).merge_dims(-2,-1).to_tensor()
    else:
        tokens = es_tokenizer.tokenize(sentence).merge_dims(-2,-1).to_tensor()
    return tf.concat([tf.fill(dims=[bsize,1],value=start_token),tokens,tf.fill(dims=[bsize,1],value=end_token)],axis=-1)

In [7]:
def downstream(tokens:tf.Tensor,lang:Literal["en","es"]):
    assert lang in ["en","es"],f"The provided argument for lang is not in ['en','es']"
    if lang == "en":
        words = en_tokenizer.detokenize(tokens)
    else:
        words = es_tokenizer.detokenize(tokens)
    bad_tokens = "|".join([re.escape(_) for _ in ["[START]","[END]","[PAD]"]])
    mask = tf.strings.regex_full_match(words,bad_tokens)
    re_words = tf.ragged.boolean_mask(words,~mask)
    return tf.strings.reduce_join(re_words,separator=" ",axis=1)

In [8]:
def preprocess(context,target):
    context = upstream(context,"en")
    target = upstream(target,"es")
    return (context,target[:,:-1]),target[:,1:]

In [9]:
BATCH_SIZE = 64
all_indices = np.random.uniform(size=len(en_text))
train_indices = all_indices <= 0.8
valid_indices = all_indices > 0.8
train_size = len(train_indices)
valid_size = len(valid_indices)
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((en_text[train_indices],es_text[train_indices]))
    .batch(BATCH_SIZE)
    .shuffle(len(en_text))
    .map(preprocess)
    .repeat()
    .prefetch(-1)
)
valid_ds = (
    tf.data.Dataset
    .from_tensor_slices((en_text[valid_indices],es_text[valid_indices]))
    .batch(BATCH_SIZE)
    .shuffle(len(en_text))
    .map(preprocess)
    .repeat()
    .prefetch(-1)
)
for (en_in,es_in),targ_in in train_ds.take(1):
    print(en_in.shape,es_in.shape,targ_in.shape)
    print(es_in[0,:10])
    print(targ_in[0,:10])

(64, 13) (64, 17) (64, 17)
tf.Tensor([  2 372 628 711 221  36  40 222   6   0], shape=(10,), dtype=int64)
tf.Tensor([372 628 711 221  36  40 222   6   0   0], shape=(10,), dtype=int64)


In [10]:
class PositionEncoding(tf.Module):

    def __init__(self,vocab_size:int,d_model:int=512,casting:Literal["concat","interleave"]="concat",**kwargs):

        super(PositionEncoding,self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model/2
        self.casting = casting

    def __call__(self,inputs):

        pass
