In [1]:
# from google.colab import auth
# auth.authenticate_user()
import sys,os,warnings
# When the google.colab module is loaded (i.e. the notebook runs on Colab),
# install the extra packages into the kernel's environment with %pip.
if "google.colab" in sys.modules:
    %pip install "tensorflow-text==2.13.0"
    %pip install kaleido
# Must be set before TensorFlow is imported further down in this cell;
# intended to silence TensorFlow's native (C++) log output.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import numpy as np
import re
from typing import Literal
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tftext
import tensorflow_text.tools.wordpiece_vocab.bert_vocab_from_dataset as bert_vocab
from zipfile import ZipFile
from IPython.display import clear_output
from shutil import copytree,copy2
import requests
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"
if "google.colab" not in sys.modules:
    gpus = tf.config.list_physical_devices("GPU")
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=9000)]
        )
tf.get_logger().setLevel("ERROR")
%xmode Context
clear_output()

In [2]:
# Download the English-Spanish sentence-pair archive, unpack it next to the
# notebook and read the tab-separated corpus into memory.
with tf.device("/job:localhost"):
    url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
    file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url,extract=True)
    with ZipFile(file_path,"r") as f:
        f.extractall("spa-eng")
    # Decode explicitly as UTF-8: the Spanish side contains accented letters
    # and inverted punctuation, which the platform default encoding (e.g.
    # cp1252 on Windows) would garble or fail to decode.
    with open("spa-eng/spa-eng/spa.txt","r",encoding="utf-8") as f:
        text = f.read()
clear_output()
# Each line is "<english>\t<spanish>"; transpose the pairs into two parallel
# tuples, one per language.
en_text,es_text = zip(*[line.split("\t") for line in text.splitlines()])
# Preview the first ten sentence pairs.
for en,es in zip(en_text[:10],es_text[:10]):
    print(f"{en} ----> {es}")

Go. ----> Ve.
Go. ----> Vete.
Go. ----> Vaya.
Go. ----> Váyase.
Hi. ----> Hola.
Run! ----> ¡Corre!
Run. ----> Corred.
Who? ----> ¿Quién?
Fire! ----> ¡Fuego!
Fire! ----> ¡Incendio!


In [3]:
# One BertTokenizer per language, each backed by its own vocabulary file and
# both using NFKD normalization. The vocabulary files must already exist in
# the working directory.
en_tokenizer, es_tokenizer = (
    tftext.BertTokenizer(vocab_path, normalization_form="NFKD")
    for vocab_path in ("en_vocab.txt", "spa_vocab.txt")
)

In [49]:
# Load both vocabulary files (one token per line) as NumPy string arrays so
# tokens can be looked up with vectorised comparisons.
# The files are read explicitly as UTF-8: the Spanish vocabulary contains
# non-ASCII entries (e.g. "##¿") that the platform default encoding could
# otherwise mis-decode.
with open("en_vocab.txt","r",encoding="utf-8") as f:
    en_vocab = np.array(f.read().splitlines())

with open("spa_vocab.txt","r",encoding="utf-8") as f:
    es_vocab = np.array(f.read().splitlines())

In [50]:
# Show the first and last ten entries of each vocabulary, English first.
print(
    en_vocab[:10],
    en_vocab[-10:],
    es_vocab[:10],
    es_vocab[-10:],
    sep="\n",
)

['[PAD]' '[UNK]' '[START]' '[END]' '!' ',' '.' '?' 'a' 'b']
['yell' 'youngest' 'youth' '##!' '##,' '##.' '##?' '##j' '##q' '##v']
['[PAD]' '[UNK]' '[START]' '[END]' '!' ',' '.' '?' 'a' 'b']
['volviera' '##!' '##,' '##.' '##?' '##f' '##j' '##q' '##v' '##¿']


In [59]:
BATCH_SIZE = 64
# tf.argmax over an all-False mask returns 0, which is the [PAD] id, so a
# missing reserved token would go unnoticed. Fail loudly instead.
assert (en_vocab == "[START]").any(), "[START] is missing from en_vocab.txt"
assert (es_vocab == "[END]").any(), "[END] is missing from spa_vocab.txt"
# Ids of the sequence delimiters: position of the first (and presumably only)
# match in the vocabulary. [START] is looked up in the English vocabulary and
# [END] in the Spanish one.
start_token = tf.argmax(en_vocab == "[START]")
end_token = tf.argmax(es_vocab== "[END]")
print(start_token)
print(end_token)

tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)


In [None]:
def upstream(sentence:str,lang:Literal["en","es"]):
    sentence = tf.convert_to_tensor(sentence)
    sentence = tftext.normalize_utf8(sentence,"NFKD")
    sentence = tf.strings.lower(sentence)
    sentence = tf.strings.regex_replace(sentence,r"[^ a-z,.?!¿]","")
    sentence = tf.strings.regex_replace(sentence,r"[,.?!¿]",r" \0 ")
    sentence = tf.strings.strip(sentence)
    if lang == "en":
        tokens = en_tokenizer.tokenize(sentence)
    else:
        tokens = es_tokenizer.tokenize(sentence)
