In [1]:
import sys,os,warnings,logging,time
from typing import Literal
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
if "google.colab" in sys.modules:
    %pip3 install -q -U "tensorflow-text==2.13.0"

import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import tensorflow_text as tftext
import tensorflow_text.tools.wordpiece_vocab.bert_vocab_from_dataset as bert_vocab
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Make "plotly_dark" the default template for Plotly figures.
pio.templates.default = "plotly_dark"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Traceback verbosity: "Context" mode (IPython echoes the selected mode).
%xmode Context
# Raise TensorFlow's Python logger threshold to ERROR.
tf.get_logger().setLevel("ERROR")

Exception reporting mode: Context


In [2]:
# Load the TED talks Portuguese/English corpus as supervised 2-tuples,
# together with the dataset metadata object.
examples, metadata = tfds.load(
    "ted_hrlr_translate/pt_to_en",
    with_info=True,
    as_supervised=True,
)
train_raw = examples["train"]
valid_raw = examples["validation"]

In [3]:
# Peek at the first training pair: element 0 is the Portuguese sentence,
# element 1 its English translation.
for pt_sentence, en_sentence in train_raw.take(1):
    print(pt_sentence.numpy().decode())
    print(en_sentence.numpy().decode())

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .


In [4]:
def _learn_wordpiece_vocab(sentences):
    """Learn a lower-cased WordPiece vocabulary from a dataset of sentences.

    Both languages use the same settings: a target size of 8000 entries and
    the reserved tokens [PAD], [UNK], [START], [END].
    """
    return bert_vocab.bert_vocab_from_dataset(
        sentences,
        vocab_size=8000,
        reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"],
        bert_tokenizer_params=dict(lower_case=True),
        learn_params={},
    )


# Each raw element is a (pt, en) pair; pick the relevant side per language.
en_vocab = _learn_wordpiece_vocab(train_raw.map(lambda pt, en: en))
pt_vocab = _learn_wordpiece_vocab(train_raw.map(lambda pt, en: pt))

In [5]:
# Persist each vocabulary as one token per line so the tokenizers can load it
# from disk. The encoding is pinned to UTF-8: the vocabularies contain
# non-ASCII tokens (accented Portuguese characters), and relying on the
# platform default encoding (e.g. cp1252 on Windows) can raise
# UnicodeEncodeError or write bytes that are not valid UTF-8.
with open("en_pt_vocab.txt", "w", encoding="utf-8") as f:
    for token in en_vocab:
        print(token, file=f)

with open("pt_en_vocab.txt", "w", encoding="utf-8") as f:
    for token in pt_vocab:
        print(token, file=f)

In [6]:
# One lower-casing BERT tokenizer per language, each backed by a vocab file.
# Naming gotcha: "en_pt_vocab.txt" is the English vocabulary and
# "pt_en_vocab.txt" is the Portuguese one.
en_tokenizer = tftext.BertTokenizer(
    "en_pt_vocab.txt",
    lower_case=True,
)
pt_tokenizer = tftext.BertTokenizer(
    "pt_en_vocab.txt",
    lower_case=True,
)

In [25]:
def upstream(text,lang:Literal["en","pt"]):
    """Tokenize a batch of sentences and wrap every row in [START] ... [END].

    Args:
        text: Batch of sentences; its first dimension is taken as the batch size.
        lang: "en" or "pt". Selects both the tokenizer and the vocabulary the
            [START]/[END] ids are looked up in.

    Returns:
        A ragged tensor with one row of token ids per sentence, each row
        beginning with the [START] id and ending with the [END] id.

    Raises:
        ValueError: If ``lang`` is neither "en" nor "pt".
    """
    # Pick tokenizer and vocabulary together so the special-token ids always
    # come from the same vocabulary that produced the word-piece ids, and so
    # an unexpected language code fails loudly instead of silently being
    # tokenized as Portuguese.
    if lang == "en":
        tokenizer, vocab = en_tokenizer, en_vocab
    elif lang == "pt":
        tokenizer, vocab = pt_tokenizer, pt_vocab
    else:
        raise ValueError(f'lang must be "en" or "pt", got {lang!r}')

    batch_size = tf.shape(text)[0]
    # Collapse the two innermost ragged dimensions so each sentence becomes a
    # single flat row of token ids.
    tokens = tokenizer.tokenize(text).merge_dims(-2,-1)
    # One [START] / [END] id per row, in the same dtype as the token ids so
    # the three pieces can be concatenated.
    start_tokens = tf.fill(dims=[batch_size,1],value=tf.constant(value=vocab.index("[START]"),dtype=tokens.dtype))
    end_tokens = tf.fill(dims=[batch_size,1],value=tf.constant(value=vocab.index("[END]"),dtype=tokens.dtype))
    return tf.concat([start_tokens,tokens,end_tokens],axis=-1)

In [39]:
# Sanity check: tokenize the English side of the first batch of three
# training pairs and print the resulting rows of token ids.
first_en_batches = train_raw.batch(3).take(1).map(lambda pt, en: en)
for en_batch in first_en_batches:
    tokens = upstream(en_batch, "en")
    print(tokens[:4])

<tf.RaggedTensor [[2, 72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308,
  74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15, 3]               ,
 [2, 87, 90, 107, 76, 129, 1852, 30, 3],
 [2, 87, 83, 149, 50, 9, 56, 664, 85, 2512, 15, 3]]>


In [36]:
def preprocess(pt,en,max_tokens:int=128):
    """Convert a batch of sentence pairs into ``(inputs, labels)`` tensors.

    Args:
        pt: Batch of Portuguese sentences.
        en: Batch of English sentences.
        max_tokens: Maximum length of every returned sequence. Defaults to
            128, the limit that was previously hard-coded.

    Returns:
        ``((en_ids, pt_in), pt_out)`` as dense, padded tensors:
        ``en_ids`` is the English token ids cut to ``max_tokens``;
        ``pt_in`` is the Portuguese ids without their last token and
        ``pt_out`` the same ids without their first token, so ``pt_out`` is
        ``pt_in`` shifted one position to the left.
    """
    # NOTE(review): the inputs are English and the labels Portuguese, i.e. an
    # en -> pt setup, although the dataset is named "pt_to_en" — confirm this
    # direction is intended.
    # Keep one extra Portuguese token: the shift below removes one token from
    # each of the two Portuguese views, leaving at most `max_tokens`.
    pt_tokens = upstream(pt,"pt")[:,:max_tokens+1]
    en_tokens = upstream(en,"en")[:,:max_tokens]
    # to_tensor() pads short rows with 0, which is the id of "[PAD]" (the
    # first reserved token in both vocabularies).
    model_inputs = (en_tokens.to_tensor(),pt_tokens[:,:-1].to_tensor())
    labels = pt_tokens[:,1:].to_tensor()
    return model_inputs,labels

In [37]:
def _make_batches(raw_ds, *, shuffle, batch_size=64, shuffle_buffer=20000):
    """Batch, tokenize and prefetch a raw (pt, en) sentence-pair dataset.

    Args:
        raw_ds: Dataset of (pt, en) string pairs.
        shuffle: Whether to shuffle the examples before batching.
        batch_size: Number of sentence pairs per batch.
        shuffle_buffer: Shuffle buffer size, used only when ``shuffle`` is set.

    Returns:
        A dataset yielding the ``(inputs, labels)`` structure produced by
        ``preprocess``.
    """
    if shuffle:
        raw_ds = raw_ds.shuffle(shuffle_buffer)
    return (
        raw_ds
        .batch(batch_size)
        # Tokenize batches in parallel; element order is still deterministic
        # by default.
        .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .prefetch(tf.data.AUTOTUNE)
    )


train_ds = _make_batches(train_raw, shuffle=True)
# Validation data is not shuffled: it is only evaluated, so shuffling would
# just spend time and memory filling the buffer and make evaluation batches
# differ from run to run.
valid_ds = _make_batches(valid_raw, shuffle=False)