<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/Transformers_subword_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [111]:
import sys,os,warnings
from zipfile import ZipFile
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
!pip3 install -q -U "tensorflow-text==2.13.0"
!pip3 install -q -U einops
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tftext
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow_datasets as tfds
import einops
from IPython.display import clear_output
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default == "plotly_dark"
import matplotlib.pyplot as plt
%matplotlib inline
clear_output()

In [112]:
# Download the Spanish-English sentence-pair archive through the Keras file cache.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
# No `extract=True`: the archive is unpacked exactly once, explicitly, just below.
file_path = keras.utils.get_file(fname="spa-eng.zip",origin=url)
with ZipFile(file_path,"r") as f:
    f.extractall("spa-eng")
# Decode explicitly as UTF-8 so the accented Spanish text reads the same
# regardless of the platform's default encoding.
with open("spa-eng/spa-eng/spa.txt","r",encoding="utf-8") as f:
    text = f.read()

# Every line is "<english>\t<spanish>"; transpose the pairs into two parallel tuples.
en_text,es_text = zip(*[line.split("\t") for line in text.splitlines()])
# Show the first ten pairs as a sanity check.
for en,es in zip(en_text[:10],es_text[:10]):
    print(f"{en} ----> {es}")

Go. ----> Ve.
Go. ----> Vete.
Go. ----> Vaya.
Go. ----> Váyase.
Hi. ----> Hola.
Run! ----> ¡Corre!
Run. ----> Corred.
Who? ----> ¿Quién?
Fire! ----> ¡Fuego!
Fire! ----> ¡Incendio!


# Converting Dataset to vocabulary

In [113]:
# Portuguese->English TED corpus; `as_supervised=True` makes each example a 2-tuple.
examples, ds_info = tfds.load(
    "ted_hrlr_translate/pt_to_en",
    with_info=True,
    as_supervised=True,
)

In [114]:
# Pick out the training and validation splits by name.
train_ds_pt = examples["train"]
valid_ds_pt = examples["validation"]

In [115]:
# Decode and print the first ten (Portuguese, English) sentence pairs.
for pt_sentence, en_sentence in train_ds_pt.take(10):
    pt_str = pt_sentence.numpy().decode()
    en_str = en_sentence.numpy().decode()
    print(pt_str, "----->", en_str)

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade . -----> and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
mas e se estes fatores fossem ativos ? -----> but what if it were active ?
mas eles não tinham a curiosidade de me testar . -----> but they did n't test for curiosity .
e esta rebeldia consciente é a razão pela qual eu , como agnóstica , posso ainda ter fé . -----> and this conscious defiance is why i , as an agnostic , can still have faith .
`` `` '' podem usar tudo sobre a mesa no meu corpo . '' -----> you can use everything on the table on me .
`` eu escrevo muito acerca do `` '' teatro de segurança '' '' , que são produtos que fazem as pessoas sentirem-se seguras mas que , na realidade , não fazem nada . '' -----> `` i write a lot about `` '' security theater , '' '' which are products that make people feel secure , but do n't actually do anything . ''
colocaram-no bem n

In [116]:
# Settings shared by both TED vocabularies: lower-cased text, four reserved
# tokens at the front of the vocab, and a target size of 8000 entries.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
bert_tokenizer_params = {"lower_case": True}
bert_vocab_args = {
    "vocab_size": 8000,
    "reserved_tokens": reserved_tokens,
    "bert_tokenizer_params": bert_tokenizer_params,
    "learn_params": {},
}

In [117]:
# Learn the Portuguese vocabulary from the first element of every pair.
# Batching lets the vocab builder process 1000 sentences per step instead of
# one at a time; the word counts are unchanged by how the data is grouped.
pt_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ds_pt.map(lambda pt, en: pt).batch(1000).prefetch(2),
    **bert_vocab_args
)

In [118]:
# Learn the English vocabulary from the second element of every pair.
# Batching lets the vocab builder process 1000 sentences per step instead of
# one at a time; the word counts are unchanged by how the data is grouped.
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ds_pt.map(lambda pt, en: en).batch(1000).prefetch(2),
    **bert_vocab_args
)

In [119]:
# Sample the Portuguese vocab at its head, two interior windows, and its tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(pt_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['no', 'por', 'mais', 'na', 'eu', 'esta', 'muito', 'isso', 'isto', 'sao']
['90', 'desse', 'efeito', 'malaria', 'normalmente', 'palestra', 'recentemente', '##nca', 'bons', 'chave']
['##–', '##—', '##‘', '##’', '##“', '##”', '##⁄', '##€', '##♪', '##♫']


In [120]:
# Sample the English vocab at its head, two interior windows, and its tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['as', 'all', 'at', 'one', 'people', 're', 'like', 'if', 'our', 'from']
['choose', 'consider', 'extraordinary', 'focus', 'generation', 'killed', 'patterns', 'putting', 'scientific', 'wait']
['##_', '##`', '##ย', '##ร', '##อ', '##–', '##—', '##’', '##♪', '##♫']


In [121]:
# Write each vocabulary one token per line. The vocab holds non-ASCII tokens,
# so the files are written as UTF-8 explicitly instead of relying on the
# platform's default encoding.
for vocab_path, vocab in (
    ("/content/pt_vocab.txt", pt_vocab),
    ("/content/en_vocab.txt", en_vocab),
):
    with open(vocab_path,"w",encoding="utf-8") as f:
        for token in vocab:
            print(token,file=f)

In [122]:
def standardize(sentence:"tf.Tensor"):
    """Clean raw text and wrap it in ``[START]`` / ``[END]`` markers.

    Used as the ``standardize`` callable of the ``TextVectorization`` layers
    built right after this cell. NOTE: a later cell redefines ``standardize``
    without the markers; layers built from this version keep their reference
    to it, but the module-level name is rebound.

    Args:
        sentence: string tensor (or array-like of strings accepted by the
            ``tf.strings`` ops); the ops below are applied element-wise.

    Returns:
        String tensor with the cleaned text, prefixed with ``[START]`` and
        suffixed with ``[END]``, separated by single spaces.
    """
    # NFKD normalization, then lower-casing.
    sentence = tftext.normalize_utf8(sentence,"NFKD")
    sentence = tf.strings.lower(sentence)
    # Keep only spaces, a-z and the listed punctuation; digits, apostrophes
    # and any other characters are removed.
    sentence = tf.strings.regex_replace(sentence,r"[^ a-z.,?!¡]","")
    # Surround the kept punctuation with spaces so it separates from words.
    sentence = tf.strings.regex_replace(sentence,r"[.,?!¡]",r" \0 ")
    sentence = tf.strings.strip(sentence)
    # Add the sentence markers after cleaning so their brackets survive.
    sentence = tf.strings.join(["[START]",sentence,"[END]"]," ")
    return sentence

In [123]:
# One ragged, 8000-token vectorizer per language, each cleaning its input
# with `standardize` and fitted on that language's sentences.
en_vec_layer = keras.layers.TextVectorization(
    max_tokens=8000, standardize=standardize, ragged=True
)
en_vec_layer.adapt(np.array(en_text))

es_vec_layer = keras.layers.TextVectorization(
    max_tokens=8000, standardize=standardize, ragged=True
)
es_vec_layer.adapt(np.array(es_text))

In [124]:
# Display the last ten entries of the English vectorizer's vocabulary.
en_vec_layer.get_vocabulary()[-10:]

['landlord',
 'lame',
 'lamb',
 'lakes',
 'lagoon',
 'lactose',
 'kumi',
 'kublai',
 'koalas',
 'knitted']

In [125]:
# Turn the sentence tuples into NumPy arrays (sliced and fed to TF ops below).
en_text = np.array(en_text)
es_text = np.array(es_text)

In [126]:
def standardize(sentence:"tf.Tensor"):
    """Clean raw text for vocabulary building and tokenization (no markers).

    Rebinds the name ``standardize``: this version does not add
    ``[START]`` / ``[END]``. It is mapped over the ``tf.data`` datasets and
    also called directly on slices of the text arrays further down.

    Args:
        sentence: string tensor (or array-like of strings accepted by the
            ``tf.strings`` ops); the ops below are applied element-wise.

    Returns:
        String tensor holding the cleaned text.
    """
    # NFKD normalization, then lower-casing.
    sentence = tftext.normalize_utf8(sentence,"NFKD")
    sentence = tf.strings.lower(sentence)
    # Keep only spaces, a-z and the listed punctuation; digits, apostrophes
    # and any other characters are removed.
    sentence = tf.strings.regex_replace(sentence,r"[^ a-z.,?!¡]","")
    # Surround the kept punctuation with spaces so it separates from words.
    sentence = tf.strings.regex_replace(sentence,r"[.,?!¡]",r" \0 ")
    sentence = tf.strings.strip(sentence)
    return sentence

In [127]:
# One dataset of cleaned sentences per language; `standardize` is applied
# to every element through `map`.
en_ds = tf.data.Dataset.from_tensor_slices(en_text).map(standardize)
spa_ds = tf.data.Dataset.from_tensor_slices(es_text).map(standardize)

In [128]:
# Settings for the Spanish/English vocabularies. This rebinds
# `bert_tokenizer_params` and `bert_vocab_args` from the TED section:
# NFKD normalization and a smaller target size of 5000 entries.
bert_tokenizer_params = {"normalization_form": "NFKD"}
bert_vocab_args = {
    "vocab_size": 5000,
    "reserved_tokens": ["[PAD]", "[UNK]", "[START]", "[END]"],
    "bert_tokenizer_params": bert_tokenizer_params,
    "learn_params": {},
}

In [129]:
# Learn the Spanish vocabulary from the cleaned sentences. Batching lets the
# vocab builder process 1000 sentences per step instead of one at a time;
# the word counts are unchanged by how the data is grouped.
spa_vocab = bert_vocab.bert_vocab_from_dataset(
    spa_ds.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [130]:
# Learn the English vocabulary of the spa-eng corpus. NOTE: this rebinds
# `en_vocab`, which previously held the TED English vocabulary.
# Batching lets the vocab builder process 1000 sentences per step instead of
# one at a time; the word counts are unchanged by how the data is grouped.
en_vocab = bert_vocab.bert_vocab_from_dataset(
    en_ds.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [131]:
# Size of the Spanish vocab, then samples from its head, middle and tail.
print(len(spa_vocab))
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(spa_vocab[window])

4870
['[PAD]', '[UNK]', '[START]', '[END]', '!', ',', '.', '?', 'a', 'b']
['puede', 'he', 'bien', 'estas', 'mucho', '##mos', '##te', 'ellos', 'nos', 'quien']
['supongo', 'caliente', 'cielo', 'empezar', 'jefe', 'mirando', 'ninguno', 'rojo', 'viendo', '##u']
['volviera', '##!', '##,', '##.', '##?', '##f', '##j', '##q', '##v', '##¡']


In [132]:
# Size of the English vocab, then samples from its head, middle and tail.
print(len(en_vocab))
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])

4563
['[PAD]', '[UNK]', '[START]', '[END]', '!', ',', '.', '?', 'a', 'b']
['one', 'doesnt', 'going', 'by', 'would', 'why', 'come', 'see', 'good', 'ill']
['mustve', 'novel', 'shot', 'surprise', 'taxi', 'voice', '##man', 'cooking', 'enemy', 'honest']
['yell', 'youngest', 'youth', '##!', '##,', '##.', '##?', '##j', '##q', '##v']


In [133]:
# Write each vocabulary one token per line. The vocab holds non-ASCII tokens
# (e.g. "¡"), so the files are written as UTF-8 explicitly instead of relying
# on the platform's default encoding.
for vocab_path, vocab in (
    ("/content/spa_vocab.txt", spa_vocab),
    ("/content/en_spa_vocab.txt", en_vocab),
):
    with open(vocab_path,"w",encoding="utf-8") as f:
        for token in vocab:
            print(token,file=f)

In [134]:
# One BERT tokenizer per language, each reading its vocab file and sharing
# the tokenizer settings used when the vocabularies were learned.
en_tokenizer = tftext.BertTokenizer(
    "/content/en_spa_vocab.txt", **bert_tokenizer_params
)
spa_tokenizer = tftext.BertTokenizer(
    "/content/spa_vocab.txt", **bert_tokenizer_params
)

In [135]:
# Show the first ten cleaned Spanish sentences as raw string tensors.
for es_sentence in spa_ds.take(10):
    print(es_sentence)

tf.Tensor(b've .', shape=(), dtype=string)
tf.Tensor(b'vete .', shape=(), dtype=string)
tf.Tensor(b'vaya .', shape=(), dtype=string)
tf.Tensor(b'vayase .', shape=(), dtype=string)
tf.Tensor(b'hola .', shape=(), dtype=string)
tf.Tensor(b'\xc2\xa1 corre !', shape=(), dtype=string)
tf.Tensor(b'corred .', shape=(), dtype=string)
tf.Tensor(b'quien ?', shape=(), dtype=string)
tf.Tensor(b'\xc2\xa1 fuego !', shape=(), dtype=string)
tf.Tensor(b'\xc2\xa1 incendio !', shape=(), dtype=string)


In [136]:
# Show the first ten cleaned English sentences as raw string tensors.
for en_sentence in en_ds.take(10):
    print(en_sentence)

tf.Tensor(b'go .', shape=(), dtype=string)
tf.Tensor(b'go .', shape=(), dtype=string)
tf.Tensor(b'go .', shape=(), dtype=string)
tf.Tensor(b'go .', shape=(), dtype=string)
tf.Tensor(b'hi .', shape=(), dtype=string)
tf.Tensor(b'run !', shape=(), dtype=string)
tf.Tensor(b'run .', shape=(), dtype=string)
tf.Tensor(b'who ?', shape=(), dtype=string)
tf.Tensor(b'fire !', shape=(), dtype=string)
tf.Tensor(b'fire !', shape=(), dtype=string)


In [153]:
# Ten cleaned Spanish sentences (rows 1000-1009) as a string tensor.
es_examples = tf.convert_to_tensor(
    standardize(es_text[1000:1010])
)

In [154]:
# Tokenize, then merge the two innermost ragged axes so every sentence
# becomes a single flat row of token ids.
es_tokenized = spa_tokenizer.tokenize(es_examples)
es_tokenized = es_tokenized.merge_dims(-2, -1)
es_tokenized

<tf.RaggedTensor [[37, 42, 11, 4866, 6], [34, 37, 46, 77, 4], [37, 42, 14, 1566, 4656, 6],
 [37, 42, 981, 6], [37, 42, 14, 4107, 3382, 6],
 [37, 42, 45, 832, 4867, 4387, 6], [37, 42, 427, 6],
 [37, 44, 25, 136, 233, 6], [48, 163, 37, 6], [48, 2184, 6]]>

In [155]:
# Print the token ids one sentence per line.
for id_row in es_tokenized.to_list():
    print(id_row)

[37, 42, 11, 4866, 6]
[34, 37, 46, 77, 4]
[37, 42, 14, 1566, 4656, 6]
[37, 42, 981, 6]
[37, 42, 14, 4107, 3382, 6]
[37, 42, 45, 832, 4867, 4387, 6]
[37, 42, 427, 6]
[37, 44, 25, 136, 233, 6]
[48, 163, 37, 6]
[48, 2184, 6]


In [156]:
# Looking the ids up directly in the vocab shows the raw word pieces:
# continuation pieces keep their "##" prefix and stay space-separated.
wordpiece_rows = tf.gather(spa_vocab, es_tokenized)
for joined in tf.strings.reduce_join(wordpiece_rows, axis=-1, separator=" "):
    print(joined.numpy().decode())

el es d ##j .
¡ el esta aqui !
el es g ##en ##til .
el es amable .
el es g ##ene ##roso .
el es me ##z ##q ##uino .
el es alto .
el se r ##e ##ia .
lo hizo el .
lo logro .


In [157]:
# Let the tokenizer reassemble the pieces into whole words, then join each
# row into one sentence string.
es_words = spa_tokenizer.detokenize(es_tokenized)
for joined in tf.strings.reduce_join(es_words, axis=-1, separator=" "):
    print(joined.numpy().decode())

el es dj .
¡ el esta aqui !
el es gentil .
el es amable .
el es generoso .
el es mezquino .
el es alto .
el se reia .
lo hizo el .
lo logro .


In [142]:
# Print the last ten raw English sentences.
for en_sentence in en_text[-10:]:
    print(en_sentence)

You can't view Flash content on an iPad. However, you can easily email yourself the URLs of these web pages and view that content on your regular computer when you get home.
A mistake young people often make is to start learning too many languages at the same time, as they underestimate the difficulties and overestimate their own ability to learn them.
No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.
In 1969, Roger Miller recorded a song called "You Don't Want My Love." Today, this song is better known as "In the Summer Time." It's the first song he wrote and sang that became popular.
A child who is a native speaker usually knows many things about his or her language that a non-native speaker who has been studying for years still does not know and perhaps will never know.
There are four main causes of alcohol-related death. Injury from car accidents or v

In [143]:
# Clean and tokenize the last ten English sentences, flattening the two
# innermost ragged axes so each sentence is one row of ids.
en_examples = en_tokenizer.tokenize(standardize(en_text[-10:])).merge_dims(-2, -1)
for id_row in en_examples.to_list():
    print(id_row)

[36, 96, 1572, 13, 4252, 3586, 59, 98, 2698, 6, 2295, 5, 36, 71, 969, 1659, 409, 34, 28, 510, 1712, 41, 245, 2864, 2825, 65, 1572, 42, 3586, 59, 58, 3655, 612, 94, 36, 92, 140, 6]
[8, 498, 357, 141, 293, 164, 38, 35, 442, 873, 127, 147, 945, 67, 34, 303, 79, 5, 82, 78, 3483, 34, 3064, 65, 212, 1108, 3723, 258, 381, 1894, 35, 331, 222, 6]
[99, 423, 80, 133, 36, 294, 35, 1196, 141, 42, 1505, 38, 29, 994, 2328, 515, 5, 944, 189, 60, 1505, 5, 286, 906, 36, 217, 3089, 35, 1196, 409, 65, 8, 353, 634, 42, 76, 29, 994, 2328, 515, 6]
[40, 5, 25, 2885, 309, 20, 2328, 309, 2199, 153, 8, 518, 445, 36, 52, 62, 49, 182, 6, 174, 5, 47, 518, 38, 219, 762, 82, 40, 34, 524, 79, 6, 76, 34, 230, 518, 39, 658, 65, 1334, 42, 586, 1110, 6]
[8, 410, 113, 38, 8, 1103, 1394, 475, 292, 147, 291, 87, 55, 207, 73, 506, 42, 8, 99, 552, 3971, 1394, 113, 74, 114, 533, 50, 213, 189, 159, 63, 68, 65, 1637, 75, 124, 68, 6]
[77, 53, 670, 2141, 2490, 41, 2404, 1576, 742, 1319, 574, 6, 40, 4560, 2478, 95, 137, 1748, 207, 1

In [144]:
# Round-trip check: detokenize the ids and join every row back into a sentence.
en_words = en_tokenizer.detokenize(en_examples)
for joined in tf.strings.reduce_join(en_words, axis=-1, separator=" "):
    print(joined.numpy().decode())

you cant view flash content on an ipad . however , you can easily email yourself the urls of these web pages and view that content on your regular computer when you get home .
a mistake young people often make is to start learning too many languages at the same time , as they underestimate the difficulties and overestimate their own ability to learn them .
no matter how much you try to convince people that chocolate is vanilla , itll still be chocolate , even though you may manage to convince yourself and a few others that its vanilla .
in , roger miller recorded a song called you dont want my love . today , this song is better known as in the summer time . its the first song he wrote and sang that became popular .
a child who is a native speaker usually knows many things about his or her language that a nonnative speaker who has been studying for years still does not know and perhaps will never know .
there are four main causes of alcoholrelated death . injury from car accidents or vi

In [145]:
# Id of "[START]": its position in `reserved_tokens`, found as the argmax of
# an element-wise equality mask.
is_start = tf.equal(tf.constant(reserved_tokens), "[START]")
START = tf.argmax(is_start)
START

<tf.Tensor: shape=(), dtype=int64, numpy=2>

In [146]:
# Id of "[END]": its position in `reserved_tokens`, found as the argmax of
# an element-wise equality mask.
is_end = tf.equal(tf.constant(reserved_tokens), "[END]")
END = tf.argmax(is_end)
END

<tf.Tensor: shape=(), dtype=int64, numpy=3>

In [147]:
# (10, 1) columns filled with the marker ids; the hard-coded 10 has to match
# the number of rows they are concatenated with further down.
end_token = tf.fill(value=END, dims=[10, 1])
start_token = tf.fill(value=START, dims=[10, 1])

In [148]:
# Display the column of [START] ids built in the previous cell.
start_token

<tf.Tensor: shape=(10, 1), dtype=int64, numpy=
array([[2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2]])>

In [149]:
# Frame every id row with the [START]/[END] ids, detokenize, and print one
# sentence per line.
framed_ids = tf.concat([start_token, es_tokenized, end_token], axis=-1)
framed_words = spa_tokenizer.detokenize(framed_ids)
for line in tf.strings.reduce_join(framed_words, separator=" ", axis=-1).numpy():
    print(line.decode())

[START] ve . [END]
[START] vete . [END]
[START] vaya . [END]
[START] vayase . [END]
[START] hola . [END]
[START] ¡ corre ! [END]
[START] corred . [END]
[START] quien ? [END]
[START] ¡ fuego ! [END]
[START] ¡ incendio ! [END]


In [150]:
# Detokenize the English ids into a ragged tensor of words (one row per
# sentence) and display it; `all_words` is filtered in the next cell.
all_words = en_tokenizer.detokenize(en_examples)
all_words

<tf.RaggedTensor [[b'you', b'cant', b'view', b'flash', b'content', b'on', b'an', b'ipad',
  b'.', b'however', b',', b'you', b'can', b'easily', b'email', b'yourself',
  b'the', b'urls', b'of', b'these', b'web', b'pages', b'and', b'view',
  b'that', b'content', b'on', b'your', b'regular', b'computer', b'when',
  b'you', b'get', b'home', b'.']                                           ,
 [b'a', b'mistake', b'young', b'people', b'often', b'make', b'is', b'to',
  b'start', b'learning', b'too', b'many', b'languages', b'at', b'the',
  b'same', b'time', b',', b'as', b'they', b'underestimate', b'the',
  b'difficulties', b'and', b'overestimate', b'their', b'own', b'ability',
  b'to', b'learn', b'them', b'.']                                        ,
 [b'no', b'matter', b'how', b'much', b'you', b'try', b'to', b'convince',
  b'people', b'that', b'chocolate', b'is', b'vanilla', b',', b'itll',
  b'still', b'be', b'chocolate', b',', b'even', b'though', b'you', b'may',
  b'manage', b'to', b'convince', 

In [151]:
import re
# Regex alternation matching every reserved token except "[UNK]", escaped so
# the square brackets are taken literally.
bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
bad_words = "|".join(bad_tokens)
# Flag words that are exactly a reserved token, keep everything else, and
# join each remaining row into one sentence string.
mask = tf.strings.regex_full_match(all_words, bad_words)
filtered_words = tf.ragged.boolean_mask(all_words, tf.math.logical_not(mask))
tf.strings.reduce_join(filtered_words, axis=-1, separator=" ")

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'you cant view flash content on an ipad . however , you can easily email yourself the urls of these web pages and view that content on your regular computer when you get home .',
       b'a mistake young people often make is to start learning too many languages at the same time , as they underestimate the difficulties and overestimate their own ability to learn them .',
       b'no matter how much you try to convince people that chocolate is vanilla , itll still be chocolate , even though you may manage to convince yourself and a few others that its vanilla .',
       b'in , roger miller recorded a song called you dont want my love . today , this song is better known as in the summer time . its the first song he wrote and sang that became popular .',
       b'a child who is a native speaker usually knows many things about his or her language that a nonnative speaker who has been studying for years still does not know and perhaps wil

In [152]:
# List the working directory to confirm the vocab files and extracted data exist.
!ls

en_spa_vocab.txt  pt_vocab.txt	spa-eng
en_vocab.txt	  sample_data	spa_vocab.txt
