In [1]:
#Basic importations
import numpy as np
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf


# This is mainly for deep understanding of paper Attention is all you need 
## Check it out here : https://arxiv.org/abs/1706.03762 .  
Basically is creating a transformer to be able to translate words from english to portuguese in this specific case.


In [2]:
#We use the data provided by Ted fundation in tensorflow_datasets
examples, metadata = tfds.load('ted_hrlr_translate/es_to_pt', with_info=True,
                               as_supervised=True)



train_examples, val_examples = examples['train'], examples['validation']

In [15]:
#we will call data a class to not have a random global variable 
class Data():
    def __init__(self):
        #alternitavely we can ask in innit what language we want to train
        i=0
        #WE WILL GET THE DATASET IN WORKING CONDITIONS EVERY LIST(TRAIN_EXMAPLES[I]) GIVES US A (EXAMPLE 1, EXAMPLE 2)
        pt=[]
        es=[]
        #WE NEED TO DECODE IT TO UTF-8 SO IT IS PRESENTABLE AND WE ONLY WANT THE NUMPY
        for es_example, pt_example in list(train_examples):
            pt.append((pt_example.numpy()).decode('utf-8'))
            es.append((es_example.numpy()).decode('utf-8'))
        self.pt=pt
        self.es=es
        #And with this we have or data


Now we need to think about input embedding .
Because obviously computers dont understand strings, we need some kind of input. We will build an embedding from zero
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset -> will help us generate a word for word construction
But for this case we need it to be object database so Data() is not useful in this case, although we will use it later


BertTokenizer is a type of embedding described in detail in https://arxiv.org/abs/1810.04805, we will not go in detail, we will assume its effectivness in embedding (as they describe is a bidirectional representation from unlabeled text )




In [16]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert

In [17]:
#Datos=Data()
# in this case lets generate a vocabulary of 5000 words
# WE USE MAP AND LAMBDA MAP MAPS ONTO ARRAY AND SIMPLY LAMBDA ES,PT: ES IS SETTING FROM BOTH ES AND PT SETTING THE RESULT TO ES
# THE RESEVED TOKENS ARE SIMPLY START FINISH conditions (recall we need to start and finish the input and output)
es_vocab = bert.bert_vocab_from_dataset(train_examples.map(lambda es,pt:es),vocab_size =5000,reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"])
pt_vocab = bert.bert_vocab_from_dataset(train_examples.map(lambda es,pt:pt),vocab_size =5000,reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"])


KeyboardInterrupt: 

In [None]:

def write_vocab_file(filepath, vocab):
    with open(filepath, 'w',encoding='utf-8') as f:
        for token in vocab:
              print(token, file=f)

In [None]:
write_vocab_file('es_vocab.txt',es_vocab)
write_vocab_file('pt_vocab.txt',pt_vocab)



And with this we have already created a vocabulary. To  encapsulate everything we could put it in a class in this way.

In [8]:
class Vocabgen:
    def __init__(self,filename1,filename2,num):
        es_vocab = bert.bert_vocab_from_dataset(train_examples.map(lambda es,pt:es),vocab_size =num,reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"])
        pt_vocab = bert.bert_vocab_from_dataset(train_examples.map(lambda es,pt:pt),vocab_size =num,reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"])
        #WE CREATE THE VOCABS AND WE WRITE THEM
        write_vocab_file(filename1,es_vocab)
        write_vocab_file(filename2,pt_vocab)
        #We could make it even better so that given two languages we create the vocabs but is not necessary at the moment
        
    def write_vocab_file(filepath, vocab):
        with open(filepath, 'w',encoding='utf-8') as f:
            for token in vocab:
                  print(token, file=f)

In [11]:
Vocabgen('es_vocab.txt','pt_vocab.txt',5000)

<__main__.Vocabgen at 0x1c56b50d550>

Now we can easily create the tokenizers using tensorflow_text.BertTokenizer(atext, **kargs)

In [18]:
class Tokenizer:
    def __init__(self):
        #the tokenizer class will have tokens of class BertTokenizer as .pr .es 
        self.pt = text.BertTokenizer('pt_vocab.txt', **{})
        self.es =text.BertTokenizer('es_vocab.txt', **{})

def add_start_end(ragged): #this function has been copied from tensorflow  guide to subwords
        #will add the start and end
        
        reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]
        START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
        END = tf.argmax(tf.constant(reserved_tokens) == "[END]")
        count = ragged.bounding_shape()[0]
        starts = tf.fill([count,1], START)
        ends = tf.fill([count,1], END)
        return tf.concat([starts, ragged, ends], axis=1)
        
#AND WE CREATE ANOTHER CLASS THAT CONTAINS OUR TOKENIZER TO TOKENIZE AND DE-TOKENIZE
class Embedding:
    Tokens=Tokenizer()
        #tokenize ---->
        #This returns an output of (batch,word,wordpiece)
        #What we mean is that we have -> I am good. output-> [ [ [I(values)] , [am(values)] , [good(values)]] ]
    
    def detokenize(self,detokenizedata):
        detokenizept=(Tokens.pt.detokenize(detokenizedata[1]))
        detokenizees=(Tokens.es.detokenize(detokenizedata[0]))
        
        detokenizees= tf.strings.reduce_join(detokenizees, separator=' ', axis=-1)
        detokenizept= tf.strings.reduce_join(detokenizept, separator=' ', axis=-1)
        merged=[]
        merged.append(detokenizees)
        merged.append(detokenizept)
        return merged
    def tokenize(self,tokenizedata):
        tokenizept=(Tokens.pt.tokenize(tokenizedata[1])).merge_dims(1,2)
        tokenizees=(Tokens.es.tokenize(tokenizedata[0])).merge_dims(1,2)
        #this part is for preperation for our purposes
        tokenizept=(add_start_end(tokenizept)).to_tensor()# we apply to tensor to make the output a normal tensor
        tokenizees=(add_start_end(tokenizees)).to_tensor()# we apply to tensor to make the output a normal tensor
        
        #THIS TO TENSOR WILL MAKE THAT ON TH OUTPUT WE GET A LOT OF PADDING but thats easily removable
        
        merged=[]
        merged.append(tokenizees)
        merged.append(tokenizept)
        return merged
    
    
    
    
    

In [19]:
Datos=Data()
Embed=Embedding()

To show the conversions lets do a simple example

In [21]:
test=[]
testes=[]
testpt=[]
testpt.append(Datos.pt[12])#WE INCLUDE TWO EXAMPLES SO WE CHECK VARIABILITY OF BATCH SIZE
testpt.append(Datos.pt[13])
testes.append(Datos.es[13])
testes.append(Datos.es[12])
print(Datos.pt[12])
test.append(testes)
test.append(testpt)
tokenizeddata=Embed.tokenize(test)
#We will try to detokenize to see if its working!
print(tokenizeddata)

print((Embed.detokenize(tokenizeddata)[1].numpy())[1].decode('utf-8')) #this goes to the pt detokenize data and decodes 
#it in a tensor
#then we are specifically getting the first text and decoding it  
print((Datos.pt[13]))
#ITS REALLY IMPORTAnt to decode de output, it will be coded in utf-8 to make it leggible we need to decode 
#AND as it can be seen we have a desired output only some random spaces that we do not need to bother about as differences
#IT IS ALSO INCLUDED THE START AND END WHICH IS EXACTLY WHAT WE NEED



`` eles não dizem : `` '' quero água quente nos chuveiros . ''
[<tf.Tensor: shape=(2, 23), dtype=int64, numpy=
array([[   2,  282,   16,   10,  281,   11,    3,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [   2,   38,   38,  197,  776,   14,   38,   38,    9,    9,  324,
         333,  442, 2805,  191,  200,   42, 1070, 2101,   16,    9,    9,
           3]], dtype=int64)>, <tf.Tensor: shape=(2, 24), dtype=int64, numpy=
array([[   2,   39,   39,  162,  117,  658,   28,   39,   39,    9,    9,
         264,  373, 1950,  133,   42,  523,  201,  557, 1674,   16,    9,
           9,    3],
       [   2,  282,   16,   10,  209,   11,    3,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]], dtype=int64)>]
[START] obrigado . ( aplausos ) [END] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
obriga

## Usage of class Embed
Embed has Embed.tokenizedata(data) ->(tokenizedspanish,tokenizedpt)as dtype= Tensor


Embed has Embed.detokenizedata(data) ->(detokenizedspanish,detokenizedpt) as dtype= Tensor


Be careful with detokenized as it should be decoded again!
For now they are only crated for the specific case of spanish and pt.

It will include de START AND END ALSO in both 

## Usage of class Data
Data() -> Data.pt ->(Data pt)


       -> Data.es ->(Data es) 


# ALL IS HEAVILY INFLUENCED BY https://www.tensorflow.org/text/guide/subwords_tokenizer?hl=es-419%C3%A7

As it is just a learning project