In [0]:
#https://www.tensorflow.org/tutorials/load_data/text

This tutorial provides an example of how to use `tf.data.TextLineDataset` to load examples from text files. TextLineDataset is designed to create a dataset from a text file, in which each example is a line of text from the original file. This is potentially useful for any text data that is primarily line-based (for example, poetry or error logs).

In this tutorial, we'll use three different English translations of the same work, Homer's Iliad, and train a model to identify the translator given a single line of text.

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow and tf.keras
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [0]:
import os
import tensorflow_datasets as tfds

In [21]:
# Fetch the three translation files from Google Cloud Storage.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# One get_file call per file; each call returns the local path of that file.
downloaded_paths = [
    tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)
    for name in FILE_NAMES
]

# The loading cell joins parent_dir with every name in FILE_NAMES, so all
# files are expected to share the directory of the last downloaded path.
text_dir = downloaded_paths[-1]
parent_dir = os.path.dirname(text_dir)

parent_dir

'/root/.keras/datasets'

#Load text into datasets

Iterate through the files, loading each one into its own dataset.

Each example needs to be individually labeled, so use tf.data.Dataset.map to apply a labeler function to each one. This will iterate over every example in the dataset, returning (example, label) pairs.

In [0]:
def labeler(example, index):
  """Pair an example with its label.

  Args:
    example: the dataset element to label (passed through unchanged).
    index: the label value; it is cast to tf.int64.

  Returns:
    An (example, label) tuple where label is `index` as a tf.int64 tensor.
  """
  label = tf.cast(index, tf.int64)  # tf.cast changes the dtype of the value
  return example, label

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  # A TextLineDataset yields the lines of the given text file, one element per line.
  file_path = os.path.join(parent_dir, file_name)
  lines_dataset = tf.data.TextLineDataset(file_path)
  # Label every line with the position of its file in FILE_NAMES.
  # (Original note: I think TextLineDataset expects the text to already be split into lines.)
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)


In [23]:
# Only the last expression of a cell is displayed; the first two lines are kept as notes.
lines_dataset #Dataset of raw text lines from the last file read: <TextLineDatasetV2 shapes: (), types: tf.string>
labeled_dataset #The same lines paired with a label: <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>
labeled_data_sets #A list of 3 MapDatasets, one per file in FILE_NAMES

[<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>]

In [0]:
# A bare expression inside a for loop is never displayed by the notebook, so
# show the (label index, file name) pairs as the cell's final expression instead.
list(enumerate(FILE_NAMES))

In [0]:
# Sizes used by the input pipeline cells.
BUFFER_SIZE = 50_000  # buffer size passed to Dataset.shuffle
BATCH_SIZE = 64  # number of examples per padded batch
TAKE_SIZE = 5_000  # number of examples split off with take() for the test set


In [0]:
# Chain the per-file labeled datasets into a single dataset, then shuffle it.
combined = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  combined = combined.concatenate(labeled_dataset)

# buffer_size is the number of elements from which the shuffled dataset samples.
# reshuffle_each_iteration=False keeps the shuffled order the same on every pass.
all_labeled_data = combined.shuffle(
    buffer_size=BUFFER_SIZE, reshuffle_each_iteration=False)


In [27]:
# Peek at a few (example, label) pairs with tf.data.Dataset.take.
# Printing a Tensor shows its value in the `numpy=` field.
# At this point every element is one line of text together with its label.
for text_and_label in all_labeled_data.take(5):
  print(text_and_label)


(<tf.Tensor: shape=(), dtype=string, numpy=b'And the wind filled it. Roared the sable flood'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'knees of Minerva, lay the largest and fairest robe you have in your'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Lived not his lusty manhood to enjoy,'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Fell Stichius and Arcesilas; the one,'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'A heron, by command of Pallas, flew'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


#Encode text lines as numbers

Machine learning models work on numbers, not words, so the string values need to be converted into lists of numbers. To do that, map each unique word to a unique integer.

## Build vocabulary

First, build a vocabulary by tokenizing the text into a collection of individual unique words. There are a few ways to do this in both TensorFlow and Python. For this tutorial:

1. Iterate over each example's numpy value.
2. Use `tfds.features.text.Tokenizer` to split it into tokens.
3. Collect these tokens into a Python set, to remove duplicates.
4. Get the size of the vocabulary for later use.


In [28]:
tokenizer = tfds.features.text.Tokenizer()

# all_labeled_data yields (text, label) pairs; the label says which file the
# line came from and is not needed to build the vocabulary.
vocabulary_set = set()
for line_tensor, _label in all_labeled_data:
  vocabulary_set |= set(tokenizer.tokenize(line_tensor.numpy()))

vocab_size = len(vocabulary_set)
vocab_size

17178

#Okay, so tokenizing text is as simple as:


```
stringToTokenize = "a big horse went over my house and it died hard, now i added a comma"
tokens = tokenizer.tokenize(stringToTokenize)
tokens # <- holds the list of tokens
```
then to remove repeated words:


```
vocabulary_set = set()
vocabulary_set.update(tokens)
vocabulary_set
```




#Encode examples

Create an encoder by passing the vocabulary_set to tfds.features.text.TokenTextEncoder. The encoder's encode method takes in a string of text and returns a list of integers.

In [0]:
# Build the text encoder from the vocabulary; its encode() method turns a string into a list of integers.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)


In [30]:
encoder #A tensorflow_datasets TokenTextEncoder; its repr reports the encoder's own vocab_size

<TokenTextEncoder vocab_size=17180>

In [31]:
# Take the first (text, label) element and keep only the raw bytes of the text.
first_text, _first_label = next(iter(all_labeled_data))
example_text = first_text.numpy()
print(example_text)

b'And the wind filled it. Roared the sable flood'


In [32]:
encoded_example = encoder.encode(example_text)
print(encoded_example) #The tokens were literally converted to numbers.

[8877, 7411, 6862, 16693, 3627, 10645, 7411, 5476, 8013]


In [0]:
def encode(text_tensor, label):
  """Encode one line of text as a list of integer token ids.

  Args:
    text_tensor: tensor holding the text string; its .numpy() value is encoded.
    label: the label of the line, returned unchanged.

  Returns:
    An (encoded_text, label) tuple.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Wrap `encode` so it can be used with `tf.data.Dataset.map`.

  `encode` calls `.numpy()` on its input, so it is run through
  `tf.py_function` (https://www.tensorflow.org/api_docs/python/tf/py_function).

  Args:
    text: scalar string tensor holding one line of text.
    label: scalar integer tensor holding the label of that line.

  Returns:
    An (encoded_text, label) tuple of tf.int64 tensors: a 1-D sequence of
    token ids and a scalar label.
  """
  encoded_text, label = tf.py_function(
      encode, inp=[text, label], Tout=(tf.int64, tf.int64))

  # tf.py_function does not set static shapes on its outputs. Set them by hand
  # so that batching and the Keras layers see a rank-1 id sequence of unknown
  # length and a scalar label.
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label

# Encode every (text, label) pair of the shuffled dataset.
all_encoded_data = all_labeled_data.map(encode_map_fn)



In [54]:
# NOTE: despite the variable name, each element is ordered (text, label):
# index 0 is the text and index 1 is the label.
label_and_text = next(iter(all_labeled_data))
label_and_text[1] # not displayed: only the last expression of a cell is shown

encodedmapfn = encode_map_fn(label_and_text[0],label_and_text[1])
encodedmapfn #Basically returns the text as integers plus the label; note: encodedmapfn[0] is the text and [1] is the label

[<tf.Tensor: shape=(9,), dtype=int64, numpy=array([ 8877,  7411,  6862, 16693,  3627, 10645,  7411,  5476,  8013])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>]

#Split the dataset into test and train batches

Use `tf.data.Dataset.take` and `tf.data.Dataset.skip` to create a small test dataset and a larger training set.

Before being passed into the model, the datasets need to be batched. Typically, the examples inside of a batch need to be the same size and shape. But, the examples in these datasets are not all the same size — each line of text had a different number of words. So use `tf.data.Dataset.padded_batch` (instead of batch) to pad the examples to the same size.

In [0]:
# Pad the id sequence of every example to the longest one in its batch; labels are scalars.
padded_shapes = ([-1], [])

# Training split: everything after the first TAKE_SIZE examples, shuffled before batching.
train_data = (all_encoded_data
              .skip(TAKE_SIZE)
              .shuffle(BUFFER_SIZE)
              .padded_batch(BATCH_SIZE, padded_shapes=padded_shapes))

# Test split: the first TAKE_SIZE examples, so it is much smaller than the training split.
test_data = (all_encoded_data
             .take(TAKE_SIZE)
             .padded_batch(BATCH_SIZE, padded_shapes=padded_shapes))


Now, test_data and train_data are not collections of (example, label) pairs, but collections of batches. Each batch is a pair of (many examples, many labels) represented as arrays.

In [67]:
sample_text, sample_labels = next(iter(test_data))

# Use the same index for the text and the label so the displayed pair belongs
# to one example. Every sample_text row in the batch has the same (padded)
# shape, and the same holds for the labels.
# A full batch holds BATCH_SIZE (64) examples, so BATCH_SIZE - 1 (63) is the
# last valid index; a larger index fails because it is outside the batch.
last_index = BATCH_SIZE - 1
sample_text[last_index], sample_labels[last_index]

(<tf.Tensor: shape=(15,), dtype=int64, numpy=
 array([ 4946,  5877, 15770,  5877, 12744, 13297,     0,     0,     0,
            0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [0]:
# The padding value 0 is an extra id on top of the words in the vocabulary,
# so the embedding needs one more entry than there are words.
# Computed from vocabulary_set (instead of `vocab_size += 1`) so that running
# this cell more than once cannot keep growing the value.
vocab_size = len(vocabulary_set) + 1

#Build the model

In [0]:
model = tf.keras.Sequential()

# The first layer converts integer representations to dense vector embeddings.
# See the word embeddings tutorial for more details.
model.add(tf.keras.layers.Embedding(vocab_size, 64))

# The next layer is a Long Short-Term Memory layer, which lets the model
# understand words in their context with other words. A bidirectional wrapper
# on the LSTM helps it to learn about the datapoints in relationship to the
# datapoints that came before it and after it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Finally we'll have a series of one or more densely connected layers, with the
# last one being the output layer.
# Edit the list in the `for` line to experiment with layer sizes
# (as written it adds two Dense layers of 64 units each).
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer: it produces a probability for each label, and the label with
# the highest probability is the model's prediction. Labels are positions in
# FILE_NAMES, so the number of units is derived from it instead of hard-coded.
model.add(tf.keras.layers.Dense(len(FILE_NAMES), activation='softmax'))





Finally, compile the model. For a softmax categorization model, use sparse_categorical_crossentropy as the loss function. You can try other optimizers, but adam is very common.

In [0]:
# Labels are integer class ids (tf.int64), hence the sparse variant of the loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

#Train the model

In [72]:
#This model running on this data produces decent results (about 83% accuracy).
#test_data is also used as the validation set during training.
model.fit(train_data, epochs=3, validation_data=test_data)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f8f09471f98>

In [73]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
eval_results = model.evaluate(test_data)
eval_loss, eval_acc = eval_results
summary_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(summary_template.format(eval_loss, eval_acc))


     79/Unknown - 3s 37ms/step - loss: 0.4096 - accuracy: 0.8284
Eval loss: 0.410, Eval accuracy: 0.828
