# Load text

In [1]:
import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [2]:
parent_dir = '../../data/text'
FILE_NAMES = os.listdir(parent_dir)

print('File names: ', FILE_NAMES)

File names:  ['butler.txt', 'derby.txt', 'cowper.txt']


In [3]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)  

labeled_datasets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_datasets.append(labeled_dataset)

In [4]:
 with open(os.path.join(parent_dir, 'butler.txt')) as f:
    first_line = f.readline()
    print("first line: ", first_line)

first line:  ﻿Sing, O goddess, the anger of Achilles son of Peleus, that brought



In [5]:
print(type(labeled_datasets[0].take(1)))

for i in labeled_datasets[0].take(1):
    print(i)

<class 'tensorflow.python.data.ops.dataset_ops.TakeDataset'>
(<tf.Tensor: shape=(), dtype=string, numpy=b'\xef\xbb\xbfSing, O goddess, the anger of Achilles son of Peleus, that brought'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


In [6]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

Merge all the datasets in one macro dataset. Then we will shuffle it.

In [7]:
all_labeled_data = labeled_datasets[0]
for labeled_dataset in labeled_datasets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE,
                                            reshuffle_each_iteration=False
                                           )

In [8]:
for ex in all_labeled_data.take(5):
    print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b"To high Olympus, and the courts of Heav'n.">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Shedding soft tears: hast thou some tidings brought'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"And broke the throttling strap of tough bull's hide.">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'One famed, and one a boxer never foiled;'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'are balanced as it were on the edge of a razor. Go then, for you are'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


#### Encode text lines as numbers

Machine learning models work on numbers, not words, so the string values need to be converted into lists of numbers. To do that, map each unique word to a unique integer.

#### Build the vocabulary

First, build a vocabulary by tokenizing the text into a collection of individual unique words:

  * Iterate over each example's numpy value.
  * Use `tfds.features.text.Tokenizer` to split it into tokens.
  * Collect these tokens into a Python set, to remove duplicates.
  * Get the size of the vocabulary for later use.

In [9]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
print("vocabulary size: ", vocab_size)

vocabulary size:  17178


#### Econde examples

In [10]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [11]:
example_text = next(iter(all_labeled_data))[0].numpy()
print("Example sentence: ", example_text)

Example sentence:  b"To high Olympus, and the courts of Heav'n."


In [12]:
encoded_example = encoder.encode(example_text)
print("Encoded sentence: ", encoded_example)

Encoded sentence:  [9683, 16395, 8977, 8788, 6627, 17057, 1416, 4680, 6074]


Run the encoder on the dataset by wrapping it in `tf.py_function` and passing that to the dataset's map method.

In [13]:
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

In [14]:
def encode_map_fn(text, label):
    encoded_text, label = tf.py_function(encode, 
                                         inp=[text, label], 
                                         Tout=(tf.int64, tf.int64))
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label


all_encoded_data = all_labeled_data.map(encode_map_fn)

#### Split the dataset into test and train batches

In [15]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

Since we have introduced a new token encoding (the zero used for padding), the vocabulary size has increased by one.

In [16]:
vocab_size += 1

#### Build the model

In [17]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dense(3))

In [18]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

This training will not work with TensorFlow 2.0.

The error
```
CancelledError: [Derived]RecvAsync is cancelled.
[[{{node Reshape_17/_52}}]]
[[GroupCrossDeviceControlEdges_0/RMSprop/RMSprop/Const/_57]] [Op:__inference_distributed_function_24912]
 
Function call stack:
distributed_function
```

is fixed with TensorFlow 2.1

**For this training I updated TensorFlow to `tensorflow==2.1.0`**

In [19]:
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [20]:
model.fit(train_data, 
          epochs=3, 
          validation_data=test_data,
          verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f0120083b00>

In [21]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

     79/Unknown - 2s 22ms/step - loss: 0.3519 - accuracy: 0.8404
Eval loss: 0.352, Eval accuracy: 0.840
