#### Setup

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text
import numpy as np
import collections

### Load Data

In [None]:
# Remote location of the text files and the local directory to store them in.
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']
dir_name = '/content/'

# Download every file and collect the local paths returned by get_file,
# in the same order as file_names.
dataset_dir = []
for remote_name in file_names:
    local_path = tf.keras.utils.get_file(
        fname=remote_name,
        origin=data_url + remote_name,
        cache_dir=dir_name,
        cache_subdir='',
    )
    dataset_dir.append(local_path)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
[1m815980/815980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
[1m809730/809730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt
[1m807992/807992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


### Preprocess Data
- Add labels
- Combine all the data into one single dataset
- Shuffle data to ensure uniformity during training/validation/test split

In [None]:
def _make_labeler(label_index):
    """Return a map function that pairs a text line with a fixed label.

    Using a factory gives each file its own closure over `label_index`, so the
    label does not depend on a loop variable that keeps changing after the
    function has been created.

    Args:
        label_index: Python int identifying the source file of the lines.

    Returns:
        A function mapping `text_line` to `(text_line, label)` where `label`
        is `label_index` cast to tf.int64.
    """
    def add_label(text_line):
        return text_line, tf.cast(label_index, dtype=tf.int64)

    return add_label


# Label every dataset line with the position of its file in dataset_dir.
# (The loop variable is deliberately not called `id`, which would shadow the
# builtin for the rest of the notebook.)
labeled_ds = []
for label_index, filepath in enumerate(dataset_dir):
    text_ds = tf.data.TextLineDataset(filepath)
    labeled_ds.append(text_ds.map(_make_labeler(label_index)))

# Concatenate everything into a single dataset
final_ds = labeled_ds[0]
for ds in labeled_ds[1:]:
    final_ds = final_ds.concatenate(ds)

#### Shuffle

In [None]:
# Shuffle with a fixed seed. reshuffle_each_iteration=False keeps the same
# order on every pass over the dataset.
final_ds = final_ds.shuffle(
    buffer_size=50000,
    seed=42,
    reshuffle_each_iteration=False,
)

# Preview the first ten shuffled lines with their character count and label.
for line, line_label in final_ds.take(10):
    sentence = str(np.char.decode(line.numpy()))
    print(f"Sentence [{len(sentence)}]: {sentence}, Label: {line_label.numpy()}")

Sentence [22]: taught to use the bow., Label: 2
Sentence [43]: This said, he sat; and Atreus' godlike son,, Label: 1
Sentence [39]: Is gone to Chrysa, and with her we send, Label: 0
Sentence [67]: He cut the boar's throat as he spoke, whereon Talthybius whirled it, Label: 2
Sentence [50]: Redden'd the east, then, thronging forth, all Troy, Label: 0
Sentence [44]: Far off, the lowest abyss beneath the earth,, Label: 1
Sentence [68]: homed stag or wild goat--he has taken shelter under rock or thicket,, Label: 2
Sentence [46]: By Trojan hands, within their fleet they stood, Label: 0
Sentence [43]: Didst to our bravest wrong, dishon'ring him, Label: 1
Sentence [69]: by the ship of Achilles, and though it is now twelve days that he has, Label: 2


## Create Vocabulary

#### Create tokenizer

- Create custom tokenizer that lowercases and tokenizes the dataset.

In [None]:
class MyTokenizer(tf.keras.Layer):
    """Keras layer that case-folds UTF-8 text and splits it into tokens."""

    def __init__(self) -> None:
        super().__init__()
        self.tokenizer = tf_text.UnicodeScriptTokenizer()

    def call(self, text):
        """Tokenize `text` after case folding.

        Args:
            text: string tensor holding one sentence or a batch of sentences.

        Returns:
            The tokens as a dense tensor. When tokenization yields a
            RaggedTensor (batched input), it is converted with `to_tensor()`.
        """
        tokens = self.tokenizer.tokenize(tf_text.case_fold_utf8(text))
        if not isinstance(tokens, tf.RaggedTensor):
            return tokens
        # A batch of text produces a RaggedTensor; densify it.
        return tokens.to_tensor()


tokenizer = MyTokenizer()

In [None]:
# Tokenize each line; the label is passed through untouched.
tokenized_ds = final_ds.map(
    lambda line, line_label: (tokenizer(line), line_label)
)

# Grab the first example to check what the tokenizer produces.
tokens, label = next(iter(tokenized_ds.take(1)))
print(f"Tokenized Sentence: {tokens}")
print(f"Label: {label}")

Tokenized Sentence: [b'taught' b'to' b'use' b'the' b'bow' b'.']
Label: 2


#### Configure dataset for performance (caching and prefetching)

In [None]:
# Cache the tokenized examples so later passes do not re-run the tokenizer,
# and prefetch upcoming elements with a buffer size picked by tf.data.AUTOTUNE.
tokenized_ds = tokenized_ds.cache().prefetch(tf.data.AUTOTUNE)

### Create vocabulary

- Create a frequency dictionary of every token in the dataset.
- sort tokens in the vocabulary by frequency.
- keep the top VOCAB_SIZE tokens.

In [None]:
VOCAB_SIZE = 10000

# Tally how often each token occurs, reading 1000 examples at a time.
vocab_count = collections.Counter()
for token_batch, _labels in tokenized_ds.ragged_batch(1000):
    # Flatten the batch into a 1-D tensor of tokens and count them in one call.
    vocab_count.update(tf.reshape(token_batch, [-1]).numpy())

# Keep the VOCAB_SIZE most frequent tokens, most frequent first.
vocabulary = [token for token, _count in vocab_count.most_common(VOCAB_SIZE)]

In [None]:
# Show the 20 most frequent tokens.
top_entries = vocabulary[:20]
print(f"Vocab entries: {top_entries}")

Vocab entries: [b',', b'the', b'and', b"'", b'of', b'.', b'to', b'd', b';', b'his', b'he', b'in', b'with', b'a', b'him', b'-', b'from', b'for', b'but', b'i']


#### Assign ID to vocabulary

In [None]:
class MyVocabTable(tf.keras.Layer):
    """Keras layer that maps string tokens to integer ids.

    The empty string is mapped to id 0 and the entries of `vocabulary` to ids
    1..len(vocabulary), in order. Any other token falls into the single
    out-of-vocabulary bucket, whose id is len(vocabulary) + 1.
    """

    def __init__(self, vocabulary):
        super().__init__()
        # Reserve id 0 for the empty string; real tokens start at id 1.
        self.keys = [''] + vocabulary
        self.values = range(len(self.keys))

        self.init = tf.lookup.KeyValueTensorInitializer(
            self.keys,
            self.values,
            key_dtype=tf.string,
            value_dtype=tf.int64,
        )

        # Tokens that are not in self.keys are assigned a bucket id in
        # [len(self.keys), len(self.keys) + num_oov_buckets - 1], computed as
        # hash(<term>) % num_oov_buckets + len(self.keys).
        num_oov_buckets = 1

        # String-to-id table that sends out-of-vocabulary keys to hash buckets.
        self.table = tf.lookup.StaticVocabularyTable(self.init, num_oov_buckets)

    def call(self, x):
        """Look up the id of every token in the string tensor `x`."""
        return self.table.lookup(x)

# Test: '' -> 0, known letters -> 1..3, every unknown letter -> 4.
myVocab = MyVocabTable(['a', 'b', 'c'])
sample_tokens = [''] + list('adgsclsd')
myVocab(tf.constant(sample_tokens))

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([0, 1, 4, 4, 4, 3, 4, 4, 4])>

In [None]:
# Build the lookup table for the vocabulary extracted from the dataset.
vocab_table = MyVocabTable(vocabulary)

# Test: raw text -> tokens -> token ids, then print each stage in that order.
text = tf.constant("this is Pepe, he said; and his 'godlike' son Achilles.")
tokenized_text = tokenizer(text)
token_ids = vocab_table(tokenized_text)
for stage in (text, tokenized_text, token_ids):
    print(stage)

tf.Tensor(b"this is Pepe, he said; and his 'godlike' son Achilles.", shape=(), dtype=string)
tf.Tensor(
[b'this' b'is' b'pepe' b',' b'he' b'said' b';' b'and' b'his' b"'"
 b'godlike' b"'" b'son' b'achilles' b'.'], shape=(15,), dtype=string)
tf.Tensor(
[   66    45 10001     1    11    82     9     3    10     4   300     4
    27    56     6], shape=(15,), dtype=int64)


In [None]:
# Chain tokenization and id lookup so raw text is encoded in a single call.
preprocessing_layers = [tokenizer, vocab_table]
preprocess_text = tf.keras.Sequential(preprocessing_layers)

In [None]:
# Run the combined pipeline on the same sentence that was encoded step by step.
sample_sentence = "this is Pepe, he said; and his 'godlike' son Achilles."
text = tf.constant(sample_sentence)
token_ids_v2 = preprocess_text(text)
print(token_ids_v2)

tf.Tensor(
[   66    45 10001     1    11    82     9     3    10     4   300     4
    27    56     6], shape=(15,), dtype=int64)


#### Create a dataset pipeline that will process text and encode the text

In [None]:
# Encode every line as a sequence of token ids; labels pass through unchanged.
encoded_ds = final_ds.map(
    lambda line, line_label: (preprocess_text(line), line_label)
)

# Inspect the first encoded example.
ids, label = next(iter(encoded_ds.take(1)))

print(ids)
print(label)

tf.Tensor([1594    7 1595    2  309    6], shape=(6,), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)


## Split Data


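- Split the encoded dataset into a training set and a validation set with `skip` / `take`.
- `padded_batch` combines consecutive elements of the input dataset into single padded batches.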


In [None]:
# Size of the combined dataset, hard-coded here.
# NOTE(review): presumably the total number of labeled lines across the three
# files; re-check this value if the input files change.
DATASET_SIZE = 49608
VALIDATION_FRACTION = 0.2
VALIDATION_SIZE = int(DATASET_SIZE * VALIDATION_FRACTION)
BUFFER_SIZE = 50000
BATCH_SIZE = 64
SEED = 42

# Training split: everything after the first VALIDATION_SIZE examples.
# cache() is placed BEFORE shuffle() on purpose. Caching after the shuffle
# would store the order of the first epoch and replay the exact same batches
# in every later epoch. With the cache first, the encoded examples are still
# computed only once, but each epoch gets a fresh shuffle.
train_ds = (
    encoded_ds
    .skip(VALIDATION_SIZE)
    .cache()
    .shuffle(BUFFER_SIZE, SEED)
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation split: the first VALIDATION_SIZE examples. A fixed order is fine
# for evaluation, so the finished batches are cached as they are.
validation_ds = (
    encoded_ds
    .take(VALIDATION_SIZE)
    .shuffle(BUFFER_SIZE, SEED)
    .padded_batch(BATCH_SIZE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Look at one training batch: its shapes and its first (padded) example.
x_batch, y_batch = next(iter(train_ds.take(1)))
print(f"Shapes X: {x_batch.shape}, Y: {y_batch.shape}")
print(f"1st Element: {x_batch[0]}, Y: {y_batch[0]}")

Shapes X: (64, 19), Y: (64,)
1st Element: [5432    4    8   25   79  123 4413    4   28 2156    1    0    0    0
    0    0    0    0    0], Y: 1


## Train Model

In [None]:
def create_model(vocab_size, num_labels, vectorizer=None):
    """Build an embedding + 1-D convolution text classifier.

    Args:
        vocab_size: number of distinct token ids, used as the input dimension
            of the embedding layer.
        num_labels: number of output classes; the final Dense layer has this
            many units and no activation, so the model outputs logits.
        vectorizer: optional layer placed in front of the embedding, e.g. one
            that turns raw text into token ids. Omitted when None.

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    # Compare against None explicitly so the decision does not depend on the
    # truthiness of the layer object.
    my_layers = [] if vectorizer is None else [vectorizer]

    my_layers += [
        # mask_zero=True makes the embedding treat id 0 as padding.
        tf.keras.layers.Embedding(vocab_size, 64, mask_zero=True),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Conv1D(64, 5, padding='valid', activation='relu', strides=2),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(num_labels),
    ]
    return tf.keras.Sequential(my_layers)

The custom text vectorizer adds 0 for padding and n+1 for out-of-vocabulary (OOV) tokens, hence the vocabulary size increases by two.

In [None]:
NUM_LABELS = 3

# Two ids are added to the vocabulary: 0 for padding and one for OOV tokens.
model = create_model(vocab_size=VOCAB_SIZE + 2, num_labels=NUM_LABELS)

# The model outputs raw logits, hence from_logits=True.
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logits_loss, metrics=['acc'])

model.summary()

In [None]:
# Train for five epochs, evaluating on the validation split after each one.
history = model.fit(train_ds, validation_data=validation_ds, epochs=5)

Epoch 1/5




    616/Unknown [1m45s[0m 6ms/step - acc: 0.6518 - loss: 0.7185



[1m621/621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 20ms/step - acc: 0.6527 - loss: 0.7170 - val_acc: 0.8252 - val_loss: 0.4326
Epoch 2/5
[1m621/621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - acc: 0.8389 - loss: 0.3969 - val_acc: 0.8349 - val_loss: 0.3932
Epoch 3/5
[1m621/621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - acc: 0.8683 - loss: 0.3297 - val_acc: 0.8358 - val_loss: 0.3893
Epoch 4/5
[1m621/621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - acc: 0.8833 - loss: 0.2899 - val_acc: 0.8378 - val_loss: 0.3912
Epoch 5/5
[1m621/621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - acc: 0.8989 - loss: 0.2544 - val_acc: 0.8390 - val_loss: 0.3956


### Evaluate model

In [None]:
# Evaluate on the validation split; return_dict=True keys the results by name.
metrics = model.evaluate(validation_ds, return_dict=True)

print(f"Loss:  {metrics['loss']}")
print(f"Accuracy: {metrics['acc']:2.2%}")

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - acc: 0.8374 - loss: 0.4018
Loss:  0.3956109583377838
Accuracy: 83.90%
