### Setup

In [2]:
import tensorflow as tf
import numpy as np
import os
import time

from IPython.display import clear_output

#### Download Data

In [3]:
path_to_downloaded_file = tf.keras.utils.get_file(
    fname = 'shakespeare.txt',
    origin = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
    cache_subdir = '/content/',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


#### Explore data

In [5]:
with open(path_to_downloaded_file, 'rb') as file:
    text = file.read().decode(encoding='utf-8')

print(f"Corpus in '{os.path.basename(path_to_downloaded_file)}' has {len(text):,} characters.\n")
print(f"Starts with: \n\n{text[:100]}...")

Corpus in 'shakespeare.txt' has 1,115,394 characters.

Starts with: 

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You...


In [6]:
vocab = sorted(set(text))
VOCAB_SIZE = len(vocab)

print(f"Corpus has {VOCAB_SIZE:,} unique characters:")
print(vocab)

Corpus has 65 unique characters:
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


#### **`StringLookup`**

- maps a set of arbitrary input strings into (possibly encoded) integer indices output, via a table-based vocabulary lookup.
- performs no splitting or transformation of input strings.
- The vocabulary for the layer must be either supplied on construction or learned via `adapt()`.

In [7]:
chars_to_ids_layer = tf.keras.layers.StringLookup(vocabulary = vocab)

# Test
token_ids = chars_to_ids_layer(list("You can never be overdressed\n\nOr + overeducated."))
token_ids

<tf.Tensor: shape=(48,), dtype=int64, numpy=
array([38, 54, 60,  2, 42, 40, 53,  2, 53, 44, 61, 44, 57,  2, 41, 44,  2,
       54, 61, 44, 57, 43, 57, 44, 58, 58, 44, 43,  1,  1, 28, 57,  2,  0,
        2, 54, 61, 44, 57, 44, 43, 60, 42, 40, 59, 44, 43,  9])>

In [8]:
ids_to_chars_layer = tf.keras.layers.StringLookup(
                    vocabulary = chars_to_ids_layer.get_vocabulary(),
                    invert=True,
                )

# Test
tokens = ids_to_chars_layer(token_ids)
tokens

<tf.Tensor: shape=(48,), dtype=string, numpy=
array([b'Y', b'o', b'u', b' ', b'c', b'a', b'n', b' ', b'n', b'e', b'v',
       b'e', b'r', b' ', b'b', b'e', b' ', b'o', b'v', b'e', b'r', b'd',
       b'r', b'e', b's', b's', b'e', b'd', b'\n', b'\n', b'O', b'r', b' ',
       b'[UNK]', b' ', b'o', b'v', b'e', b'r', b'e', b'd', b'u', b'c',
       b'a', b't', b'e', b'd', b'.'], dtype=object)>

In [9]:
def ids_to_text(input):
    output = ids_to_chars_layer(input)
    output = tf.strings.reduce_join(output, axis=-1).numpy()
    return output

ids_to_text(token_ids)

b'You can never be overdressed\n\nOr [UNK] overeducated.'

## The Prediction Task

The goal is to, given a character, or a sequence of characters, to predict what is the most probable next character?

Since RNNs maintain an internal state that depends on the previously seen element/s, given all the characters computed until this moment, what is the next character?

### Encode data into token ids

In [10]:
# split corpus into chars
text_chars = tf.strings.unicode_split(text, 'UTF-8')
# encode chars into int indices based on vocab
text_ids = chars_to_ids_layer(text_chars)
# create dataset from tensor
dataset = tf.data.Dataset.from_tensor_slices(text_ids)

for id in dataset.take(10):
    print(f"{id.numpy()} -> '{ids_to_chars_layer(id).numpy().decode('utf-8')}'", end=', ')

19 -> 'F', 48 -> 'i', 57 -> 'r', 58 -> 's', 59 -> 't', 2 -> ' ', 16 -> 'C', 48 -> 'i', 59 -> 't', 48 -> 'i', 

### Batch data

In [11]:
SEQ_LENGTH = 100

sequenced_ds = dataset.batch(SEQ_LENGTH+1, drop_remainder=True)

for batch in sequenced_ds.take(1):
    print(f"Batch ids:\n{batch}", end='\n\n')
    print(f"Batch chars:\n{ids_to_chars_layer(batch)}", end='\n\n')
    print(f"Batch chars joined:\n{ids_to_text(batch)}", end='\n\n')
    break

Batch ids:
[19 48 57 58 59  2 16 48 59 48 65 44 53 11  1 15 44 45 54 57 44  2 62 44
  2 55 57 54 42 44 44 43  2 40 53 64  2 45 60 57 59 47 44 57  7  2 47 44
 40 57  2 52 44  2 58 55 44 40 50  9  1  1 14 51 51 11  1 32 55 44 40 50
  7  2 58 55 44 40 50  9  1  1 19 48 57 58 59  2 16 48 59 48 65 44 53 11
  1 38 54 60  2]

Batch chars:
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' ']

Batch chars joined:
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '



### Label data

In [12]:
def split_sequence(seq):
    input_seq = seq[:-1]
    target_seq = seq[1:]
    return (input_seq, target_seq)

split_sequence("Hello World!")

('Hello World', 'ello World!')

In [13]:
final_ds = sequenced_ds.map(lambda seq: split_sequence(seq))

for input_batch, target_batch in final_ds.take(5):
    print(ids_to_text(input_batch))
    print(ids_to_text(target_batch))

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
b're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us k"
b"ow Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be "
b"l him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor cit'
b'ne: away, 

Alternatively

In [31]:
ds = dataset.window(SEQ_LENGTH+1, shift=1, drop_remainder=True)
ds = ds.flat_map(lambda window: window.batch(SEQ_LENGTH+1))
ds = ds.shuffle(1000).batch(64, drop_remainder=True)
ds = ds.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

next(iter(ds))

(<tf.Tensor: shape=(64, 100), dtype=int64, numpy=
 array([[51, 44, 40, ..., 60, 53, 43],
        [53, 11,  1, ..., 36, 47, 40],
        [57, 11,  2, ..., 47, 44, 48],
        ...,
        [62,  2, 16, ..., 50, 48, 51],
        [48, 65, 44, ..., 60, 59,  2],
        [44, 58, 54, ..., 62,  6, 59]])>,
 <tf.Tensor: shape=(64, 100), dtype=int64, numpy=
 array([[44, 40, 53, ..., 53, 43, 40],
        [11,  1, 28, ..., 47, 40, 59],
        [11,  2, 59, ..., 44, 48, 57],
        ...,
        [ 2, 16, 40, ..., 48, 51, 51],
        [65, 44, 53, ..., 59,  2, 59],
        [58, 54, 51, ...,  6, 59,  7]])>)

### Create training data

In [33]:
training_ds = final_ds.shuffle(10000).batch(64, drop_remainder=True)

for input_batch, target_batch in training_ds.take(1):
    print(f"Input {input_batch.shape} [0]: {ids_to_text(input_batch[0])}")
    print(f"Target {target_batch.shape} [0]: {ids_to_text(target_batch[0])}")

Input (64, 100) [0]: b'she took to quench it,\nShe would to each one sip. You are retired,\nAs if you were a feasted one and '
Target (64, 100) [0]: b'he took to quench it,\nShe would to each one sip. You are retired,\nAs if you were a feasted one and n'


## Build Model

In [32]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(chars_to_ids_layer.get_vocabulary())
print(f"Vocab size: {vocab_size}")

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

Vocab size: 66


In [39]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim),
    tf.keras.layers.GRU(rnn_units, return_sequences=True),
    tf.keras.layers.Dense(vocab_size),
], name="my_model_sequential")

To get actual predictions from the model we sample from the output distribution, to get the character indices. This distribution is defined by the logits over the character vocabulary.

It is important to sample from this distribution as taking the argmax of the distribution can easily get the model stuck in a loop.

In [120]:
for input_batch, target_batch in training_ds.take(1):
    print(f"Input batch {input_batch.shape} # (batch_size, sequence_length)\n")
    prediction_batch = model(input_batch)
    print(f"Prediction batch {prediction_batch.shape} # (batch_size, sequence_length, vocab_size)\n")
    print(f"1st prediction {prediction_batch[0].shape}:\n{prediction_batch[0]}\n")

    # sample from output distribution, defined by the logits over the character vocabulary
    sampled_seq = tf.random.categorical(prediction_batch[0], num_samples=1)
    sampled_seq = tf.squeeze(sampled_seq)
    print(f"Next char predictions:\n\n{ids_to_text(sampled_seq).decode('utf-8')}")

Input batch (64, 100) # (batch_size, sequence_length)

Prediction batch (64, 100, 66) # (batch_size, sequence_length, vocab_size)

1st prediction (100, 66):
[[ -3.8910062    0.7578387    4.126569   ...  -2.956409     1.9454858
   -3.6718848 ]
 [ -4.7899528    2.3632624    5.705147   ...  -3.7940722   -0.13120429
   -5.547408  ]
 [ -4.0349226    1.1451188    2.9610977  ...  -0.49251893  -0.41288298
   -1.7498076 ]
 ...
 [ -8.190473    17.590609    14.522903   ...  -8.360919     0.33438018
  -12.467685  ]
 [ -4.474937     1.2537353   -7.873265   ...  -4.800757     1.0759474
   -7.827163  ]
 [ -5.454521    -4.722344    -1.6846175  ...  -5.206412     5.873618
   -6.058078  ]]

Next char predictions:

slp aur geik,  ahe  fempecteIot;is;

FIdsenger:
Sor  Bt iou ll ppye tour mofe, fai io tour donse.
Ch


In [41]:
model.summary()

## Train Model

In [42]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer = 'adam',
    loss = loss_fn,
)

A newly initialized model shouldn't be too sure of itself, the output logits should all have similar magnitudes.

To confirm this we check that the exponential of the loss is approximately equal to the vocabulary size.

In [121]:
loss = loss_fn(target_batch, prediction_batch)
print(loss.numpy())
tf.exp(loss).numpy()

0.8395946


2.3154283

We use **`tf.keras.callbacks.ModelCheckpoint`** to ensure that checkpoints are saved during training.

In [44]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [122]:
training_ds = training_ds.cache().prefetch(tf.data.AUTOTUNE)

In [None]:
history = model.fit(
            training_ds,
            epochs = 20,
            callbacks = [checkpoint_callback],
        )
clear_output()

Epoch 1/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - loss: 0.8155
Epoch 2/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 58ms/step - loss: 0.7691
Epoch 3/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 60ms/step - loss: 0.7595
Epoch 4/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 62ms/step - loss: 0.7559
Epoch 5/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 60ms/step - loss: 0.7552
Epoch 6/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 58ms/step - loss: 0.7508
Epoch 7/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - loss: 0.7468
Epoch 8/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - loss: 0.7414
Epoch 9/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - loss: 0.7377
Epoch 10/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10

In [47]:
for input_batch, target_batch in training_ds.take(1):
    prediction_batch = model(input_batch)
    print(f"Prediction batch {prediction_batch.shape} # (batch_size, sequence_length, vocab_size)\n")
    print(f"1st prediction {prediction_batch[0].shape}:\n{prediction_batch[0]}\n")

    # sample from output distribution, defined by the logits over the character vocabulary
    sampled_seq = tf.random.categorical(prediction_batch[0], num_samples=1)
    sampled_seq = tf.squeeze(sampled_seq)
    print(f"Next char predictions:\n{ids_to_text(sampled_seq).decode('utf-8')}")

Prediction batch (64, 100, 66) # (batch_size, sequence_length, vocab_size)

1st prediction (100, 66):
[[ -3.8910062    0.7578387    4.126569   ...  -2.956409     1.9454858
   -3.6718848 ]
 [ -4.7899528    2.3632624    5.705147   ...  -3.7940722   -0.13120429
   -5.547408  ]
 [ -4.0349226    1.1451188    2.9610977  ...  -0.49251893  -0.41288298
   -1.7498076 ]
 ...
 [ -8.190473    17.590609    14.522903   ...  -8.360919     0.33438018
  -12.467685  ]
 [ -4.474937     1.2537353   -7.873265   ...  -4.800757     1.0759474
   -7.827163  ]
 [ -5.454521    -4.722344    -1.6846175  ...  -5.206412     5.873618
   -6.058078  ]]

Next char predictions:
wsdeour newk
,
ahen auatect Iot os 

RIssenger:
Sur, tf hou ll kaye your laee, ooy:to your ponse;
Mh


# Generate text

We predict the next character in a sequence given a context and a temperature parameter controls the randomness of predictions.

In [118]:
input_chars = tf.strings.unicode_split(['ROMEO:'], 'UTF-8').to_tensor()
chars_to_ids_layer(input_chars)

<tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[31, 28, 26, 18, 28, 11]])>

In [125]:
def predict_next_char(input, temperature=1.0):
    # Let's generate text using the trained model:
    input_chars = tf.strings.unicode_split(input, 'UTF-8')
    input_ids = chars_to_ids_layer(input_chars).to_tensor()

    predicted_logits = model.predict(input_ids, verbose=0)
    predicted_logits /= temperature

    predicted_ids = tf.random.categorical(predicted_logits[0, :], num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)[-1]
    return ids_to_chars_layer(predicted_ids).numpy().decode('utf-8')


#tf.random.set_seed(42)
next_char = tf.constant(['ROMEO:'])
text = ''
for _ in range(1000):
    text += predict_next_char(next_char)
    next_char = [text]

print(text)


HASTGRET:
Poison do, or breather; thick may long now!
Why, this is ready to be the vow; and knee
executed against those ears?

LEONTES:
Stay, of that good well; but, an't be satisfated.

LUCENTIO:
'Tis wisely's a sensell to the king's dogegom and nature:
Therefore amonds 't. Come, Katharina! 'Tis he
doubtles, are brave fellow. Dare thou me yet?

ALORSO:
Boldly, good man; for whence art thou do him more run?
Now, by my services that we can call'd me, but out retreman
Would blue shall poise of her only unpeople, for
Mightry, which the recoveth words that lives.

JULIET:
Gramer aims; thence will out-join'd up his worth.

ISABELLA:
Would this master Friar Johe fellow?

Father:
Thou art thieves?

DUKE VINCENTIO:
Lovel and in this few, but can be net cast
And pay most profits worth the belly.

MARCIUS:
How now, good for hither?

FRIAR LAURENCE:
Comfort! What faults shall I not serve, you hand.

BUCKINGHAM:
He does, I fear thee, friend, and tarry not; the part heart
is 'smemed; then let us a