<a href="https://colab.research.google.com/github/gitaroktato/deep-learning-exercises/blob/verne-rnn-generator/%20Verne_text_generation_using_RNN..ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text generation in the style of Jules Verne using RNN


In [2]:
import os
import pathlib
import platform
import time

import keras
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Report the interpreter and library versions this notebook was run with.
print(f'Python version: {platform.python_version()}')
print(f'Tensorflow version: {tf.__version__}')
print(f'Keras version: {tf.keras.__version__}')

Python version: 3.10.12
Tensorflow version: 2.17.0
Keras version: 3.4.1


## Loading the dataset
Load the dataset and read the data, then take a look at the text.

In [3]:
path_to_file = './utazas_a_holdra.txt'
# Read the raw bytes and decode them as UTF-8 (the book contains Hungarian
# accented characters). The `with` block closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as book_file:
    text = book_file.read().decode(encoding='utf-8')
# Length of the text is the number of characters in it.
print(f'Length of text: {len(text)} characters')

Length of text: 662630 characters


In [4]:
# Preview the beginning of the book (title page and start of the table of contents).
preview_length = 250
print(text[:preview_length])

JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALOM
UTAZÁS A HOLDBA
1. A GUN CLUB
2. BARBICANE ELNÖK BEJELENTÉSE
3. A BEJELENTÉS HATÁSA
4. A CAMBRIDGE-I CSILLAGVIZSGÁLÓ VÁLASZA
5. A HOLD REGÉNYE
6. A


In [5]:
# Collect every distinct character of the book in sorted order. Besides ASCII,
# the set includes Hungarian accented letters and a few private-use code points
# (e.g. '\uf8eb').
vocab = sorted(set(text))

print(f'{len(vocab)} unique characters')
print('vocab:', vocab)

118 unique characters
vocab: ['\n', ' ', '!', '%', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'Á', 'É', 'Í', 'Ó', 'Ö', 'Ú', 'Ü', 'á', 'è', 'é', 'í', 'ó', 'ô', 'ö', 'ú', 'ü', 'Ő', 'ő', 'Ű', 'ű', 'π', '’', '”', '„', '−', '\uf8eb', '\uf8ec', '\uf8ed', '\uf8ee', '\uf8ef', '\uf8f0', '\uf8f6', '\uf8f7', '\uf8f8', '\uf8f9', '\uf8fa', '\uf8fb']


## Process the text


In [6]:
# Forward lookup: character -> integer id. Ids of known characters start at 1;
# index 0 is taken by the layer's out-of-vocabulary token.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

# Reverse lookup: integer id -> character, built from the very same vocabulary
# so that both directions agree.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# Show the mapping of the first 20 characters.
print('{')
for char in vocab[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), ids_from_chars(char)))
print('  ...\n}')

{
  '\n':   1,
  ' ' :   2,
  '!' :   3,
  '%' :   4,
  "'" :   5,
  '(' :   6,
  ')' :   7,
  '*' :   8,
  '+' :   9,
  ',' :  10,
  '-' :  11,
  '.' :  12,
  '/' :  13,
  '0' :  14,
  '1' :  15,
  '2' :  16,
  '3' :  17,
  '4' :  18,
  '5' :  19,
  '6' :  20,
  ...
}


In [7]:
# Split the text into Unicode characters and map each one to its integer id.
text_chars = tf.strings.unicode_split(text, 'UTF-8')
text_as_int = ids_from_chars(text_chars)

print(f'text_as_int length: {len(text_as_int)}')
print(f'{text[:15]!r} --> {text_as_int[:15].numpy()!r}')

text_as_int length: 662630
'JULES VERNE\nUTA' --> array([37, 48, 39, 32, 46,  2, 49, 32, 45, 41, 32,  1, 48, 47, 28])


## Create training examples and targets

In [8]:
# Number of characters in a single training input.
sequence_length = 100
# Every example is cut from a chunk of `sequence_length + 1` characters.
chunk_length = sequence_length + 1
examples_per_epoch = len(text) // chunk_length

print(f'examples_per_epoch: {examples_per_epoch}')

examples_per_epoch: 6560


In [9]:
# Create the training dataset: one element per character id.
ids_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# The loop variable is `char_id` rather than `id`, so the Python builtin `id()`
# is not shadowed for the rest of the kernel session.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

J
U
L
E
S
 
V
E
R
N


In [10]:
def text_from_ids(ids):
  """Convert a tensor of character ids back into a string tensor.

  The ids are mapped to characters with `chars_from_ids` and the characters
  are then joined along the last axis.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

# Cut the stream of character ids (`ids_dataset`) into chunks of
# `sequence_length + 1` ids; an incomplete trailing chunk is dropped.
sequences = ids_dataset.batch(sequence_length + 1, drop_remainder=True)

# Sequences size is the same as examples_per_epoch.
sequences_count = len(list(sequences.as_numpy_iterator()))
print(f'Sequences count: {sequences_count}')
print()

# Sequences examples.
for item in sequences.take(5):
    print(text_from_ids(item).numpy().decode('UTF-8'))

Sequences count: 6560

JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALO
M
UTAZÁS A HOLDBA
1. A GUN CLUB
2. BARBICANE ELNÖK BEJELENTÉSE
3. A BEJELENTÉS HATÁSA
4. A CAMBRIDGE-
I CSILLAGVIZSGÁLÓ VÁLASZA
5. A HOLD REGÉNYE
6. AMIT MINDEN AMERIKAINAK TUDNIA KELL, ÉS AMIT EGYETLEN 
AMERIKAINAK SEM
SZABAD HINNIE TÖBBÉ
7. AZ ÁGYÚGOLYÓ HIMNUSZA
8. AZ ÁGYÚ TÖRTÉNETE
9. A LŐPORKÉRDÉS
10
. HUSZONÖTMILLIÓ BARÁT ÉS EGYETLENEGY ELLENSÉG
11. FLORIDA ÉS TEXAS
12. URBI ET ORBI
13. STONE’S HILL


In [11]:
def split_input_target(sequence):
    """Turn one chunk into an (input, target) training pair.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so `target[i]` is the element that
    follows `input[i]` in the original chunk.

    Args:
        sequence: Any sliceable sequence (list, string, tensor, ...).

    Returns:
        A tuple `(input_text, target_text)` of two slices of `sequence`.
    """
    return sequence[:-1], sequence[1:]

In [12]:
# Sanity check on a plain Python list: the target is the input shifted by one.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [13]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# The dataset size is still the same as examples_per_epoch, but each element is
# now a pair of sequences of length `sequence_length` instead of a single
# sequence of length `sequence_length + 1`.
dataset_size = len(list(dataset.as_numpy_iterator()))
print(f'dataset size: {dataset_size}')

dataset size: 6560


In [14]:
# Inspect the first (input, target) pair of the dataset.
for input_example, target_example in dataset.take(1):
    input_size = len(input_example.numpy())
    target_size = len(target_example.numpy())
    print('Input sequence size:', repr(input_size))
    print('Target sequence size:', repr(target_size))
    print()
    print('Input:', text_from_ids(input_example).numpy().decode('UTF-8'))
    print('Target:', text_from_ids(target_example).numpy().decode('UTF-8'))

Input sequence size: 100
Target sequence size: 100

Input: JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTAL
Target: ULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALO


In [15]:
# For the first five time steps, show the character fed to the model and the
# character it is expected to predict.
step_pairs = zip(input_example[:5], target_example[:5])
for i, (input_idx, target_idx) in enumerate(step_pairs):
    input_char = chars_from_ids(input_idx).numpy().decode('UTF-8')
    target_char = chars_from_ids(target_idx).numpy().decode('UTF-8')
    print('Step {:2d}'.format(i))
    print('  input: {} ({:s})'.format(input_idx, input_char))
    print('  expected output: {} ({:s})'.format(target_idx, target_char))

Step  0
  input: 37 (J)
  expected output: 48 (U)
Step  1
  input: 48 (U)
  expected output: 39 (L)
Step  2
  input: 39 (L)
  expected output: 32 (E)
Step  3
  input: 32 (E)
  expected output: 46 (S)
Step  4
  input: 46 (S)
  expected output: 2 ( )


## Create training batches

In [16]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the batched dataset from `sequences` rather than from `dataset` itself:
# re-running this cell then yields the same batches instead of batching
# already-batched data a second time.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # `tf.data.AUTOTUNE` replaces the deprecated
    # `tf.data.experimental.AUTOTUNE` alias.
    .prefetch(tf.data.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [17]:
# Count the batches produced by one pass over the batched dataset.
batch_count = len(list(dataset.as_numpy_iterator()))
print(f'Batched dataset size: {batch_count}')

Batched dataset size: 102


In [18]:
# Peek at the first batch of inputs and the matching targets.
for input_text, target_text in dataset.take(1):
    print('1st batch: input_text:', input_text, end='\n\n')
    print('1st batch: target_text:', target_text)

1st batch: input_text: tf.Tensor(
[[ 2 62 59 ... 74 63  2]
 [66 59  2 ...  2 63 66]
 [58  2 11 ... 72  2 55]
 ...
 [62 55 68 ... 67 89 72]
 [55 74 74 ... 55 66  2]
 [80 80 89 ... 74 64 75]], shape=(64, 100), dtype=int64)

1st batch: target_text: tf.Tensor(
[[62 59 66 ... 63  2 63]
 [59  2 74 ... 63 66 66]
 [ 2 11 10 ...  2 55  2]
 ...
 [55 68 59 ... 89 72  2]
 [74 74 55 ... 66  2 67]
 [80 89 65 ... 64 75 65]], shape=(64, 100), dtype=int64)


## Build The Model

In [19]:
# Let's do a quick detour and see how the Embedding layer works.
# It takes a batch of character-index sequences as input and encodes every
# character of every sequence as a vector of length tmp_embeding_size.
tmp_vocab_size = 10
tmp_embeding_size = 5
tmp_input_length = 8
tmp_batch_size = 2

tmp_model = tf.keras.models.Sequential()
# The `input_length` argument is deprecated in Keras 3 and is therefore not
# passed any more; the sequence length comes from the input data.
tmp_model.add(tf.keras.layers.Embedding(
  input_dim=tmp_vocab_size,
  output_dim=tmp_embeding_size
))
# The model takes an integer matrix of shape (batch, tmp_input_length) as input.
# Every index must lie in [0, tmp_vocab_size - 1], i.e. be no larger than 9.
# The output has shape (batch, tmp_input_length, tmp_embeding_size),
# which is (2, 8, 5) for the values above.
tmp_input_array = np.random.randint(
  low=0,
  high=tmp_vocab_size,
  size=(tmp_batch_size, tmp_input_length)
)
tmp_model.compile('rmsprop', 'mse')
tmp_output_array = tmp_model.predict(tmp_input_array)

print('tmp_input_array shape:', tmp_input_array.shape)
print('tmp_input_array:')
print(tmp_input_array)
print()
print('tmp_output_array shape:', tmp_output_array.shape)
print('tmp_output_array:')
print(tmp_output_array)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
tmp_input_array shape: (2, 8)
tmp_input_array:
[[0 1 3 8 5 7 9 9]
 [0 2 1 5 2 8 8 5]]

tmp_output_array shape: (2, 8, 5)
tmp_output_array:
[[[-0.04568676 -0.01284541 -0.00155491 -0.02499045  0.00863118]
  [ 0.03218542  0.02734811 -0.00964219  0.03639444 -0.01959059]
  [ 0.04910115  0.03903227  0.00176507  0.0377253   0.04589237]
  [ 0.00459959  0.02586207  0.03284928  0.04320792 -0.01061209]
  [-0.03622679 -0.01264     0.02246698  0.02434431 -0.01442967]
  [-0.04674475 -0.02353743 -0.03020905  0.00873753 -0.02319526]
  [ 0.01862843 -0.04423301 -0.01099458  0.03614916  0.00987977]
  [ 0.01862843 -0.04423301 -0.01099458  0.03614916  0.00987977]]

 [[-0.04568676 -0.01284541 -0.00155491 -0.02499045  0.00863118]
  [-0.00576985  0.03029778  0.02817782 -0.04048645 -0.03561816]
  [ 0.03218542  0.02734811 -0.00964219  0.03639444 -0.01959059]
  [-0.03622679 -0.01264     0.02246698  0.02434431 -0.01442967]
  [-0.00576985  0.



In [20]:
# Size of the lookup vocabulary as reported by the StringLookup layer
# (this is what the model uses for its input and output dimensions).
vocab_size = len(ids_from_chars.get_vocabulary())

# Length of the embedding vector learned for each character.
embedding_dim = 256

# Number of units in the recurrent (GRU) layer.
rnn_units = 1024

In [21]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct token ids; used as the embedding input
            dimension and as the number of logits produced per time step.
        embedding_dim: Length of the embedding vector of each character.
        rnn_units: Number of units of the GRU layer.
        batch_size: Fixed batch size of the model. The GRU is stateful, and a
            stateful RNN needs a static batch size to allocate its state.

    Returns:
        An uncompiled `Sequential` model mapping integer ids of shape
        `(batch_size, sequence_length)` to logits of shape
        `(batch_size, sequence_length, vocab_size)`.
    """
    model = tf.keras.models.Sequential()

    # Declare the input explicitly so that `batch_size` is actually honoured
    # (it was previously ignored) and the model is built right away, without
    # depending on the shape of the first batch it happens to be called on.
    # The sequence length stays dynamic (None).
    model.add(tf.keras.Input(
      shape=(None,),
      batch_size=batch_size,
      dtype='int64'
    ))

    model.add(tf.keras.layers.Embedding(
      input_dim=vocab_size,
      output_dim=embedding_dim
    ))

    # NOTE(review): `stateful=True` carries the GRU state over from one batch to
    # the next, while the training dataset is shuffled - confirm this is intended.
    model.add(tf.keras.layers.GRU(
      units=rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer=tf.keras.initializers.GlorotNormal()
    ))

    # One logit per vocabulary entry for every time step.
    model.add(tf.keras.layers.Dense(vocab_size))

    return model

In [22]:
# Instantiate the training model with the hyperparameters defined above.
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

## Try the model

In [23]:
# Run the untrained model on a few batches to check the output shape.
# The variables assigned in the last iteration are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(10):
    print(input_example_batch.shape)
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)


In [24]:
# Print the layers of the model and their parameter counts.
model.summary()

In [25]:
# Logits predicted for the first time step of the first sequence in the batch.
first_step_logits = example_batch_predictions[0, 0]
print('Prediction for the 1st letter of the batch 1st sequense:')
print(first_step_logits)

Prediction for the 1st letter of the batch 1st sequense:
tf.Tensor(
[ 5.53638581e-03  8.46946333e-03  1.10695744e-02  1.46031128e-02
  6.61942933e-04 -7.25526176e-03 -6.20857487e-03  1.52903190e-03
  1.98550709e-02  8.41939636e-03  9.69224609e-03 -3.03096548e-02
  2.22517010e-02  5.41167334e-03  2.92498758e-03  1.21060535e-02
 -1.26944352e-02  9.51780379e-03 -4.24773665e-03 -1.63760446e-02
 -6.56844303e-03 -1.39942896e-02 -8.92112032e-03 -4.34356090e-03
 -7.98467919e-03 -1.01369340e-02  7.54665723e-03 -1.16918329e-02
  1.17211053e-02 -5.64851845e-03  1.28133623e-02  5.22732968e-03
 -7.55425123e-03  7.76874367e-05 -7.47969234e-03  5.24080358e-04
  3.46918264e-03 -4.83493600e-03  6.87741861e-03  2.20147846e-03
  9.87463817e-03 -3.72948125e-05 -9.80658922e-03 -5.85268997e-03
 -7.13795191e-04 -5.16781118e-03 -1.16792014e-02 -2.46986654e-02
  1.15940999e-02 -3.42167635e-03 -7.79477134e-03 -7.24930968e-03
 -5.31043764e-03 -8.37845635e-03  9.58454981e-03  9.00609419e-03
 -5.27997501e-03 -3.54

## Train

In [26]:
# Objective function used for training.
# Keras accepts any callable with the signature scalar_loss = fn(y_true, y_pred).
def loss(labels, logits):
    """Sparse categorical cross-entropy computed directly on logits.

    Args:
        labels: Integer ids of the expected classes (`y_true`).
        logits: Unnormalised model outputs (`y_pred`); `from_logits=True`
            tells Keras that no softmax has been applied to them.

    Returns:
        The loss tensor returned by
        `tf.keras.losses.sparse_categorical_crossentropy`.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)

# Loss of the untrained model on the last example batch, averaged to a scalar.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_loss)

Prediction shape:  (64, 100, 119)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.7788243


In [27]:
# Train with the Adam optimizer (learning rate 0.001) and the custom `loss`.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss=loss)

## Configure checkpoints

In [28]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# File name pattern of the checkpoint files; it contains an `{epoch}`
# placeholder, so each epoch gets its own file.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

# Callback passed to `model.fit`; `save_weights_only=True` stores only the
# weights, not the whole model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Number of passes over the training dataset.
EPOCHS = 40

# Debugging aid: show full, unfiltered tracebacks if training fails.
# (`keras` is imported in the imports cell at the top of the notebook.)
keras.config.disable_traceback_filtering()

# Train the model; the checkpoint callback saves the weights during training.
history = model.fit(
  x=dataset,
  epochs=EPOCHS,
  callbacks=[
    checkpoint_callback
  ]
)

Epoch 1/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m546s[0m 5s/step - loss: 3.5625
Epoch 2/40
[1m 93/102[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m47s[0m 5s/step - loss: 2.3383