<a href="https://colab.research.google.com/github/gitaroktato/deep-learning-exercises/blob/verne-rnn-generator/%20Verne_text_generation_using_RNN..ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text generation in the style of Jules Verne using RNN


In [1]:
import os
# Setting up TensorFlow log level.
# NOTE: this cell runs before TensorFlow is imported in the next cell —
# presumably so the setting is already in place when the library loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any of {'0', '1', '2'}; presumably lower values log more — TODO confirm

In [3]:
import tensorflow as tf
# import matplotlib.pyplot as plt
import numpy as np
import platform
import time
import pathlib
import os

# Report the versions of the interpreter and of the main libraries in use.
for label, version in (
    ('Python', platform.python_version()),
    ('Tensorflow', tf.__version__),
    ('Keras', tf.keras.__version__),
    ('NumPy', np.__version__),
):
    print(f'{label} version:', version)

Python version: 3.9.20
Tensorflow version: 2.17.0
Keras version: 3.5.0
NumPy version: 1.26.4


## Loading the dataset
Load the dataset and read the data. Take a look at the text.

In [4]:
path_to_file = './docs/utazas_a_holdra.txt'
# Read the raw bytes and decode them as UTF-8. The `with` block closes the
# file handle as soon as the content has been read.
with open(path_to_file, 'rb') as text_file:
    text = text_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 662630 characters


In [5]:
# Preview the beginning of the text.
preview_length = 250
print(text[:preview_length])

JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALOM
UTAZÁS A HOLDBA
1. A GUN CLUB
2. BARBICANE ELNÖK BEJELENTÉSE
3. A BEJELENTÉS HATÁSA
4. A CAMBRIDGE-I CSILLAGVIZSGÁLÓ VÁLASZA
5. A HOLD REGÉNYE
6. A


In [6]:
# Collect the distinct characters of the text in sorted order.
# Note that the text uses the Hungarian character set.
vocab = sorted(set(text))

print(f'{len(vocab)} unique characters')
print('vocab:', vocab)

118 unique characters
vocab: ['\n', ' ', '!', '%', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'Á', 'É', 'Í', 'Ó', 'Ö', 'Ú', 'Ü', 'á', 'è', 'é', 'í', 'ó', 'ô', 'ö', 'ú', 'ü', 'Ő', 'ő', 'Ű', 'ű', 'π', '’', '”', '„', '−', '\uf8eb', '\uf8ec', '\uf8ed', '\uf8ee', '\uf8ef', '\uf8f0', '\uf8f6', '\uf8f7', '\uf8f8', '\uf8f9', '\uf8fa', '\uf8fb']


# Process the text


In [9]:
# Forward lookup: character -> integer id (no mask token is reserved).
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

# Inverse lookup: integer id -> character, built from the forward layer's vocabulary.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# Show the ids assigned to the first 20 vocabulary entries.
print('{')
for char in vocab[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), ids_from_chars(char)))
print('  ...\n}')

{
  '\n':   1,
  ' ' :   2,
  '!' :   3,
  '%' :   4,
  "'" :   5,
  '(' :   6,
  ')' :   7,
  '*' :   8,
  '+' :   9,
  ',' :  10,
  '-' :  11,
  '.' :  12,
  '/' :  13,
  '0' :  14,
  '1' :  15,
  '2' :  16,
  '3' :  17,
  '4' :  18,
  '5' :  19,
  '6' :  20,
  ...
}


In [10]:
# Split the text into Unicode characters and map each one to its integer id.
text_chars = tf.strings.unicode_split(text, 'UTF-8')
text_as_int = ids_from_chars(text_chars)

print(f'text_as_int length: {len(text_as_int)}')
print(f'{text[:15]!r} --> {text_as_int[:15].numpy()!r}')

text_as_int length: 662630
'JULES VERNE\nUTA' --> array([37, 48, 39, 32, 46,  2, 49, 32, 45, 41, 32,  1, 48, 47, 28])


## Create training examples and targets

In [11]:
# The maximum length, in characters, of a single input sequence.
sequence_length = 100
# Each example is cut from a chunk of sequence_length + 1 characters.
chunk_length = sequence_length + 1
examples_per_epoch = len(text) // chunk_length

print('examples_per_epoch:', examples_per_epoch)

examples_per_epoch: 6560


In [12]:
# Create the training dataset: one element per character id.
ids_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Print the first ten characters. The loop variable is named `char_id`
# (not `id`) so the built-in `id()` is not shadowed in the notebook's
# global namespace for the cells that follow.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

J
U
L
E
S
 
V
E
R
N


In [13]:
def text_from_ids(ids):
    """Turn a tensor of character ids back into text.

    Every id is looked up with ``chars_from_ids`` and the resulting
    characters are joined along the last axis into a string tensor.
    """
    chars = chars_from_ids(ids)
    return tf.strings.reduce_join(chars, axis=-1)

# Group the character ids of ids_dataset into chunks of sequence_length + 1
# ids; a trailing incomplete chunk is dropped.
sequences = ids_dataset.batch(sequence_length + 1, drop_remainder=True)

# Sequences size is the same as examples_per_epoch.
# cardinality() reports the element count without iterating over (and
# materialising) the whole dataset just to count it.
print('Sequences count: {}'.format(sequences.cardinality().numpy()))
print()

# Sequences examples.
for item in sequences.take(5):
    print(text_from_ids(item).numpy().decode('UTF-8'))

Sequences count: 6560

JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALO
M
UTAZÁS A HOLDBA
1. A GUN CLUB
2. BARBICANE ELNÖK BEJELENTÉSE
3. A BEJELENTÉS HATÁSA
4. A CAMBRIDGE-
I CSILLAGVIZSGÁLÓ VÁLASZA
5. A HOLD REGÉNYE
6. AMIT MINDEN AMERIKAINAK TUDNIA KELL, ÉS AMIT EGYETLEN 
AMERIKAINAK SEM
SZABAD HINNIE TÖBBÉ
7. AZ ÁGYÚGOLYÓ HIMNUSZA
8. AZ ÁGYÚ TÖRTÉNETE
9. A LŐPORKÉRDÉS
10
. HUSZONÖTMILLIÓ BARÁT ÉS EGYETLENEGY ELLENSÉG
11. FLORIDA ÉS TEXAS
12. URBI ET ORBI
13. STONE’S HILL


In [14]:
def split_input_target(sequence):
    """Pair a sequence with a copy of itself shifted one step ahead.

    Returns a tuple ``(input, target)``: ``input`` is ``sequence`` without
    its last element and ``target`` is ``sequence`` without its first
    element, so ``target[i]`` is the element that follows ``input[i]``.
    """
    return sequence[:-1], sequence[1:]

In [15]:
# Quick check of the input/target split on a plain list of characters.
sample_chars = list("Tensorflow")
split_input_target(sample_chars)

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [16]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Dataset size is the same as examples_per_epoch,
# but each element of a pair now has a length of `sequence_length`
# and not `sequence_length + 1`.
# cardinality() gives the element count without iterating over the dataset.
print('dataset size: {}'.format(dataset.cardinality().numpy()))

dataset size: 6560


In [17]:
# Inspect the first (input, target) pair of the dataset.
for input_example, target_example in dataset.take(1):
    input_ids = input_example.numpy()
    target_ids = target_example.numpy()
    print('Input sequence size:', repr(len(input_ids)))
    print('Target sequence size:', repr(len(target_ids)))
    print()
    input_as_text = text_from_ids(input_example).numpy().decode('UTF-8')
    target_as_text = text_from_ids(target_example).numpy().decode('UTF-8')
    print('Input:', input_as_text)
    print('Target:', target_as_text)

Input sequence size: 100
Target sequence size: 100

Input: JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTAL
Target: ULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALO


In [18]:
# Walk through the first five time steps of the example pair.
# `input_example` and `target_example` are the loop variables left over
# from the previous cell.
first_steps = zip(input_example[:5], target_example[:5])
for i, (input_idx, target_idx) in enumerate(first_steps):
    input_char = chars_from_ids(input_idx).numpy().decode('UTF-8')
    target_char = chars_from_ids(target_idx).numpy().decode('UTF-8')
    print(f'Step {i:2d}')
    print(f'  input: {input_idx} ({input_char:s})')
    print(f'  expected output: {target_idx} ({target_char:s})')

Step  0
  input: 37 (J)
  expected output: 48 (U)
Step  1
  input: 48 (U)
  expected output: 39 (L)
Step  2
  input: 39 (L)
  expected output: 32 (E)
Step  3
  input: 32 (E)
  expected output: 46 (S)
Step  4
  input: 46 (S)
  expected output: 2 ( )


## Create training batches

In [19]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of re-assigning
# `dataset` from itself: the cell is then idempotent, so re-running it does
# not batch an already batched dataset a second time.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental alias.
    .prefetch(tf.data.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [20]:
# Number of batches per epoch; cardinality() avoids iterating over (and
# shuffling) the whole dataset just to count its batches.
print('Batched dataset size: {}'.format(dataset.cardinality().numpy()))

Batched dataset size: 102


In [21]:
# Peek at the first batch of inputs and targets.
for input_text, target_text in dataset.take(1):
    print('1st batch: input_text:', input_text, end='\n\n')
    print('1st batch: target_text:', target_text)

1st batch: input_text: tf.Tensor(
[[ 2 67 59 ... 73 80 89]
 [68 91 68 ... 72 80 91]
 [12  1 28 ... 74 69 74]
 ...
 [57 55 68 ...  2 55 65]
 [67 69 68 ... 72 97 66]
 [66 65 59 ...  2 56 55]], shape=(64, 100), dtype=int64)

1st batch: target_text: tf.Tensor(
[[67 59 74 ... 80 89 67]
 [91 68 65 ... 80 91 73]
 [ 1 28  2 ... 69 74 74]
 ...
 [55 68 59 ... 55 65 65]
 [69 68 58 ... 97 66 68]
 [65 59 58 ... 56 55 72]], shape=(64, 100), dtype=int64)


## Build The Model

In [23]:
# Let's do a quick detour and see how the Embedding layer works.
# It takes a batch of character-index sequences as input and encodes every
# character of every sequence as a vector of length tmp_embeding_size.
tmp_vocab_size = 10
tmp_embeding_size = 5
tmp_input_length = 8
tmp_batch_size = 2

tmp_model = tf.keras.models.Sequential()
# Declare the sequence length with an explicit Input: the `input_length`
# argument of Embedding is deprecated in Keras 3.
tmp_model.add(tf.keras.Input(shape=(tmp_input_length,), dtype='int64'))
tmp_model.add(tf.keras.layers.Embedding(
  input_dim=tmp_vocab_size,
  output_dim=tmp_embeding_size
))
# The model takes an integer matrix of shape (batch, tmp_input_length) as input.
# Every index must lie in [0, tmp_vocab_size - 1], i.e. be at most 9 here.
# The output shape is (None, tmp_input_length, tmp_embeding_size) == (None, 8, 5),
# where None is the batch dimension.
tmp_input_array = np.random.randint(
  low=0,
  high=tmp_vocab_size,
  size=(tmp_batch_size, tmp_input_length)
)
tmp_model.compile('rmsprop', 'mse')
tmp_output_array = tmp_model.predict(tmp_input_array)

print('tmp_input_array shape:', tmp_input_array.shape)
print('tmp_input_array:')
print(tmp_input_array)
print()
print('tmp_output_array shape:', tmp_output_array.shape)
print('tmp_output_array:')
print(tmp_output_array)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
tmp_input_array shape: (2, 8)
tmp_input_array:
[[9 0 6 7 2 8 3 6]
 [8 7 0 7 2 0 2 0]]

tmp_output_array shape: (2, 8, 5)
tmp_output_array:
[[[-0.00966778  0.04374819  0.00912174  0.00637436 -0.00536316]
  [ 0.03929282  0.02441926 -0.00540962 -0.03715355  0.028134  ]
  [ 0.0287972  -0.04842454 -0.00875903  0.03096397 -0.00944003]
  [-0.04878235  0.03456947 -0.0036199  -0.00601719  0.00349481]
  [-0.02845841 -0.01485568  0.01543272 -0.00357848 -0.04306713]
  [-0.01974423 -0.01239475 -0.00883179 -0.03303087 -0.02780288]
  [ 0.02681014 -0.03628205  0.01062544 -0.0042259   0.03576546]
  [ 0.0287972  -0.04842454 -0.00875903  0.03096397 -0.00944003]]

 [[-0.01974423 -0.01239475 -0.00883179 -0.03303087 -0.02780288]
  [-0.04878235  0.03456947 -0.0036199  -0.00601719  0.00349481]
  [ 0.03929282  0.02441926 -0.00540962 -0.03715355  0.028134  ]
  [-0.04878235  0.03456947 -0.0036199  -0.00601719  0.00349481]
  [-0.02845841 -0.0

In [24]:
# Size of the vocabulary as seen by the lookup layer.
vocab_size = ids_from_chars.vocabulary_size()

# Width of the character embedding vectors.
embedding_dim = 256

# Number of units in the recurrent layer.
rnn_units = 1024

In [25]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct token ids; used as the embedding input
            dimension and as the number of output logits.
        embedding_dim: Length of the embedding vector of each character.
        rnn_units: Number of units of the GRU layer.
        batch_size: Fixed number of sequences per batch. The GRU is stateful,
            so the model is declared with this static batch size.

    Returns:
        An uncompiled ``tf.keras`` Sequential model that maps integer ids of
        shape (batch_size, sequence_length) to logits of shape
        (batch_size, sequence_length, vocab_size).
    """
    model = tf.keras.models.Sequential()

    # Declare the input explicitly so that `batch_size` is actually honoured;
    # the sequence length is left variable.
    model.add(tf.keras.Input(batch_shape=(batch_size, None), dtype='int64'))

    model.add(tf.keras.layers.Embedding(
      input_dim=vocab_size,
      output_dim=embedding_dim
    ))

    # return_sequences=True yields one output per time step;
    # stateful=True keeps the hidden state between successive calls.
    model.add(tf.keras.layers.GRU(
      units=rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer=tf.keras.initializers.GlorotNormal()
    ))

    # One logit per vocabulary entry for every time step.
    model.add(tf.keras.layers.Dense(vocab_size))

    return model

In [26]:
# Instantiate the training model for batches of BATCH_SIZE sequences.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

## Try the model

In [27]:
# Run a few batches through the untrained model and check the output shape.
shape_note = "# (batch_size, sequence_length, vocab_size)"
for input_example_batch, target_example_batch in dataset.take(10):
    print(input_example_batch.shape)
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, shape_note)

(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)
(64, 100)
(64, 100, 119) # (batch_size, sequence_length, vocab_size)


In [28]:
# Print a summary of the layers of the model built above.
model.summary()

In [29]:
# Model output for the first time step of the first sequence in the last batch.
first_step_logits = example_batch_predictions[0, 0]
print('Prediction for the 1st letter of the batch 1st sequense:')
print(first_step_logits)

Prediction for the 1st letter of the batch 1st sequense:
tf.Tensor(
[ 3.26852710e-03 -1.28624793e-02 -2.20517572e-02  1.00101717e-02
  2.70425482e-03  3.19789676e-03 -4.59093275e-03  1.02390293e-02
  8.40921421e-03  2.50221156e-02  1.61201581e-02 -9.49472841e-03
  1.00166844e-02 -4.02102387e-03  1.47341397e-02 -4.94778482e-03
 -1.39987674e-02 -1.97633971e-02  3.47917303e-02 -1.07196933e-02
  7.66686583e-03  9.85303614e-03  4.75160219e-03 -6.50190655e-03
  3.26241972e-03 -1.09491069e-02  1.42350020e-02 -1.32487398e-02
 -3.43950908e-03 -1.10880602e-02  7.24065490e-03  1.00938207e-03
 -2.16816249e-03 -6.67137513e-03  5.79622341e-03  7.57358794e-05
 -1.95990205e-02  1.62121821e-02  1.54080503e-02 -1.15883434e-02
 -2.49495753e-03 -4.37521795e-03 -8.93631193e-04 -3.36307324e-02
  4.18191601e-04 -1.16866757e-03 -1.37997186e-02  8.57010856e-03
  9.82117746e-03  5.32331550e-03 -2.37127896e-02 -5.70450118e-03
 -2.26037558e-02 -7.68663036e-03  6.27195975e-03 -6.88095903e-03
  6.33839564e-03 -1.19

## Train

In [30]:
# An objective function.
# The function is any callable with the signature scalar_loss = fn(y_true, y_pred).
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and logits.

    ``logits`` are passed with ``from_logits=True``, i.e. they are treated
    as unnormalised scores rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)

# Loss of the untrained model on the last example batch, averaged over all positions.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_example_loss = example_batch_loss.numpy().mean()

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_example_loss)

Prediction shape:  (64, 100, 119)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.779363


In [31]:
# Train with Adam and the cross-entropy objective defined above.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss=loss)

## Configure checkpoints

In [32]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# File name template of the checkpoints, containing an `{epoch}` placeholder
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

# Callback that stores only the model weights (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [28]:
# Number of passes over the training dataset.
EPOCHS = 40

# NOTE(review): imports normally belong in the imports cell at the top of
# the notebook; kept here so the `keras` name stays bound as before.
import keras

# Show unfiltered tracebacks if something fails during training.
keras.config.disable_traceback_filtering()

# Train the model, writing a weights checkpoint through the callback.
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - loss: 3.6104
Epoch 2/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 2.3499
Epoch 3/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 2.0544
Epoch 4/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 1.8317
Epoch 5/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 64ms/step - loss: 1.6695
Epoch 6/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - loss: 1.5515
Epoch 7/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 1.4740
Epoch 8/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 1.4002
Epoch 9/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 1.3436
Epoch 10/40
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms

In [33]:
def generate_text(model, start_string, num_generate = 1000, temperature=1.0):
    """Generate text by sampling one character at a time from the model.

    Args:
        model: Trained stateful model; it is called with a batch of size 1.
        start_string: Non-empty seed text the generation starts from.
        num_generate: Number of characters to generate.
        temperature: Positive scaling factor applied to the logits.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text. Experiment to find the
            best setting.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character.')
    if temperature <= 0:
        raise ValueError('temperature must be a positive number.')

    # Start every generation from a clean hidden state, otherwise the state
    # left behind by a previous call leaks into this one. The states are
    # reset layer by layer (presumably `model.reset_states()` is not
    # available on this Keras version — it was commented out here before).
    for layer in model.layers:
        if getattr(layer, 'stateful', False) and hasattr(layer, 'reset_states'):
            layer.reset_states()

    # Converting our start string to numbers (vectorizing) with a single
    # lookup call, then adding the batch dimension: shape (1, len(start_string)).
    input_indices = ids_from_chars(tf.strings.unicode_split(start_string, 'UTF-8'))
    input_indices = tf.expand_dims(input_indices, 0)

    # Generated characters, in order.
    text_generated = []

    # Here batch size == 1.
    for _ in range(num_generate):
        predictions = model(input_indices)

        # Only the logits of the last time step decide the next character;
        # keep the batch dimension so the shape is (1, vocab_size).
        last_step_logits = predictions[:, -1, :] / temperature

        # Using a categorical distribution to sample the next character id.
        predicted_id = tf.random.categorical(
            last_step_logits,
            num_samples=1
        )[0, 0].numpy()

        # We pass the predicted character as the next input to the model;
        # the previous hidden state is kept inside the stateful layer.
        input_indices = tf.expand_dims([predicted_id], 0)

        text_generated.append(chars_from_ids(predicted_id).numpy().decode('UTF-8'))

    return (start_string + ''.join(text_generated))

In [34]:
# The checkpoints written during training are Keras `.weights.h5` files.
# tf.train.latest_checkpoint() does not find those (it returns None for this
# directory), so look the files up directly and pick the highest epoch.
latest_weights_path = max(
    pathlib.Path(checkpoint_dir).glob('ckpt_*.weights.h5'),
    # 'ckpt_<epoch>.weights.h5' -> <epoch> as an integer, so 10 sorts after 9.
    key=lambda path: int(path.name.split('_')[1].split('.')[0]),
    default=None,
)
latest_weights_path

In [35]:
# Rebuild the network for inference, where a single sequence is fed at a time.
simplified_batch_size = 1
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=simplified_batch_size)
model.build(tf.TensorShape([simplified_batch_size, None]))
# Load the weights saved after the last training epoch. The path is derived
# from the checkpoint template and EPOCHS instead of being hard-coded, so it
# follows any change to the checkpoint directory or the number of epochs.
model.load_weights(checkpoint_prefix.format(epoch=EPOCHS))
model.summary()

In [36]:
# Generate text with a temperature of 0.4, i.e. lower than the default (1.0).
print(generate_text(model, start_string=u"A holdra úgy jutunk el, hogy", temperature=0.4))

A holdra úgy jutunk el, hogy minden nehézséget állított elő - folytatta Barbicane -, miután a torkok számára. Ezek a térképek megközelítőerejének ellen a lövedék kilövésénél, de addig szó sem lehet ilyen tárgyak esése azonban s a tömegek kívánsága.
S a tömeg egy
egyszerre két tengert is megérteni az űrben ment végbe, ahol a levegő molekuláit semmi sem tartaná az állapotában. Ez a férfiaké. Latona és Jupiter e
leárnyala izgalmas pillanatában keletkező lökést. Mert éppenséggel
nem mindegy, hogy a lövedék már csak egy meteort a világűr magassága a
megfigyeléseket.
A megfigyelők szeme előtt ismét feltűnt a holdtányér egyik szélétől a másikig
követhette volna. Barbicane elnök elgondolása kezdetben helyes magassága ezáltal is megérzik, mert a vonatok az olvasztott fémnek kellett kitölteni, amelyek a bolygók közé, a Herschel-földek itt kellett kikeresni egy pontot, ahol a két égitest vonzóereje
kiegyenlítődik, a lövedék kilövésénél, de addig szó sem jutott társainkat, mint az északi sarkig te