<a href="https://colab.research.google.com/github/gitaroktato/deep-learning-exercises/blob/verne-rnn-generator/Verne_text_generation_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text generation in the style of Jules Verne using RNN


In [6]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import platform
import time
import pathlib
import os

# Record the interpreter and framework versions used for this run.
print(f'Python version: {platform.python_version()}')
print(f'Tensorflow version: {tf.__version__}')
print(f'Keras version: {tf.keras.__version__}')

Python version: 3.10.12
Tensorflow version: 2.17.0
Keras version: 3.4.1


## Loading the dataset
Load the dataset from disk and take a look at the text.

In [2]:
path_to_file = './utazas_a_holdra.txt'
# Read the raw bytes and decode them explicitly as UTF-8, so the result does
# not depend on the platform's default text encoding. The context manager
# closes the file handle as soon as the read finishes.
with open(path_to_file, 'rb') as text_file:
    text = text_file.read().decode(encoding='utf-8')
# The length of the text is the number of characters in it.
print(f'Length of text: {len(text)} characters')

Length of text: 662630 characters


In [3]:
# Peek at the beginning of the corpus (the first 250 characters only).
print(text[0:250])

JULES VERNE
UTAZÁS A HOLDBA
______
UTAZÁS A HOLD KÖRÜL
KÉT REGÉNY
FORDÍTOTTA: KILÉNYI MÁRIA
2
TARTALOM
UTAZÁS A HOLDBA
1. A GUN CLUB
2. BARBICANE ELNÖK BEJELENTÉSE
3. A BEJELENTÉS HATÁSA
4. A CAMBRIDGE-I CSILLAGVIZSGÁLÓ VÁLASZA
5. A HOLD REGÉNYE
6. A


In [4]:
# Character-level vocabulary: every distinct character in the text, sorted.
# Besides ASCII, it also contains Hungarian accented letters.
vocab = sorted(set(text))

print(f'{len(vocab)} unique characters')
print('vocab:', vocab)

118 unique characters
vocab: ['\n', ' ', '!', '%', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'Á', 'É', 'Í', 'Ó', 'Ö', 'Ú', 'Ü', 'á', 'è', 'é', 'í', 'ó', 'ô', 'ö', 'ú', 'ü', 'Ő', 'ő', 'Ű', 'ű', 'π', '’', '”', '„', '−', '\uf8eb', '\uf8ec', '\uf8ed', '\uf8ee', '\uf8ef', '\uf8f0', '\uf8f6', '\uf8f7', '\uf8f8', '\uf8f9', '\uf8fa', '\uf8fb']


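## Process the text
Convert each character into a numeric ID with a `StringLookup` layer, and build the inverse layer that maps IDs back to characters.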


In [9]:
# Forward lookup layer: character -> integer ID, built from `vocab`.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
)

# Inverse lookup layer: integer ID -> character. It is built from the forward
# layer's get_vocabulary() so that both layers share the same index mapping.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
)

# Print the character -> ID mapping for the first 20 vocabulary entries,
# formatted like a dict literal.
print('{')
for char in vocab[:20]:
    print(f'  {char!r:4s}: {ids_from_chars(char):3d},')
print('  ...\n}')

{
  '\n':   1,
  ' ' :   2,
  '!' :   3,
  '%' :   4,
  "'" :   5,
  '(' :   6,
  ')' :   7,
  '*' :   8,
  '+' :   9,
  ',' :  10,
  '-' :  11,
  '.' :  12,
  '/' :  13,
  '0' :  14,
  '1' :  15,
  '2' :  16,
  '3' :  17,
  '4' :  18,
  '5' :  19,
  '6' :  20,
  ...
}
