# H.P. Lovecraft Character-level RNN Text Generator
This notebook was used to train the TensorFlow generator model. The code is based on the TensorFlow RNN text-generation tutorial, available at https://www.tensorflow.org/text/tutorials/text_generation.


# Imports

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np
import os
import time

# Download corpus (2.75 MB) from GitHub

In [None]:
!wget https://github.com/droesler/HP_Lovecraft_RNN_Text_Generator/raw/main/lovecraft_split_sentences.txt

--2021-09-17 21:36:49--  https://github.com/droesler/HP_Lovecraft_RNN_Text_Generator/raw/main/lovecraft_split_sentences.txt
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/droesler/HP_Lovecraft_RNN_Text_Generator/main/lovecraft_split_sentences.txt [following]
--2021-09-17 21:36:49--  https://raw.githubusercontent.com/droesler/HP_Lovecraft_RNN_Text_Generator/main/lovecraft_split_sentences.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2886674 (2.8M) [text/plain]
Saving to: ‘lovecraft_split_sentences.txt’


2021-09-17 21:36:50 (45.7 MB/s) - ‘lovecraft_split_sentences.txt’ saved [2886674/2886674]

# Prepare Dataset

In [None]:
# Read corpus file
path_to_file = './lovecraft_split_sentences.txt'
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaking it.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')

Length of text: 2886590 characters


In [None]:
# Sanity-check the corpus by showing its first 250 characters
print(text[0:250])

THE NAMELESS CITY

When I drew nigh the nameless city I knew it was accursed.
I was traveling in a parched and terrible valley under the moon, and afar I saw it protruding uncannily above the sands as parts of a corpse may protrude from an ill-made g


In [None]:
# Build the vocabulary: every distinct character in the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print(f'{len(vocab)} unique characters')

91 unique characters


In [None]:
# Lookup layer that maps each character string to an integer ID.
# mask_token=None: no mask token is configured for this layer.
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
)

In [None]:
# Inverse lookup layer (integer ID -> character). It is built from the forward
# layer's own vocabulary so both layers agree on every ID. Uses the imported
# `preprocessing` alias, matching how `ids_from_chars` is constructed.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [None]:
def text_from_ids(ids):
  """Convert token IDs to text.

  Looks each ID up with `chars_from_ids` and joins the resulting characters
  along the last axis into a single string tensor.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [None]:
# Split the corpus into individual UTF-8 characters and encode each one as an ID
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding='UTF-8'))
all_ids

<tf.Tensor: shape=(2886590,), dtype=int64, numpy=array([47, 35, 32, ..., 46, 12,  1])>

In [None]:
# Wrap the encoded corpus in a tf.data pipeline that yields one character ID per element
ids_dataset = tf.data.Dataset.from_tensor_slices(tensors=all_ids)

In [None]:
# Number of characters in each training input
seq_length = 100
# Each example is cut from a chunk of seq_length + 1 characters, so this is the
# number of whole chunks the corpus can supply
examples_per_epoch = len(text) // (seq_length + 1)

In [None]:
# Group the ID stream into fixed-size chunks of seq_length + 1 IDs; a final
# chunk that comes up short is dropped
sequences = ids_dataset.batch(
    batch_size=seq_length + 1,
    drop_remainder=True,
)

# Show the first chunk as individual characters
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'T' b'H' b'E' b' ' b'N' b'A' b'M' b'E' b'L' b'E' b'S' b'S' b' ' b'C'
 b'I' b'T' b'Y' b'\n' b'\n' b'W' b'h' b'e' b'n' b' ' b'I' b' ' b'd' b'r'
 b'e' b'w' b' ' b'n' b'i' b'g' b'h' b' ' b't' b'h' b'e' b' ' b'n' b'a'
 b'm' b'e' b'l' b'e' b's' b's' b' ' b'c' b'i' b't' b'y' b' ' b'I' b' '
 b'k' b'n' b'e' b'w' b' ' b'i' b't' b' ' b'w' b'a' b's' b' ' b'a' b'c'
 b'c' b'u' b'r' b's' b'e' b'd' b'.' b'\n' b'I' b' ' b'w' b'a' b's' b' '
 b't' b'r' b'a' b'v' b'e' b'l' b'i' b'n' b'g' b' ' b'i' b'n' b' ' b'a'
 b' ' b'p' b'a'], shape=(101,), dtype=string)


In [None]:
# Decode the first five chunks back to text to confirm the chunking is contiguous
for seq in sequences.take(5):
  decoded = text_from_ids(seq)
  print(decoded.numpy())

b'THE NAMELESS CITY\n\nWhen I drew nigh the nameless city I knew it was accursed.\nI was traveling in a pa'
b'rched and terrible valley under the moon, and afar I saw it protruding uncannily above the sands as p'
b'arts of a corpse may protrude from an ill-made grave.\nFear spoke from the age-worn stones of this hoa'
b'ry survivor of the deluge, this great-grandfather of the eldest pyramid; and a viewless aura repelled'
b' me and bade me retreat from antique and sinister secrets that no man should see, and no man else had'


In [None]:
def split_input_target(sequence):
    """Split one sequence into an (input, target) training pair.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so the target is the input shifted
    one step ahead.
    """
    return sequence[:-1], sequence[1:]

In [None]:
# Turn every chunk into an (input, target) pair
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Show the first pair: the target should read as the input shifted by one character
for input_example, target_example in dataset.take(1):
    for label, example in (("Input :", input_example), ("Target:", target_example)):
        print(label, text_from_ids(example).numpy())

Input : b'THE NAMELESS CITY\n\nWhen I drew nigh the nameless city I knew it was accursed.\nI was traveling in a p'
Target: b'HE NAMELESS CITY\n\nWhen I drew nigh the nameless city I knew it was accursed.\nI was traveling in a pa'


In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

# Build the training pipeline directly from `sequences` instead of re-wrapping
# `dataset`. The previous form (`dataset = dataset.shuffle(...).batch(...)`)
# was self-referential, so re-running this cell batched an already-batched
# dataset; this form gives the same result on every run.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Create the language model

In [None]:
# Width of each character embedding vector
embedding_dim = 256

# Number of units in each GRU layer
rnn_units = 1024

# Vocabulary size, taken from the lookup layer's vocabulary
vocab_size = len(ids_from_chars.get_vocabulary())

In [None]:
# If/else conditions are used so that the model is able to predict based on 
# sequences, or read in a previous pair of hidden states in a single step mode.

class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> GRU -> Dense.

  Args:
    vocab_size: Number of token IDs; sizes the embedding table and the output
      layer.
    embedding_dim: Width of each embedding vector.
    rnn_units: Number of units in each of the two GRU layers.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The base constructor takes no positional arguments here; passing `self`
    # explicitly would hand it a stray extra argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru_1 = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.gru_2 = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states_list=None, return_state=False, training=False):
    """Run the network on a batch of token IDs.

    Args:
      inputs: Token IDs fed to the embedding layer.
      states_list: Optional two-element sequence with the initial states for
        `gru_1` and `gru_2` (in that order). When None, both GRUs are called
        without an explicit initial state.
      return_state: When True, also return the final states of both GRUs.
      training: Forwarded to every layer call.

    Returns:
      The dense layer's output, or a tuple of that output and
      `[state_1, state_2]` when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)

    if states_list is None:
      x, state_1 = self.gru_1(x, training=training)
      x, state_2 = self.gru_2(x, training=training)
    else:
      # Resume from the states carried over from a previous call.
      x, state_1 = self.gru_1(x, initial_state=states_list[0], training=training)
      x, state_2 = self.gru_2(x, initial_state=states_list[1], training=training)

    x = self.dense(x, training=training)

    if return_state:
      return x, [state_1, state_2]
    return x

In [None]:
# Instantiate the network. The vocabulary size is read from the `StringLookup`
# layer so the model's sizes match the IDs that layer produces.
model = MyModel(
    rnn_units=rnn_units,
    embedding_dim=embedding_dim,
    vocab_size=len(ids_from_chars.get_vocabulary()))

In [None]:
# Push one batch through the untrained model to check the output shape
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(inputs=input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 92) # (batch_size, sequence_length, vocab_size)


In [None]:
# Print the layer-by-layer summary, including per-layer parameter counts
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  23552     
_________________________________________________________________
gru_3 (GRU)                  multiple                  3938304   
_________________________________________________________________
gru_4 (GRU)                  multiple                  6297600   
_________________________________________________________________
dense_1 (Dense)              multiple                  94300     
Total params: 10,353,756
Trainable params: 10,353,756
Non-trainable params: 0
_________________________________________________________________


In [None]:
# from_logits=True tells the loss that predictions are raw logits; the model's
# final Dense layer is created without an activation argument.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Evaluate the untrained model's loss on the example batch as a baseline
example_batch_loss = loss(
    y_true=target_example_batch,
    y_pred=example_batch_predictions)
mean_loss = np.mean(example_batch_loss.numpy())
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 92)  # (batch_size, sequence_length, vocab_size)
Mean loss:         4.5221515


In [None]:
# Configure training: Adam optimizer with the sparse categorical cross-entropy loss
model.compile(
    loss=loss,
    optimizer='adam',
)

# Train model

In [None]:
# Checkpoints are written under this directory
checkpoint_dir = './training_checkpoints'
# File-name pattern for the checkpoints; "{epoch}" is left as a placeholder in the path
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:
# Number of training epochs passed to model.fit
EPOCHS = 30

In [None]:
# Train the model, writing a weights checkpoint through the callback
history = model.fit(
    dataset,
    callbacks=[checkpoint_callback],
    epochs=EPOCHS,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Create generator model

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so each call samples one next character.

  Args:
    model: Model called with `inputs`, `states_list` and `return_state=True`;
      expected to return `(logits, states_list)`.
    chars_from_ids: Lookup used to turn sampled IDs back into characters.
    ids_from_chars: Lookup used to turn input characters into IDs.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=0.25):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive logit mask: -inf at the "[UNK]" ID and 0 elsewhere, so that
    # "[UNK]" is never generated.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    num_tokens = len(ids_from_chars.get_vocabulary())
    mask_sparse = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        # One mask entry per vocabulary item
        dense_shape=[num_tokens])
    self.prediction_mask = tf.sparse.to_dense(mask_sparse)

  @tf.function
  def generate_one_step(self, inputs, states_list=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor; each string is split into UTF-8 characters.
      states_list: States forwarded to the wrapped model, or None.

    Returns:
      A tuple of the sampled characters and the states returned by the model.
    """
    # Strings -> characters -> dense tensor of token IDs.
    input_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states_list = self.model(
        inputs=input_ids, states_list=states_list, return_state=True)

    # Keep only the last position, apply the temperature, then add the mask
    # that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one token ID per batch row and drop the sample axis.
    sampled = tf.random.categorical(last_logits, num_samples=1)
    predicted_ids = tf.squeeze(sampled, axis=-1)

    # IDs -> characters, returned together with the updated model state.
    return self.chars_from_ids(predicted_ids), states_list

In [None]:
# Wrap the trained model for single-step sampling (default temperature)
one_step_model = OneStep(
    model=model,
    chars_from_ids=chars_from_ids,
    ids_from_chars=ids_from_chars,
)

# Generate text

In [None]:
# Generate 1000 characters after the prompt and time the whole run
start = time.time()
states_list = None
initial_prompt = 'The blasphemous'
next_char = tf.constant([initial_prompt])
generated = [next_char]

# Feed each sampled character and the returned states back into the next step
for _step in range(1000):
  next_char, states_list = one_step_model.generate_one_step(
      next_char, states_list=states_list)
  generated.append(next_char)

result = tf.strings.join(generated)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

The blasphemous place he had written me a very perceptible resemblance--or it was a paint and almost hidden and unaccountably lower to travel alone.
When I did glance down the street at a time without part that a mere mistake had been gruesomely dangerous that Kalos showed them in the air of the missing shed no light, and the curious inhibitions suggested by his companions and came again, and for all the human race had been the recorded speech concerned with the planetary glass of a human skull at a point clutched at its memory and more frequent is carved in London-window.
At last he had seen what we read of proportion in the antarctic, while at one side I could not well uncompany her.
I handed the door and all the time for position at all, but which had been a mistake; for they were not because they were so running with the supply of the guidance or purclasion.
She was not much more than a mask for almost unassistable fright--for in all that world of day the land dwelt in the black wo