<a href="https://colab.research.google.com/github/colivarese/Text-Generator-using-LSTM/blob/main/Text_Generator_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generator with a One-to-Many Neural Network LSTM Architecture

## Import dependencies

In [None]:
import os
import numpy as np
import re
import shutil
import tensorflow as tf

## Let's set a directory path where the model weights are saved during training; we will use it later.

In [None]:
# Base folder for everything this notebook writes (the current working directory).
DATA_DIR = "./"
# Folder that receives the weight checkpoints saved during training.
# NOTE(review): the folder name is spelled "chekpoints" (sic); it is kept as-is
# because it is a runtime path.
CHECKPOINT_DIR = os.path.join(DATA_DIR, "chekpoints")

# Preparing the dataset
## Let's define a function to download and read the text data from a list of URLs. In this case we will use the popular book Alice's Adventures in Wonderland by Lewis Carroll from Project Gutenberg.
### We will erase the byte order marks (BOM), replace the new lines with spaces so that the text becomes one long sentence, and concatenate the whole text.

In [None]:
def download_and_read(urls):
  """Download every URL and return all cleaned texts as one flat list of characters.

  Each URL is fetched with `tf.keras.utils.get_file` (called with
  `cache_dir="."`) under the file name `ex1-<index>.txt`. Every downloaded
  file is read, cleaned and appended, so the result covers all URLs in the
  order given.

  Args:
    urls: iterable of URL strings pointing at plain-text files.

  Returns:
    A list of single-character strings: the cleaned texts concatenated.
    An empty `urls` yields an empty list.
  """
  texts = []
  for i, url in enumerate(urls):
    p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url, cache_dir=".")
    # Read inside the loop so that every file is processed, not only the
    # last one. An explicit encoding keeps the BOM removal below independent
    # of the platform's default encoding, and `with` closes the handle.
    with open(p, "r", encoding="utf-8") as f:
      text = f.read()
    # Erase the byte order mark
    text = text.replace("\ufeff", "")
    # Change new lines for spaces, then collapse every whitespace run
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', " ", text)
    # `extend` on a string appends its characters one by one
    texts.extend(text)
  return texts
  
# Fetch the two plain-text files hosted on gutenberg.org and keep the
# cleaned result in `texts` for the cells below.
texts = download_and_read([
"http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
"https://www.gutenberg.org/files/12/12-0.txt"
])

Downloading data from http://www.gutenberg.org/cache/epub/28885/pg28885.txt
Downloading data from https://www.gutenberg.org/files/12/12-0.txt


## We will get the whole vocabulary of the text by creating a set from the text list and sorting it.

In [None]:
# Collect the distinct elements of `texts`, then order them so that the
# index assigned to each element is stable from run to run.
vocab = list(set(texts))
vocab.sort()
print('vocav size: {:d}'.format(len(vocab)))

vocav size: 86


## Map each character in the vocab to an index (and each index back to its character) so we can use it in the network.

In [None]:
# Forward lookup: vocabulary entry -> its position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup, derived from the forward one: position -> vocabulary entry.
idx2char = dict(zip(char2idx.values(), char2idx.keys()))

## Apply the mapping to the text and wrap the result in a dataset.

In [None]:
# Encode every element of `texts` as its vocabulary index.
texts_as_ints = np.array([char2idx[c] for c in texts])
# Wrap the index array in a tf.data.Dataset so it can be chunked and batched below.
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

## Define the length of each sequence to create, and set it as a batch.

In [None]:
# Number of time steps in each training input.
seq_length = 100
# Chunks hold seq_length + 1 elements: split_train_labels later drops the last
# element for the input and the first one for the target, leaving seq_length each.
# drop_remainder=True discards a trailing chunk that is too short.
sequences = data.batch(seq_length +1, drop_remainder=True)

## Let's create a function that splits each sequence into an input and a target, and let's use a batch size of 64.

In [None]:
def split_train_labels(sequence):
  """Split one chunk into an (input, target) pair shifted by one position.

  The input is the chunk without its last element and the target is the
  chunk without its first element, so target[t] is the element that
  follows input[t].
  """
  return sequence[:-1], sequence[1:]

# Turn every chunk into an (input, target) pair.
sequences = sequences.map(split_train_labels)
batch_size = 64
# Each example was cut as a chunk of seq_length + 1 elements, so that is the
# divisor that gives the number of examples; dividing by batch_size then gives
# the number of full batches in one pass over the text.
steps_per_epoch = len(texts) // (seq_length + 1) // batch_size
# Shuffle the examples and group them into fixed-size batches; incomplete
# batches are dropped.
dataset = sequences.shuffle(10000).batch(
    batch_size, drop_remainder=True)

## Let's define the model. Although the title mentions an LSTM, the recurrent layer used here is the GRU layer from Keras, with a Glorot uniform recurrent initializer and a sigmoid recurrent activation function.

In [None]:
class CharGenModel(tf.keras.Model):
    """Character-level language model: Embedding -> stateful GRU -> Dense.

    The final Dense layer has no activation, so `call` returns unnormalised
    logits over the vocabulary for every time step.

    Args:
        vocab_size: Number of distinct tokens; used as the embedding input
            size and as the width of the output layer.
        num_timesteps: Historically passed straight to the GRU as its number
            of units. That behaviour is kept whenever `rnn_units` is omitted,
            so existing callers are unaffected.
        embedding_dim: Size of each embedding vector.
        rnn_units: Optional width of the GRU layer. When left as None the
            value of `num_timesteps` is used.
        **kwargs: Forwarded to `tf.keras.Model`.
    """

    def __init__(self, vocab_size, num_timesteps,
            embedding_dim, rnn_units=None, **kwargs):
        super().__init__(**kwargs)
        # Fall back to the legacy choice (sequence length as hidden size)
        # unless the caller asks for an explicit width.
        gru_units = num_timesteps if rnn_units is None else rnn_units
        self.embedding_layer = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim
        )
        self.rnn_layer = tf.keras.layers.GRU(
            gru_units,
            recurrent_initializer="glorot_uniform",
            recurrent_activation="sigmoid",
            # The recurrent state is carried over between calls until it is
            # reset; return_sequences gives one output per time step.
            stateful=True,
            return_sequences=True
        )
        self.dense_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Run token ids through embedding, GRU and the output projection."""
        x = self.embedding_layer(x)
        x = self.rnn_layer(x)
        x = self.dense_layer(x)
        return x

## Lets build the model

In [None]:
# Hyper-parameters of the network.
vocab_size = len(vocab)
embedding_dim = 256
# NOTE(review): rnn_output_dim is not referenced anywhere else in the visible
# notebook; the GRU width comes from the second constructor argument, which is
# seq_length in the call below.
rnn_output_dim = 1024

# Instantiate the training model, fix its input shape to
# (batch_size, seq_length) and print the layer/parameter overview.
model = CharGenModel(vocab_size, seq_length, embedding_dim)
model.build(input_shape=(batch_size, seq_length))
model.summary()

Model: "char_gen_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  22016     
_________________________________________________________________
gru (GRU)                    multiple                  107400    
_________________________________________________________________
dense (Dense)                multiple                  8686      
Total params: 138,102
Trainable params: 138,102
Non-trainable params: 0
_________________________________________________________________


## Lets define a sparse categorical crossentropy function.

In [None]:
def loss(labels, predictions):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `from_logits=True` tells the loss that `predictions` have not been passed
  through a softmax.
  """
  crossentropy = tf.losses.sparse_categorical_crossentropy
  return crossentropy(labels, predictions, from_logits=True)
# Train with the Adam optimizer (default settings) and the logits-based loss defined above.
model.compile(optimizer=tf.optimizers.Adam(), loss=loss)

## Let's define a function to generate text with the trained network; by default we will generate 1000 characters.

In [None]:
def generate_text(model, prefix_string, char2idx, idx2char,
        num_chars_to_generate=1000, temperature=1.0):
    """Sample text from `model` one character at a time, starting from a prefix.

    Args:
        model: callable mapping a (1, T) batch of ids to per-step logits; it
            must also provide `reset_states()`.
        prefix_string: non-empty seed text; every character must be a key of
            `char2idx`.
        char2idx: mapping from character to integer id.
        idx2char: mapping from integer id back to character.
        num_chars_to_generate: number of characters to sample.
        temperature: positive divisor applied to the logits before sampling.

    Returns:
        `prefix_string` followed by the sampled characters.

    Raises:
        ValueError: if `prefix_string` is empty or `temperature` is not positive.
        KeyError: if `prefix_string` contains a character missing from `char2idx`.
    """
    if not prefix_string:
        raise ValueError("prefix_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    # Named `input_ids` rather than `input` so the builtin is not shadowed.
    input_ids = [char2idx[s] for s in prefix_string]
    input_ids = tf.expand_dims(input_ids, 0)
    text_generated = []
    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_chars_to_generate):
        preds = model(input_ids)
        preds = tf.squeeze(preds, 0) / temperature
        # Sample for every time step but keep only the draw for the last one.
        pred_id = tf.random.categorical(preds, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[pred_id])
        # After the prefix, only the newly sampled character is fed back.
        input_ids = tf.expand_dims([pred_id], 0)

    return prefix_string + "".join(text_generated)

## Do not print warnings

In [None]:
# Set the TF v1-compat logging verbosity to ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

## Let's train the model for 200 epochs and print the generated text every 10 epochs; we can see that the text gets better as training progresses.

In [None]:
num_epochs = 200
# Number of epochs trained between two checkpoints / text samples.
epochs_per_round = 10

# Make sure the checkpoint folder exists before the first save.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for i in range(num_epochs // epochs_per_round):
    model.fit(
        dataset.repeat(),
        epochs=epochs_per_round,
        steps_per_epoch=steps_per_epoch,
        verbose=0
    )
    # Total epochs trained so far; used to label the checkpoint and the
    # sample so that both carry the real epoch number rather than the
    # round index.
    epochs_done = (i + 1) * epochs_per_round
    checkpoint_file = os.path.join(
        CHECKPOINT_DIR, "model_epoch_{:d}".format(epochs_done))
    model.save_weights(checkpoint_file)

    # Sampling feeds a single sequence at a time, so a second model is
    # created with a batch dimension of 1 and given the weights just saved.
    gen_model = CharGenModel(vocab_size, seq_length, embedding_dim)
    gen_model.load_weights(checkpoint_file)
    gen_model.build(input_shape=(1, seq_length))

    print("after epoch: {:d}".format(epochs_done))
    print(generate_text(gen_model, "Alice ", char2idx, idx2char))
    print("---")

Alice long, now, then _I_ frouthe one that go in a tHed you won’t, she head on a conent.” “LIt’s holding of horse!’ ‘Bew! Feim, you all mean, for free of elence. “I was lowight: “there’s tome, till you’d bedn poor creath that it make. Now it when they hureds. “But it’s not lighation than the horse the froll of ‘Whead from in the shelf, you know, I must only a Queen.” “I was verg to fore’s a five that tw, emp, there was in the way, what held, but taking them jour during to you concein visiver at one I begoring her feet.” And he four you.” But called and the Sheep: “questidy. “It’s think bies of it.” “I’m sut he looked as it’s got proplest took turn; and what _must_ very go over the are way side,” Alice liler people if you don’t head and trying up the wepting on till wheerd little with the sit quite bricked any more, and anyone brown are in that Alice gently heaphing oa sight a borts watching themm out of charge on her next to childument it besard, but I stood you any vairn at the Projec