<a href="https://colab.research.google.com/github/chambai/Deep_Learning_Course/blob/main/Week%204%20DL%20NLP%202/AustenCharLevelTextGenKeras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Character level text generation in the style of Jane Austen
Adapted from: https://stackabuse.com/python-for-nlp-deep-learning-text-generation-with-keras/ and https://keras.io/examples/generative/lstm_character_level_text_generation/

Uses an LSTM model to generate text one character at a time.

In [None]:
# import python libraries
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.utils import to_categorical
from random import randint
import re

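Use the Natural Language Toolkit (NLTK) library to download the dataset.  We are using NLTK's **Gutenberg corpus**, a small selection of English texts taken from Project Gutenberg (the full Gutenberg Dataset contains 3036 English books written by 142 authors; NLTK ships only a subset, whose file names are printed by the cell below).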

In [None]:
import nltk   # Natural Language Toolkit (NLTK) library
nltk.download('gutenberg')  # downloads the 'gutenberg' corpus data so that NLTK can read it

from nltk.corpus import gutenberg as gut  # imports the corpus reader for the Gutenberg texts under the short name `gut`
print(gut.fileids())    # prints the names of the files available in the corpus

The file austen-sense.txt contains raw text for the novel Sense and Sensibility by Jane Austen

In [None]:
# Read the complete novel into memory as one raw string,
# via the `gut` corpus reader imported in the cell above.
book_text = gut.raw('austen-sense.txt')

In [None]:
# KERAS - DO NOT RUN
# NOTE(review): the model-building and training cells further down read names
# defined in this cell (text, chars, char_indices, indices_char), so the marker
# above looks stale -- confirm before skipping this cell.

# Flatten the book onto a single line: newlines become spaces for nicer display.
text = book_text.replace("\n", " ")
# text is one long string, so slicing it yields its first characters.
print("First 10 elements of the corpus: ",text[:10])
print("Corpus length:", len(text))  # number of characters in the text

# chars is the vocabulary: every distinct character in the text, in sorted order.
chars = sorted(set(text))
print("unique charaters: ",chars)
print("Total chars:", len(chars))

# Two lookup tables over the vocabulary:
#   char_indices maps character -> integer index
#   indices_char maps integer index -> character
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
print("dictionary with characters as the keys: ",char_indices)
print("dictionary with indexes as the keys: ", indices_char)

# Cut the text into semi-redundant 'sentences' of maxlen characters.
# Consecutive sentences start `step` characters apart, so they overlap by
# (maxlen - step) characters.
maxlen = 40
step = 3
sentences = []   # input windows, each exactly maxlen characters long
next_chars = []  # next_chars[k] is the character that follows sentences[k] in the text
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sentences:", len(sentences))
print("sentence 1: ", sentences[0])
print("sentence 2: ", sentences[1])
print("sentence 3: ", sentences[2])
# Each sentence starts `step` characters after the previous one.
# The labels below now match the sentence numbers printed above.
print("next character for sentence 1: ", next_chars[0])
print("next character for sentence 2: ", next_chars[1])
print("next character for sentence 3: ", next_chars[2])

# One-hot encode the character data as booleans so that the network can use it:
#   x is 3D: (num sentences, max length of sentence, number of unique chars in corpus)
#   y is 2D: (num sentences, number of unique chars in corpus)
# For every character, the co-ordinate matching its vocabulary index is set to True.

# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):    # for each sentence
    for t, char in enumerate(sentence):     # for each character in that sentence
        # [index of sentence, position of character in sentence, vocabulary index of the character]
        x[i, t, char_indices[char]] = 1
    # [index of sentence, vocabulary index of the character that follows the sentence]
    y[i, char_indices[next_chars[i]]] = 1

print("boolean representation of sentence 0: ", x[0])
print("boolean representation of the next character after sentence 0: ", y[0])

In [None]:
# build the model
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np
import random
import io

# Assemble the network from a layer list:
#   one-hot windows of shape (maxlen, vocabulary size)
#   -> LSTM with 128 units
#   -> softmax over the vocabulary (one probability per character)
model = keras.Sequential(
    [
        Input(shape=(maxlen, len(chars))),
        LSTM(128),
        Dense(len(chars), activation="softmax"),
    ]
)

# Categorical cross-entropy pairs with the one-hot encoded targets in y.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")
model.summary()

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    Args:
        preds: 1-D array-like of non-negative weights (e.g. a softmax output).
            It is re-normalised here, so it need not sum exactly to 1.
        temperature: strictly positive float that reweights the distribution
            before sampling. 1.0 (the default) samples from `preds` as given;
            values below 1 favour the most likely entries, values above 1
            flatten the distribution.

    Returns:
        The sampled index, as returned by ``np.argmax``.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")
    preds = np.asarray(preds).astype("float64")
    # A probability of exactly 0 maps to log(0) = -inf and back to 0 after exp,
    # which is the desired result, so the divide-by-zero warning is silenced.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shift by the maximum so the largest term is exp(0) = 1; without this every
    # term can underflow to 0 at low temperatures, giving 0/0 below.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial draw yields a one-hot row; argmax recovers the chosen index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# train
epochs = 25
batch_size = 128

for epoch in range(epochs):
    # Fit for a single epoch per pass so that sample text can be produced in between.
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    # Seed the generator with a random maxlen-character window of the corpus.
    start_index = random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index : start_index + maxlen]
    print('...Generating with seed: "' + sentence + '"')

    new_chars = []
    for i in range(400):
        # One-hot encode the current window with a single indexed assignment:
        # position k of the window gets a 1.0 at the vocabulary index of its character.
        char_ids = [char_indices[char] for char in sentence]
        x_pred = np.zeros((1, maxlen, len(chars)))
        x_pred[0, np.arange(len(char_ids)), char_ids] = 1.0

        # Predict the next-character distribution and draw one character from it.
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_char = indices_char[next_index]

        # Slide the window: drop the oldest character, append the new one.
        sentence = sentence[1:] + next_char
        new_chars.append(next_char)

    generated = "".join(new_chars)
    print("...Generated: ", generated)
    print()