<a href="https://colab.research.google.com/github/fhizal/Implementing-character-level-LSTM-text-generation/blob/main/Implementing_character_level_LSTM_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow.keras
import numpy as np

# Download the Nietzsche corpus (or reuse the copy Keras already cached) and
# load it as one lower-cased string.
path = tensorflow.keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read through a context manager so the file handle is closed promptly, and pin
# the encoding: the corpus contains non-ASCII characters (e.g. 'ä', 'é'), so
# relying on the platform default encoding can fail or mis-decode them.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('Corpus length:', len(text))

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
Corpus length: 600893


In [None]:
# Window length: every training example is `maxlen` consecutive characters
maxlen = 60

# Stride between the starting offsets of consecutive windows
step = 3

# Starting offset of every window that still has a follow-up character
window_starts = range(0, len(text) - maxlen, step)

# The input windows, and for each one the character that comes right after it
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('Number of sequences:', len(sentences))

# Sorted vocabulary: every distinct character that occurs in the corpus
chars = sorted(set(text))
print('Unique characters:', len(chars))
# Map each character to its position in `chars`. `enumerate` yields the same
# mapping as `chars.index(char)` without a linear scan for every entry.
char_indices = {char: index for index, char in enumerate(chars)}

# Next, one-hot encode the characters into binary arrays.
# `x[i, t, c]` is True when position `t` of sentence `i` holds character `c`;
# `y[i, c]` is True when `c` is the character that follows sentence `i`.
print('Vectorization...')
# Use the builtin `bool` as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
# Sanity output: shapes and one (still unset) cell of each array
print(x.shape)
print(x[0,0,0])
print(y.shape)
print(y[0,0])
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 200278
Unique characters: 57
Vectorization...
(200278, 60, 57)
False
(200278, 57)
False


In [None]:
# First training window: `maxlen` characters starting at offset 0 of the corpus
sentences[0]

'preface\n\n\nsupposing that truth is a woman--what then? is the'

In [None]:
# First character of the first training window
sentences[0][0]

'p'

In [None]:
# Vocabulary index assigned to the character 'r'
char_indices['r']

44

In [None]:
# The target row for window 0 must be set at the index of its follow-up
# character. Look the index up instead of hard-coding it, so the check keeps
# working if the corpus (and therefore the vocabulary order) changes.
y[0, char_indices[next_chars[0]]]

True

In [None]:
# The first time step of window 0 must be set at the index of that window's
# first character. Look the index up instead of hard-coding it, so the check
# keeps working if the corpus (and therefore the vocabulary order) changes.
x[0, 0, char_indices[sentences[0][0]]]

True

In [None]:
# Collapse the time axis of the first window: one occurrence count per
# vocabulary entry
x[0].sum(axis=0)

array([3, 8, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 5, 0, 1, 0, 4, 1, 1, 5, 3, 0, 0, 0, 1, 3, 2, 3, 0,
       2, 4, 7, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Last five entries of the sorted vocabulary
chars[-5:]

['z', 'ä', 'æ', 'é', 'ë']

In [None]:
# Number of training windows extracted from the corpus
len(sentences)

200278

In [None]:
# Spot-check one window's length; each window is a `maxlen`-character slice
len(sentences[3])

60

In [None]:
# Fifth window; it starts at corpus offset 4 * step
sentences[4]

'pposing that truth is a woman--what then? is there not groun'

In [None]:
# One follow-up character is stored per window, so this matches len(sentences)
len(next_chars)

200278

In [None]:
# Each target is a single character
len(next_chars[5])

1

In [None]:
# Character that follows the first window in the corpus
next_chars[0]

'r'

In [None]:
# Vocabulary size: one entry per distinct character in the corpus
len(char_indices)

57

In [None]:
# All characters in the vocabulary, in the sorted order of `chars`
char_indices.keys()

dict_keys(['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë'])

In [None]:
# Vocabulary index assigned to the character 'd'
char_indices['d']

30

In [None]:
from tensorflow.keras import layers

# A single LSTM layer reads a window of one-hot characters; the softmax layer
# then scores every vocabulary entry as the candidate next character.
model = tensorflow.keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation='softmax'),
])

# Targets are one-hot rows, hence categorical cross-entropy.
optimizer = tensorflow.keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [None]:
def sample(preds, temperature=1.0):
    """Draw one class index from `preds` after temperature re-weighting.

    The distribution is re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the result.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output row).
        temperature: Re-weighting factor. Values below 1 sharpen the
            distribution, values above 1 flatten it. A value of exactly 0
            selects the most probable index deterministically.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the re-weighting as temperature -> 0: always pick the mode.
        # Without this branch the division below would be by zero.
        return np.argmax(preds)
    # log(0) is -inf, which maps back to probability 0 after exp(); silence
    # the divide-by-zero warning NumPy would otherwise emit for it.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating. Softmax is invariant to the
    # shift, and it keeps the largest term at exp(0) = 1, so very small
    # temperatures no longer underflow every term to 0 (which gave 0/0 = NaN).
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
import random
import sys

# Alternate between one epoch of training and a round of text generation so
# the quality of the samples can be followed as training progresses.
# Note: range(1, 60) runs epochs 1 through 59.
for epoch in range(1, 60):
    print('epoch', epoch)
    # Fit the model for 1 epoch on the available training data
    model.fit(x, y,
              batch_size=128,
              epochs=1)

    # Select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + seed_text + '"')

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ temperature:', temperature)
        # Restart from the printed seed for every temperature. Previously the
        # window carried over from the preceding temperature, so later runs
        # were seeded with generated text rather than the announced seed and
        # the temperatures could not be compared like-for-like.
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # We generate 400 characters
        for i in range(400):
            # One-hot encode the current window as a batch of one
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window: drop the oldest character, append the new one
            generated_text = generated_text[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

epoch 1
--- Generating with seed: "e
savage is to forgetfulness, how his mind, after the least "
------ temperature: 0.2
e
savage is to forgetfulness, how his mind, after the least of the more of the most to the would to the most man is and the proses and the most to the most perhaps and the more of the most to the present and the seess to the most to the one to the world to the present to the more of the most to the most personess and the most perso and the present to the senses and and to the more and the present in the most different of the philosophers and the preses and
------ temperature: 0.5
in the most different of the philosophers and the preses and soul and also and the suest in its its untorman in purso the one more suffere and surenes and restent to its who one are
to he has such and in the greaterous it is the man in the greets of the mened with the preasure
in the most can such are supersine to should man
and every who are to the langer of that in the lought and the prove

  This is separate from the ipykernel package so we can avoid doing imports until


than the a stinger soul, the subject and beginning of the desire to account--to their science, and that it is a sunsion of the same other does not only the belief in the and
his
true the good every still individualitied and also its habit of the deteriorate and condition, indivirable for the soul of the same words of the hast could have still as the our tested the philosopher 
------ temperature: 1.0
the hast could have still as the our tested the philosopher herporcation, als, also pleasicing, undivingers, to balarnian still standlame and his still rosing. how its
invery know--a sun bet musy--as to you permit witness that all god. i upshin become befire on of its lip"
psocirity
indifferent incerned for the priy-theur power aso are inausposed a still meland its spirit,
often mynesy "
necepobl-ne beling in the dution;" aedot turn refundedesty for elso o
------ temperature: 1.2
ne beling in the dution;" aedot turn refundedesty for elso of hive
art upon an mraint
interlect,
a, to him, onl