In [1]:
'''
From Keras examples:
https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

Example script to generate text from Nietzsche's writings.

At least 20 epochs are required before the generated text
starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

Using Theano backend.


In [2]:
# Download the corpus; `path` is the location that get_file hands back for the fetched file.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Read through a context manager so the file handle is closed as soon as the
# contents are in memory, then lower-case the whole corpus.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

corpus length: 600901


In [5]:
# `text` is one long string. Collecting it into a set keeps each distinct
# character exactly once (letters plus whatever other symbols occur), and
# sorting gives every character a fixed position in the list.
chars = sorted(set(text))
print('total chars:', len(chars))

total chars: 59


In [6]:
# Two lookup tables over the sorted character list:
#   char_indices: character -> integer index
#   indices_char: integer index -> character
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [17]:
# Walk through the corpus `step` characters at a time. Each window of
# `maxlen` characters becomes one input sequence, and the single character
# immediately after the window is the target to predict.
# With step == maxlen the windows do not overlap.
maxlen = 20
step = 20

window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 30045


In [18]:
"""
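This is important:

Here we vectorize the text representation...
Turn it into X and y arrays that a neural network can handle.

X is a 3-D one-hot array:
* first axis: one entry per maxlen-character chunk
* second axis: one entry per character position within the chunk
* third axis: one entry per distinct character in the corpus

y is a 2-D one-hot array: one row per chunk, marking the character that follows it.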
"""

print('Vectorization...')
# One-hot encode the chunks:
#   X[i, t, k] is True when position t of chunk i holds character k
#   y[i, k]    is True when the character following chunk i is character k
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24; `bool` works on every version.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [19]:
# Construct the neural network:
# a Long Short-Term Memory (LSTM) layer with 64 units reads each one-hot
# encoded chunk, and a dense softmax layer outputs one probability per character.
print('Build model...')
model = Sequential([
    LSTM(64, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

Build model...


In [20]:
# RMSprop (learning rate 0.01) is the optimizer that fits the network weights;
# categorical cross-entropy is the loss it minimizes.
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [21]:
"""
Helper function: sample()

Given an experiment performed multiple times,
with a fixed set of possible outcomes (preds),
we can use a multinomial distribution to quantify 
the probability of a certain set of outcomes.

Running the experiment `n` times yields a vector of counts
`X = [X_0, X_1, ..., X_p]`, where `X_i` is the number of times
the outcome was `i`.

This function draws a single sample (n = 1) from the multinomial distribution
defined by the temperature-adjusted probabilities, and returns the index of
the outcome that was drawn (which is not necessarily the most likely one).
"""

def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array, sharpened or flattened by a temperature.

    Each probability is re-weighted as ``exp(log(p) / temperature)`` and the
    result is renormalised to sum to one. A single draw is then taken from
    that distribution with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities, one per possible outcome.
        temperature: divisor applied to the log-probabilities. Values below
            1 concentrate the distribution on the likeliest outcomes; values
            above 1 flatten it. Must be non-zero.

    Returns:
        The integer index of the outcome that was drawn.

    Raises:
        ValueError: if ``temperature`` is zero.
    """
    if temperature == 0:
        raise ValueError('temperature must be non-zero')

    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which is the correct logit for an impossible outcome
    # (it maps back to probability 0 below), so the divide warning is noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the largest logit before exponentiating. The softmax is
    # unchanged, but the largest term becomes exp(0) = 1, so the sum can no
    # longer underflow to 0 (giving 0/0 = NaN) at small temperatures.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial gives a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [23]:
"""
Training the Model:


"""



# Train the model one pass at a time and, after every pass, print text
# generated from the current weights at several diversity settings.
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=64, nb_epoch=1)

    # One random seed position per iteration, shared by every diversity setting.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        sentence = seed_text
        generated = '' + sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for _ in range(400):
            # One-hot encode the current window as a batch containing one sequence.
            x = np.zeros((1, maxlen, len(chars)))
            positions = np.arange(len(sentence))
            x[0, positions, [char_indices[c] for c in sentence]] = 1.

            # Predict the next-character distribution and draw from it,
            # using the diversity value as the sampling temperature.
            preds = model.predict(x, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Slide the window: drop the oldest character, append the new one.
            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1


INFO (theano.gof.compilelock): Refreshing lock /Volumes/noospace/Users/charles/.theano/compiledir_Darwin-15.5.0-x86_64-i386-64bit-i386-2.7.10-64/lock_dir/lock
INFO (theano.gof.compilelock): Refreshing lock /Volumes/noospace/Users/charles/.theano/compiledir_Darwin-15.5.0-x86_64-i386-64bit-i386-2.7.10-64/lock_dir/lock


Epoch 1/1

----- diversity: 0.2
----- Generating with seed: "is to say, to a move"
is to say, to a move

INFO (theano.gof.compilelock): Refreshing lock /Volumes/noospace/Users/charles/.theano/compiledir_Darwin-15.5.0-x86_64-i386-64bit-i386-2.7.10-64/lock_dir/lock


 the and and the ande the rever and and and the erelis as and the rolity and and and and the ingere the and and the ande the and and and the and as and the and and the relity the and and the romals and conder the and the is and and the and the and the and the andes of and the and and the ares and the relfen the and the and and and and and to the ander and the relical of and the and the relile and 

----- diversity: 0.5
----- Generating with seed: "is to say, to a move"
is to say, to a move so the sily and it is eneriage prolly of a thoury hact mose to the asues and and the ands betheret and, assulite the as as of contt is and as the promentingaly the an ertean a dera us the is ponce mans as as the ascese of comalale and the and to the and the domale the erome un to the hamer do prore sorat of the resting the who ane the leat the bece fhim so the of the wicilist the ander the forel 

----- diversity: 1.0
----- Generating with seed: "is to say, to a move"
is to say, to a movesh therrty t

KeyboardInterrupt: 