In [1]:
# Deep Learning with Python Ch8: Nietzsche
#####################
#  text generation  #
#####################
# download and parse original text file to lowercase
import keras as kr
import numpy as np

# Fetch the Nietzsche corpus through Keras' download helper; `path` is the
# local file it returns.
path = kr.utils.get_file('nietzsche.txt', origin='http://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read inside a context manager so the file handle is closed right away
# instead of lingering until garbage collection.
with open(path, encoding='utf8') as corpus_file:
    text = corpus_file.read().lower()
print('Corpus length:', len(text))

Using TensorFlow backend.


Corpus length: 600893


In [2]:
# vectorize sequences using one-hot encoding
maxlen = 60  # number of characters in each extracted sequence
step = 3     # stride, in characters, between the starts of consecutive sequences

# Slide a window of `maxlen` characters over the corpus. The range stops at
# len(text)-maxlen so that the character right after each window exists.
# sentences: the extracted windows
sentences = [text[start:start + maxlen]
             for start in range(0, len(text) - maxlen, step)]
# nextchars: the character following each window (the prediction target)
nextchars = [text[start + maxlen]
             for start in range(0, len(text) - maxlen, step)]
print('# of sequences:', len(sentences))

# sorted list of the unique characters in the corpus
chars        = sorted(set(text))
# dictionary mapping each unique character to its index in `chars`;
# enumerate builds it in one pass instead of a chars.index() scan per character
char_indices = {char: index for index, char in enumerate(chars)}
print('# of unique characters:', len(chars))

# one-hot encoding
#   x[i, t, c] is True when position t of sequence i holds character index c
#   y[i, c]    is True when character index c follows sequence i
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[nextchars[i]]] = True

# of sequences: 200278
# of unique characters: 57


In [6]:
# build network with LSTM to predict next character
from keras import layers, models, optimizers

# Single-layer LSTM over one-hot character windows of shape
# (maxlen, len(chars)), followed by a softmax over the vocabulary that scores
# every possible next character.
model = models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
# NOTE(review): newer Keras releases spell this argument `learning_rate`;
# `lr` is kept to match the Keras version this notebook was run with.
optimizer = optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               95232     
_________________________________________________________________
dense_3 (Dense)              (None, 57)                7353      
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________


In [7]:
# train network and generate text from it
# fn to sample the next character given the model's prediction
# it reweights the original probability distribution the model outputs
# and draws a character index from the newly-weighted distribution
def sample(preds, temperature=1.0):
    """Draw one index from `preds` after temperature reweighting.

    The distribution is reweighted as p_i ** (1 / temperature) and
    renormalised. Temperatures below 1 sharpen it toward the most likely
    entry; temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: reweighting factor. A value <= 0 is treated as the
            greedy limit and returns the argmax of `preds` without sampling.

    Returns:
        Integer index (NumPy integer) of the sampled entry.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Limit of the reweighting as temperature -> 0; also avoids dividing by zero.
        return np.argmax(preds)
    # log(0) = -inf is the intended result for impossible entries, so the
    # divide-by-zero warning it triggers is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. This leaves the normalised result
    # unchanged but keeps exp() from underflowing to all zeros (and the
    # division below from producing NaN) at low temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot row; its argmax is the drawn index.
    probs = np.random.multinomial(1, preds, 1)
    return np.argmax(probs)

# train network and generate text using it
import random, sys

# Alternate one training pass with a round of text generation so the quality
# of the samples can be inspected after each epoch.
# only train the model for 2 epochs to save time
for epoch in range(1, 3): 
    print('epoch #', epoch)
    # fit model for one iteration of the data
    model.fit(x, y, batch_size=128, epochs=1) 
    
    # randomly select a text seed: a maxlen-character window of the corpus
    # (the upper bound keeps the whole window inside `text`)
    start_index = random.randint(0, len(text)-maxlen-1)
    seed_text   = text[start_index : start_index+maxlen]
    print('--- generating with seed: " '+ seed_text + ' " ')
    
    # the same seed is reused for every temperature
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print(' ')
        print('------ temperature:', temperature)
        sys.stdout.write(seed_text)
        
        # generate 100 characters starting from the seed text
        generated_text = seed_text
        for i in range(100):
            # one-hot encode the current maxlen-character window as a batch of one
            sampled = np.zeros([1, maxlen, len(chars)])
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1
            
            # sample the next character from the model's output for this window
            preds      = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char  = chars[next_index]
            
            # slide the window: append the new character and drop the oldest,
            # so generated_text keeps the same length
            generated_text += next_char
            generated_text  = generated_text[1:]
            
            sys.stdout.write(next_char)
        print(' ')

epoch # 1
Epoch 1/1
--- generating with seed: " lived; so that at
first the old motives of vehement passion  " 
 
------ temperature: 0.2
lived; so that at
first the old motives of vehement passion of the strued and the strength in the strenged of the suppicion of a so the signing the strength, an 
 
------ temperature: 0.5
lived; so that at
first the old motives of vehement passion that it we soul and strect of comple and of the person the self-distring mas to methonghous to so a  
 
------ temperature: 1.0
lived; so that at
first the old motives of vehement passion howed the hows see bath delicide? serive and a aysord find, conraraling, the empe ten ervatayed crad 
 
------ temperature: 1.2
lived; so that at
first the old motives of vehement passion the mo sumpiritabul
cimine,
churotion
charding, they refulapheroded develictas. far orrdinncogon rel 
epoch # 2
Epoch 1/1
--- generating with seed: "  the approval of all the positivists of
france and germany ( " 
 
------ temperature: 0.