# Character Level RNN using LSTM cells.

- Trained on 1MB of Shakespeare.
- Outputs "fake" Shakespeare.

Much comes from a [Keras example](https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py).

## Setup Environment

- Import Keras
- Open up the Shakespeare corpus
- Give each leter an index and create dictionaries to translate from index to character.

In [1]:
## Much borrowed from https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model
import numpy as np
import random
import sys

#path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open("tiny-shakespeare.txt").read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
vocabulary_size = len(chars)
print('total chars:', vocabulary_size)
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

Using TensorFlow backend.


corpus length: 1115394
total chars: 39


## Setup Training Data

- Cut up the corpus into sequences of 40 characters.
- Change indexes into "one-hot" vector encodings.

In [2]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 50
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

X = np.zeros((len(sentences), maxlen), dtype=int)
y = np.zeros((len(sentences), vocabulary_size), dtype=np.bool)

for i in range(len(sentences)):
    X[i] = np.array(map((lambda x: char_indices[x]), sentences[i]))
    y[i, char_indices[next_chars[i]]] = True
print("Done converting y to one-hot.")
print("Done preparing training corpus, shapes of sets are:")
print("X shape: " + str(X.shape))
print("y shape: " + str(y.shape))

nb sequences: 371782
Done converting y to one-hot.
Done preparing training corpus, shapes of sets are:
X shape: (371782, 50)
y shape: (371782, 39)


## Model

- Model has one hidden layer of 128 LSTM cells.
- Input layer is an Embedding to convert from indices to a vector encoding automatically (common trick - but does it work?)

In [3]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(Embedding(vocabulary_size, vocabulary_size, input_length=maxlen))
model.add(LSTM(128))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))
model.summary()

# model.add(LSTM(128, return_sequences=True))
#model.add(Dropout(0.5))
#model.add(LSTM(128, return_sequences=True))
#model.add(Dropout(0.5))

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 39)            1521      
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               86016     
_________________________________________________________________
dense_1 (Dense)              (None, 39)                5031      
_________________________________________________________________
activation_1 (Activation)    (None, 39)                0         
Total params: 92,568.0
Trainable params: 92,568.0
Non-trainable params: 0.0
_________________________________________________________________


## Training

- Train on batches of 128 examples

In [4]:
# Training the Model.
model.fit(X, y, batch_size=128, epochs=1)
#model.save("keras-shakespeare-LSTM-model-emb.h5")

Epoch 1/1
 12032/371782 [..............................] - ETA: 633s - loss: 2.6997 

KeyboardInterrupt: 

## Test the Model

- Take a quote then add 400 characters.

In [5]:
model = load_model("keras-shakespeare-LSTM-model-emb.h5")
quote = "Be not afraid of greatness: some are born great, some achieve greatness, and some have greatness thrust upon them."
quote = quote.lower()

def sample_model(seed, length=400):
    generated = ''
    sentence = seed.lower()[:50]
    generated += sentence
    print("Seed: ", generated)
    
    for i in range(length):
        x = np.array(map((lambda x: char_indices[x]), sentence))
        x = np.reshape(x,(1,50))
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, 0.5)
        next_char = indices_char[next_index]
        
        generated += next_char
        sentence = sentence[1:] + next_char
    print("Generated: ", generated)

sample_model(quote, 1000)

Seed:  be not afraid of greatness: some are born great, s




Generated:  be not afraid of greatness: some are born great, see
to has the house be tood the ponubent the pray their good provilly
that him, and an ab'ertant, and i have they shall one made
and mother off one a thing and the many sold you:
and the seen the man sund thy life dead.

bevoltinis:
thou art a fine the puts that i shall you are
the courten the break the pite a man in this or peerse
to hourm this provess the life to power to bow
with shall present, and sented son, the way!

bionces:
sir, so not the the whom and the hang in the caus
the great thou thou spake his and see the could no gaint,
and an absolence and the speak our no the first sonse and her brother
they a duke and i presence and such withbour.

secuietan:
who well a previness a partion.

first i hasty,
say the from it a like one we say, and and repent,
and the thing the signess bods thee,
perk! then a first be breaks in the since his stones.

first senate of york:
sweet a comment the little from is that i have you a 