# Character Level RNN using LSTM cells.

- Trains on Star Trek episode titles
- Outputs "fake" titles.

Much comes from a [Keras example](https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py).

## Setup Environment

- Import Keras
- Open up the Star Trek corpus
- Give each letter an index and create dictionaries to translate between characters and indices (in both directions).

In [1]:
## Much borrowed from https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model
import numpy as np
import random
import sys

# Read the whole corpus and lower-case it so the character vocabulary stays
# small. The context manager closes the file handle as soon as the read is
# done instead of leaving it open until garbage collection.
# NOTE(review): the file is read with the platform/default encoding; the
# sampled seed printed as "�" further down suggests non-ASCII bytes in the
# corpus -- confirm the file's encoding.
with open("startrekepisodes.txt") as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# Vocabulary: every distinct character of the corpus, sorted so that the
# index assigned to each character is reproducible between runs.
chars = sorted(set(text))
vocabulary_size = len(chars)
print('total chars:', vocabulary_size)

# Lookup tables between characters and their integer indices.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

def sample(preds, temperature=1.0):
    """Draw one index from a probability array, sharpened by a temperature.

    Each probability ``p`` is re-weighted to ``p ** (1 / temperature)`` and
    the result is renormalised before a single multinomial draw is made.
    Temperatures below 1 favour the most likely entries; temperatures above
    1 flatten the distribution.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Positive scaling factor applied to the log-probabilities.

    Returns:
        The index of the entry that was drawn.
    """
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which maps back to probability 0 after exp(); only the
    # divide-by-zero warning is silenced, the result is unchanged.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. Softmax is invariant to this shift,
    # but without it a small temperature makes every exp() underflow to 0
    # and the normalisation below becomes 0/0 (NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

Using TensorFlow backend.


corpus length: 11017
total chars: 52


In [2]:
# Title-length statistics: the corpus is split on newlines and the length of
# every resulting title is measured.
titles = text.split('\n')
lengths = np.array(list(map(len, titles)))
print("Max:", lengths.max())
print("Mean:", lengths.mean())
print("Median:", np.median(lengths))
print("Min:", lengths.min())

# Hence 30 is chosen as the sequence length to train on.

Max: 50
Mean: 14.0108991826
Median: 13.0
Min: 2


## Setup Training Data

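- Cut up the corpus into overlapping sequences of 30 characters (`maxlen`), starting a new sequence every 3 characters (`step`).
- Keep the inputs as character indices (the Embedding layer consumes them) and change each target character into a "one-hot" vector encoding.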

In [3]:
# Cut the text into semi-redundant (overlapping) windows of maxlen
# characters; the character right after each window is its prediction target.
maxlen = 30
step = 3
starts = range(0, len(text) - maxlen, step)
sentences = [text[i: i + maxlen] for i in starts]
next_chars = [text[i + maxlen] for i in starts]
print('nb sequences:', len(sentences))

# X holds character indices (one row per window); y holds one-hot targets.
X = np.zeros((len(sentences), maxlen), dtype=int)
# Plain `bool` instead of `np.bool`: the alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
y = np.zeros((len(sentences), vocabulary_size), dtype=bool)

for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
    X[i] = [char_indices[c] for c in sentence]
    y[i, char_indices[next_char]] = True
print("Done preparing training corpus, shapes of sets are:")
print("X shape: " + str(X.shape))
print("y shape: " + str(y.shape))
print("Vocabulary of characters:", vocabulary_size)

nb sequences: 3663
Done preparing training corpus, shapes of sets are:
X shape: (3663, 30)
y shape: (3663, 52)
Vocabulary of characters: 52


In [4]:
def model_maker(model, layer_size=64, dropout_rate=0.5, num_layers=1, vocab_size=20, input_length=1, lr=0.01, train_mode=True):
    """Add the layers of a character-level RNN to ``model`` and compile it.

    The layers are added to ``model`` in place: an Embedding, then the LSTM
    layers (each preceded by a Dropout layer), then a softmax Dense layer of
    ``vocab_size`` units. The model is compiled with categorical
    cross-entropy and RMSprop, its summary is printed, and nothing is
    returned.

    Args:
        model: Model object the layers are added to (a ``Sequential`` in
            this notebook).
        layer_size: Embedding output size and number of units per LSTM layer.
        dropout_rate: Rate given to every Dropout layer.
        num_layers: Number of LSTM layers; at least one is always added.
        vocab_size: Embedding input size and width of the output layer.
        input_length: Number of time steps in each input sequence.
        lr: Learning rate handed to RMSprop.
        train_mode: When true, the batch dimension is left open (``None``)
            and the LSTMs are stateless; otherwise the batch size is fixed
            to 1 and the LSTMs are stateful.
    """
    stateful = not train_mode
    batch_size = 1 if stateful else None

    # Input embedding: character indices -> vectors of size layer_size.
    model.add(Embedding(vocab_size, layer_size, input_length=input_length,
                        batch_input_shape=(batch_size, input_length)))

    # Every LSTM except the last one returns its full output sequence so the
    # next LSTM can consume it; the last one returns only its final output.
    sequence_flags = [True] * (num_layers - 1) + [False]
    for return_sequences in sequence_flags:
        model.add(Dropout(dropout_rate))
        model.add(LSTM(layer_size, return_sequences=return_sequences, stateful=stateful))

    # Project back to the vocabulary.
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
    model.summary()


In [9]:
# Build a training-mode model with model_maker. The input length follows
# maxlen (the width of the training windows in X) instead of repeating the
# literal 30, so the two cannot drift apart.
m = Sequential()
model_maker(m, layer_size=128, vocab_size=vocabulary_size, input_length=maxlen, train_mode=True)

Building a model!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 128)           6272      
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 49)                6321      
Total params: 144,177
Trainable params: 144,177
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train the model_maker-built model `m` on (X, y) for 5 epochs with batches
# of 64. Left as a bare expression so the notebook displays the returned
# History object.
m.fit(X, y, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11d56a208>

Building a model!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 64)             1280      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 64)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
Total params: 35,604
Trainable params: 35,604
Non-trainable params: 0
_________________________________________________________________


## Model

- Model has two stacked hidden layers of 256 LSTM cells each, with dropout before each LSTM layer.
- Input layer is an Embedding to convert from indices to a vector encoding automatically (common trick - but does it work?)

In [8]:
layer_size = 256
dropout_rate = 0.5

# Build the training model: an embedding followed by two stacked LSTM layers
# (each preceded by dropout) and a softmax projection back to the vocabulary.
print('Build model...')
model_train = Sequential([
    Embedding(vocabulary_size, layer_size, input_length=maxlen),
    # LSTM part
    Dropout(dropout_rate),
    LSTM(layer_size, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(layer_size),
    # Project back to vocabulary
    Dense(vocabulary_size),
    Activation('softmax'),
])
model_train.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))
model_train.summary()



Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 256)           13312     
_________________________________________________________________
dropout_5 (Dropout)          (None, 30, 256)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 30, 256)           525312    
_________________________________________________________________
dropout_6 (Dropout)          (None, 30, 256)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_3 (Dense)              (None, 52)                13364     
_________________________________________________________________
activation_3 (Activation)    (None, 52)                0     

## Training

- Train on batches of 64 examples

In [9]:
# Train model_train on (X, y) for 10 epochs with batches of 64.
# Left as a bare expression so the notebook displays the returned History
# object.
model_train.fit(X, y, batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11ed61c10>

In [16]:
# Call fit again on the same model_train object for 20 more epochs
# (batches of 64).
model_train.fit(X, y, batch_size=64, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x124bdca10>

In [17]:
# Save the trained model to "keras-startrek-LSTM-model.h5" (relative to the
# current working directory) so it can be reloaded later with load_model.
model_train.save("keras-startrek-LSTM-model.h5")

## Test the Model

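- Take a seed string, then generate further characters from it (400 by default; the example below generates 1000 from a random one-character seed).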

### Make a Decoder model

- Needs input length of 1.
- Needs batch size of 1.
- Needs the LSTM layers to be stateful.
- Check that the parameter counts are the same as for model_train.

In [5]:
# Reload the training model from "keras-startrek-LSTM-model.h5" (the file
# name used by the save cell); only needed when model_train is not already
# in memory.
model_train = load_model("keras-startrek-LSTM-model.h5")

In [10]:
# Decoding model: same layer stack as model_train, but it consumes one
# character at a time (input length 1, batch size 1) and its LSTMs are
# stateful.
layer_size = 256
dropout_rate = 0.5

model_dec = Sequential([
    Embedding(vocabulary_size, layer_size, input_length=1, batch_input_shape=(1, 1)),
    # LSTM part
    Dropout(dropout_rate),
    LSTM(layer_size, stateful=True, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(layer_size, stateful=True),
    # Project back to vocabulary
    Dense(vocabulary_size),
    Activation('softmax'),
])
model_dec.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))
model_dec.summary()

# Copy the weights over from the training model.
model_dec.set_weights(model_train.get_weights())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, 1, 256)               13312     
_________________________________________________________________
dropout_7 (Dropout)          (1, 1, 256)               0         
_________________________________________________________________
lstm_7 (LSTM)                (1, 1, 256)               525312    
_________________________________________________________________
dropout_8 (Dropout)          (1, 1, 256)               0         
_________________________________________________________________
lstm_8 (LSTM)                (1, 256)                  525312    
_________________________________________________________________
dense_4 (Dense)              (1, 52)                   13364     
_________________________________________________________________
activation_4 (Activation)    (1, 52)                   0         
Total para

In [11]:
## Sampling function

def sample_model(seed, model_name, length=400):
    """Generate text from a stateful, one-character-at-a-time charRNN.

    The recurrent state of ``model_name`` is reset, the lower-cased seed is
    fed in character by character to prime that state, and then ``length``
    further characters are sampled (temperature 0.5), each one being fed
    back in as the next input. The seed and the full generated text are
    printed; nothing is returned.

    Args:
        seed: Non-empty string to start from. Every character (after
            lower-casing) must be a key of ``char_indices``.
        model_name: Decoding model that accepts input of shape (1, 1),
            e.g. ``model_dec``.
        length: Number of characters to generate after the seed.

    Raises:
        ValueError: If ``seed`` is empty.
        KeyError: If the seed contains a character outside the vocabulary.
    """
    sentence = seed.lower()
    if not sentence:
        raise ValueError("seed must contain at least one character")
    print("Seed: ", sentence)

    # Start from a clean recurrent state so that earlier calls do not leak
    # into this sample.
    model_name.reset_states()

    # Prime the state with every seed character but the last; the model
    # takes exactly one character per call, so longer seeds cannot be fed
    # in a single (1, 1) input.
    for char in sentence[:-1]:
        model_name.predict(np.array([[char_indices[char]]]), verbose=0)

    pieces = [sentence]
    current = sentence[-1]
    for _ in range(length):
        # A list-built array (not np.array(map(...))) also works on Python 3.
        x = np.array([[char_indices[current]]])
        preds = model_name.predict(x, verbose=0)[0]
        next_index = sample(preds, 0.5)
        current = indices_char[next_index]
        pieces.append(current)
    print("Generated: ", "".join(pieces))

In [12]:
# Generate 1000 characters with the decoding model, seeded with a single
# character picked at random from the vocabulary.
seed_index = random.randint(0, vocabulary_size - 1)
sample_model(indices_char[seed_index], model_dec, 1000)

Seed:  �
Generated:  �ort the barenthe tont
bars
the the the bare
the the the ponthe ont
garthe
ses
the she the the the part
she partrenine the parenentunt
nenthe santhe brertinerer
bartrine
the sonthe the bare
the pars
dine
ban
the sre the bonthe the the the partintins
the ponthe the anesenine
partert parts
serthe panthe the ertres
the cre inens
the anensonthe the banthe
the sons
sre the the the sint the ant
srenthe part
fame
porert
the partont the sre
the the the senthe the oncere
the pane
the sayenre
the the parene
the se
part
contre the sinthe the inenthe the the entunthe sintre sint the bronsranthe the the the the surthe part the sintins
cort sint banthe the re
the ante
fonthe inthe the barthe the the prert the the the dale
the s onenens ors
nonthe the pare
the the anerenthestertre
the barthe the parte
the parthe ent santhe part
the barturthe payensoy
pre partrere
sunse the pare
the the partrens
inthe enthe parthe she
pares
frens
the sint the porer
the pre the whe the bartrentonin