In [1]:
import json
import random
import numpy as np

def _read_lowercased(path):
    """Return the full contents of the text file at `path`, lower-cased."""
    # the context manager closes the file handle as soon as it has been read
    # NOTE(review): this relies on the platform default encoding, as before;
    # pass encoding=... explicitly once the corpus encoding is confirmed
    with open(path, 'r') as f:
        return f.read().lower()


# load up our text
zhuangzi = _read_lowercased('../data/zhuangzi.txt')
mencius = _read_lowercased('../data/mencius.txt')
confucius = _read_lowercased('../data/confucius.txt')

text = '\n'.join([zhuangzi, mencius, confucius])


# extract all (unique) characters
# these are our "categories" or "labels"
# (sorted, because iterating a set of strings gives a different order on
# every run, which would make any mapping built from `chars` irreproducible)
chars = sorted(set(text))

# set a fixed vector size
# so we look at specific windows of characters
max_len = 32

Now we'll define our RNN. Keras makes this trivial:

In [2]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout

# Two stacked LSTM layers, each followed by dropout, feeding a softmax
# classifier with one output per character.
layer_stack = [
    # the first LSTM returns its whole output sequence for the next LSTM
    LSTM(512, return_sequences=True, input_shape=(max_len, len(chars))),
    Dropout(0.2),
    # the second LSTM returns only its final output
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Using Theano backend.


We're framing our task as a classification task. Given a sequence of characters, we want to predict the next character. We equate each character with some label or category (e.g. "a" is 0, "b" is 1, etc).

We use the softmax activation function on our output layer - this function is used for categorical output. It turns the output into a probability distribution over the categories (i.e. it makes the values the network outputs sum to 1). So the network will essentially tell us how strongly it feels about each character being the next one.

Categorical cross-entropy is the standard loss function for multiclass classification, where each example belongs to exactly one category.

We use dropout here to prevent overfitting - we don't want the network to just return things already in the text, we want it to have some wiggle room and create novelty! Dropout is a technique where, in training, some percent (here, 20%) of random neurons of the associated layer are "turned off" for each training step. This prevents overfitting by preventing the network from relying on particular neurons.

That's it for the network architecture!

To train, we have to do some additional preparation. We need to chop up the text into character sequences of the length we specified (`max_len`) - these are our training inputs. We match them with the character that immediately follows each sequence. These are our expected training outputs.

For example, say we have the following text (this quote is from Zhuang Zi). With `max_len=20`, we could manually create the first couple training examples like so:

In [3]:
example = "The fish trap exists because of the fish. Once you've gotten the fish you can forget the trap. The rabbit snare exists because of the rabbit. Once you've gotten the rabbit, you can forget the snare. Words exist because of meaning. Once you've gotten the meaning, you can forget the words. Where can I find a man who has forgotten words so that I may have a word with him?"

# Every window is 20 characters wide and starts 3 characters after the
# previous one (the stride of 3 is arbitrary and can be varied). The expected
# output is the single character that follows the window.

# window starting at index 0
input_1, true_output_1 = example[:20], example[20]
# >>> 'The fish trap exists'
# >>> ' '
print(input_1, true_output_1, sep='\n')

# window starting at index 3 (note that it begins with a space)
input_2, true_output_2 = example[3:23], example[23]
# >>> ' fish trap exists be'
# >>> 'c'
print(input_2, true_output_2, sep='\n')

# window starting at index 6
input_3, true_output_3 = example[6:26], example[26]
# >>> 'sh trap exists becau'
# >>> 's'

# etc

The fish trap exists
 
 fish trap exists be
c


In [2]:
# Generalizing the manual example: slide a max_len-wide window across the
# corpus, advancing `step` characters each time, and pair every window with
# the single character that immediately follows it.
step = 3
window_starts = range(0, len(text) - max_len, step)
inputs = [text[start:start + max_len] for start in window_starts]
outputs = [text[start + max_len] for start in window_starts]

print(inputs[0])
print(outputs[0])

when the shoe fits, the foot is 
f


In [3]:
# We also need to map each character to a label and create a reverse mapping to use later:
char_labels = {ch: i for i, ch in enumerate(chars)}
labels_char = {i: ch for i, ch in enumerate(chars)}

# load previous ones
# (these replace the freshly built mappings above; the files are opened with
# context managers so the handles are closed once parsed)
with open('../data/char_labels.json', 'r') as f:
    char_labels = json.load(f)
with open('../data/labels_char.json', 'r') as f:
    # JSON object keys are always strings, so make sure labels stay ints
    labels_char = {int(label): char for label, char in json.load(f).items()}

print(char_labels)
print('---')
print(labels_char)

{'7': 2, 'k': 56, '–': 19, '’': 3, 'v': 11, '*': 52, '0': 9, 'e': 0, '&': 43, 'j': 36, 'a': 42, ';': 18, '.': 15, 'r': 47, '—': 28, 'o': 37, ':': 27, 'f': 35, 'w': 54, '6': 1, '4': 21, ']': 45, ')': 58, 'l': 16, 'i': 49, '“': 4, 'y': 7, '?': 40, '\n': 50, '"': 5, '9': 31, 'z': 51, 'q': 57, 'n': 20, 'x': 55, 'd': 24, '/': 22, '”': 17, ' ': 6, 'm': 46, 'u': 8, '3': 10, '-': 26, '[': 14, 't': 29, '1': 12, '5': 33, ',': 30, 's': 32, '(': 44, "'": 48, '2': 53, '‘': 41, 'b': 23, '!': 39, 'h': 25, 'p': 38, 'c': 34, 'g': 13}
---
{0: 'e', 1: '6', 2: '7', 3: '’', 4: '“', 5: '"', 6: ' ', 7: 'y', 8: 'u', 9: '0', 10: '3', 11: 'v', 12: '1', 13: 'g', 14: '[', 15: '.', 16: 'l', 17: '”', 18: ';', 19: '–', 20: 'n', 21: '4', 22: '/', 23: 'b', 24: 'd', 25: 'h', 26: '-', 27: ':', 28: '—', 29: 't', 30: ',', 31: '9', 32: 's', 33: '5', 34: 'c', 35: 'f', 36: 'j', 37: 'o', 38: 'p', 39: '!', 40: '?', 41: '‘', 42: 'a', 43: '&', 44: '(', 45: ']', 46: 'm', 47: 'r', 48: "'", 49: 'i', 50: '\n', 51: 'z', 52: '*', 53: 

Now we can start constructing our numerical input 3-tensor and output matrix. Each input example (i.e. a sequence of characters) is turned into a matrix of one-hot vectors; that is, a bunch of vectors where the index corresponding to the character is set to 1 and all the rest are set to zero.

For example, if we have the following:

In [6]:
# Illustration only: a hand-written one-hot encoding of a tiny string.
# assuming max_len = 7
# so our examples have 7 characters
example = 'cab dab'
# toy character -> label mapping for the five distinct characters in `example`
example_char_labels = {
    'a': 0,
    'b': 1,
    'c': 2,
    'd': 3,
    ' ' : 4
}

# matrix form
# only five characters, so the vectors only need to have five components
# one row per character of `example`, in order; the column holding the 1 is
# that character's label in `example_char_labels`
[
    [0, 0, 1, 0, 0], # c
    [1, 0, 0, 0, 0], # a
    [0, 1, 0, 0, 0], # b
    [0, 0, 0, 0, 1], # (space)
    [0, 0, 0, 1, 0], # d
    [1, 0, 0, 0, 0], # a
    [0, 1, 0, 0, 0]  # b
]

[[0, 0, 1, 0, 0],
 [1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0]]

In [4]:
def text_to_matrix(text):
    """One-hot encode a string as a single-example input tensor.

    Args:
        text: a string of exactly `max_len` characters, each of which must be
            a key of `char_labels`.

    Returns:
        A boolean array of shape `(1, max_len, len(chars))` in which entry
        `[0, i, char_labels[text[i]]]` is True and every other entry is False.

    Raises:
        AssertionError: if `len(text) != max_len`.
        KeyError: if a character of `text` is missing from `char_labels`.
    """
    assert len(text) == max_len, \
        'expected {} characters, got {}'.format(max_len, len(text))
    # the builtin `bool` is used as the dtype: the `np.bool` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24
    X = np.zeros((1, max_len, len(chars)), dtype=bool)
    for i, char in enumerate(text):
        X[0, i, char_labels[char]] = True
    return X
    
# sanity check: print the label for 't', then show the one-hot row for the
# first character of a 32-character sentence (also 't'); the single 1 in that
# row should sit at the printed index
print(char_labels['t'])
text_to_matrix('this is a testing input sentence').astype(int)[0][0]

29


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

That matrix represents a single training example, so we have a stack of those matrices (hence a 3-tensor). The outputs for each example are each a one-hot vector. With that in mind:

In [5]:
# using bool to reduce memory usage
# (the builtin `bool` is the dtype here: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24)
X = np.zeros((len(inputs), max_len, len(chars)), dtype=bool)
y = np.zeros((len(inputs), len(chars)), dtype=bool)

# set the appropriate indices to 1 in each one-hot vector:
# X[i, t] encodes the t-th character of the i-th window,
# y[i] encodes the character that follows the i-th window
for i, (example, next_char) in enumerate(zip(inputs, outputs)):
    for t, char in enumerate(example):
        X[i, t, char_labels[char]] = 1
    y[i, char_labels[next_char]] = 1

Now that we have our training data, we can start training. Keras also makes this easy:

In [None]:
# more epochs is usually better, but training can be very slow if not on a GPU
#epochs = 10
#model.fit(X, y, batch_size=128, nb_epoch=epochs)

It's much more fun to see your network's ramblings as it's training, so let's write a function to produce text from the network:

In [9]:
def generate(temperature=0.35, seed=None, predicate=lambda x: len(x) < 100):
    """Generate text one character at a time from the network.

    Args:
        temperature: sampling temperature, passed through to `sample`.
        seed: optional text to start from; it must contain at least `max_len`
            characters. When it is longer, only its last `max_len` characters
            are fed to the network, but the whole seed is kept in the result.
            If None, a random `max_len`-character chunk of `text` is used.
        predicate: called with the text produced so far (seed included);
            generation continues for as long as it returns a truthy value.

    Returns:
        The seed followed by the generated characters.

    Raises:
        ValueError: if `seed` is given but is shorter than `max_len`.
    """
    if seed is None:
        # if no seed text is specified, randomly select a chunk of text
        start_idx = random.randint(0, len(text) - max_len - 1)
        seed = text[start_idx:start_idx + max_len]
    elif len(seed) < max_len:
        # ValueError is a subclass of Exception, so existing handlers still work
        raise ValueError('Seed text must be at least {} chars long'.format(max_len))

    generated = seed
    # the network input is always exactly the last max_len characters
    sentence = seed[-max_len:]

    while predicate(generated):
        # generate the input tensor
        # from the last max_len characters generated so far
        window = text_to_matrix(sentence)

        # this produces a probability distribution over characters
        probs = model.predict(window, verbose=0)[0]

        # sample the character to use based on the predicted probabilities
        next_idx = sample(probs, temperature)
        next_char = labels_char[next_idx]

        generated += next_char
        # slide the window forward by one character
        sentence = sentence[1:] + next_char
    return generated

def sample(probs, temperature):
    """samples an index from a vector of probabilities

    Each probability is re-weighted as `p ** (1 / temperature)`, the weights
    are renormalized to sum to 1, and one index is drawn from the result.

    Args:
        probs: 1-D array-like of probabilities.
        temperature: positive number; values below 1 sharpen the distribution
            and values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).
    """
    # work in float64: renormalizing float32 network output can leave a sum
    # slightly above 1, which np.random.multinomial rejects
    probs = np.asarray(probs, dtype=np.float64)
    # zero probabilities become -inf here and end up with zero weight
    with np.errstate(divide='ignore'):
        scaled = np.log(probs) / temperature
    # subtracting the maximum leaves the normalized result unchanged but stops
    # every weight from underflowing to 0 (a 0/0) at low temperatures
    weights = np.exp(scaled - np.max(scaled))
    a = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, a, 1))

The temperature controls how random we want the network to be. Lower temperatures favor more likely values, whereas higher temperatures introduce more and more randomness. At a high enough temperature, every character becomes roughly equally likely, so values are effectively chosen uniformly at random.

With this generation function we can modify how we train the network so that we see some output at each step:

In [10]:
# training can take a looong time, so load pre-trained weights
# NOTE(review): presumably these weights were saved from the same architecture
# and character-label mapping used in this notebook -- confirm before reusing
model.load_weights('../data/rnn_weights.h5')

In [11]:
# Continue training from the current weights, one pass over the data at a
# time, printing sample text after every pass.
epochs = 1
preview_temperatures = [0.2, 0.5, 1., 1.2]

for epoch in range(epochs):
    print('epoch', epoch)

    # a single Keras epoch per call, because the outer loop does the counting
    model.fit(X, y, batch_size=128, nb_epoch=1)

    # preview what the network produces at several temperatures
    for temp in preview_temperatures:
        print('\n\ttemperature:', temp)
        print(generate(temperature=temp))

epoch 0
Epoch 1/1
  384/53743 [..............................] - ETA: 2961s - loss: 0.0222

KeyboardInterrupt: 

In [12]:
# Predict the next-character distribution for one hand-picked seed.
# Dedicated names are used so that neither the builtin `input` nor the
# training tensor `X` is overwritten; otherwise re-running the training cell
# after this one would call model.fit on this single example.
seed_text = 'when the shoe fits, the foot is '
X_seed = text_to_matrix(seed_text)
probs = model.predict(X_seed, verbose=0)[0]
print(probs)

[  6.52413057e-09   1.41806213e-05   1.73697881e-05   2.41744306e-06
   4.69948718e-05   3.51794938e-06   2.91247183e-04   1.60155323e-04
   6.60652063e-07   3.33907446e-05   5.23559902e-06   3.95067490e-09
   2.14113843e-05   7.86875989e-06   1.55424950e-05   1.32164061e-07
   4.34719212e-03   1.51409549e-05   1.21332732e-07   5.33563480e-06
   1.59135489e-05   1.71539232e-05   2.08672573e-05   1.04889465e-10
   2.60246091e-09   5.29083773e-04   4.52232143e-06   5.61972920e-05
   2.17745910e-05   2.73212891e-06   1.52345869e-08   1.66212540e-05
   9.49820533e-05   2.79653141e-05   1.14836304e-07   9.93711412e-01
   3.02455883e-05   2.44788880e-05   4.16842922e-05   3.94904964e-06
   2.32454950e-05   1.58153871e-05   1.64365574e-05   1.56704482e-05
   7.28394707e-06   2.85933220e-05   9.63689644e-08   4.75186152e-06
   1.32395485e-06   1.00007048e-06   4.71413358e-07   3.11559506e-06
   7.88980396e-05   2.33535739e-05   9.94135462e-06   7.72734202e-05
   4.52672703e-06   7.75303779e-05

In [14]:
# see the probability of an individual character
# (look up the character's label, then index the predicted distribution with it)
label = char_labels['f']
probs[label]

0.99371141195297241

In [15]:
# index (label) of the most probable next character
idx = np.argmax(probs)
print(idx)

35


In [16]:
# map the label index back to its character
labels_char[idx]

'f'

In [17]:
# sample text at temperature 1; with no seed given, generate() starts from a
# random chunk of the corpus
generate(temperature=1)

'meanness. but meanness is a far lost soutten to be perfole in what is real, searmen –nes deal with t'

In [None]:
# save these for later
#model.save_weights('../data/rnn_weights.h5', overwrite=True)
#json.dump(char_labels, open('../data/char_labels.json', 'w'))
#json.dump(labels_char, open('../data/labels_char.json', 'w'))