In [1]:
import os

import numpy as np
from keras.layers import Activation, Dense
from keras.layers.recurrent import SimpleRNN
from keras.models import Sequential
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


In [6]:
# Location of the "Alice in Wonderland" text from the NLTK Gutenberg corpus.
# Set the ALICE_PATH environment variable to point at a copy elsewhere; when it
# is unset the previous hard-coded location is used, so existing setups keep
# working unchanged.
ALICE_PATH = os.environ.get(
    'ALICE_PATH',
    '/Users/dsp/nltk_data/corpora/gutenberg/carroll-alice.txt',
)

In [7]:
# Read the corpus, normalising every line (trim whitespace + lower-case),
# discarding lines that are empty after trimming, and joining the remainder
# into one long string separated by single spaces.
with open(ALICE_PATH) as corpus_file:
    normalised = (raw.strip().lower() for raw in corpus_file)
    lines = [ln for ln in normalised if ln]
    text = ' '.join(lines)

In [8]:
# Peek at the first 50 characters of the joined, lower-cased text.
text[:50]

"[alice's adventures in wonderland by lewis carroll"

In [21]:
# Distinct characters occurring in the corpus; this is the model's vocabulary.
chars = set(text)
n_chars = len(chars)
print(n_chars)

# Assign indices in sorted (code point) order rather than raw set order.
# String hashing is randomized per interpreter process, so iterating the set
# directly can yield a different character -> index mapping after every kernel
# restart, which makes encoded arrays and trained weights from one session
# meaningless in the next.
char2index = {c: i for i, c in enumerate(sorted(chars))}
# character -> integer index (smallest code point gets index 0)

index2char = {i: c for c, i in char2index.items()}
# integer index -> character (exact inverse of char2index)

46


In [10]:
SEQ_LEN = 10  # number of characters in each input window
STEP = 1      # offset between the starts of consecutive windows

In [11]:
# Slide a SEQ_LEN-character window over the text in increments of STEP.
# Each window is one training input; the character immediately following the
# window is its label.
window_starts = range(0, len(text) - SEQ_LEN, STEP)
input_chars = [text[start:start + SEQ_LEN] for start in window_starts]
label_chars = [text[start + SEQ_LEN] for start in window_starts]

In [12]:
# Show the first three (input window -> next character) training pairs.
for sample_no in range(3):
    print(f'{input_chars[sample_no]} -> {label_chars[sample_no]}')

[alice's a -> d
alice's ad -> v
lice's adv -> e


Each sample in X is a one-hot matrix of shape
SEQ_LEN (10) x n_chars (46 distinct characters: letters, punctuation, etc.),
i.e. 10 x 46 = 460 cells.

In [13]:
# One-hot encode the training data.
#   X[i, j, k] is set when the j-th character of input window i is the
#              character whose index is k.
#   y[i, k]    is set when the label (next character) of window i has index k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros([len(input_chars), SEQ_LEN, n_chars], dtype=bool)
y = np.zeros([len(input_chars), n_chars], dtype=bool)
for i, input_char in enumerate(input_chars):
    for j, ch in enumerate(input_char):
        X[i, j, char2index[ch]] = 1
    y[i, char2index[label_chars[i]]] = 1

In [27]:
HIDDEN_SIZE = 128      # first positional argument (units) of the SimpleRNN layer
BATCH = 128            # batch_size passed to model.fit
ITER = 25              # number of train-then-generate rounds
EPOCHS = 1             # epochs per model.fit call, i.e. per round
PREDS_PER_EPOCH = 100  # characters generated after each round

In [28]:
# Character-level language model: a single SimpleRNN reads a
# SEQ_LEN x n_chars one-hot window and returns only its last output
# (return_sequences=False); a dense layer followed by softmax turns that into
# a probability distribution over the n_chars vocabulary.
recurrent_layer = SimpleRNN(
    HIDDEN_SIZE,
    return_sequences=False,
    input_shape=[SEQ_LEN, n_chars],
    unroll=True,
)
model = Sequential([
    recurrent_layer,
    Dense(n_chars),
    Activation('softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [29]:
# Train for ITER rounds. After each round, pick a random seed window from the
# corpus and let the model extend it by PREDS_PER_EPOCH characters so progress
# can be judged by eye.
# Each nested loop uses its own variable name so that no level shadows another.
for iteration in range(ITER):
    print('=' * 50)
    print(f'Iteration {iteration + 1}')
    model.fit(X, y, batch_size=BATCH, epochs=EPOCHS)

    test_idx = np.random.randint(len(input_chars))
    test_chars = input_chars[test_idx]
    print(f'Generating from seed: {test_chars}')
    print(test_chars, end='')
    for step in range(PREDS_PER_EPOCH):
        # One-hot encode the current window as a batch containing one sample.
        X_test = np.zeros((1, SEQ_LEN, n_chars))
        for pos, ch in enumerate(test_chars):
            X_test[0, pos, char2index[ch]] = 1
        pred = model.predict(X_test, verbose=0)[0]
        # Greedy decoding: always emit the highest-probability character.
        y_pred = index2char[np.argmax(pred)]
        print(y_pred, end='')
        # Slide the window: drop the oldest character, append the prediction.
        test_chars = test_chars[1:] + y_pred
    print()
print()

Iteration 1
Epoch 1/1
Generating from seed: iculty was
iculty wast and and alice aad the the wast on and and alice aad the the wast on and and alice aad the the wast
Iteration 2
Epoch 1/1
Generating from seed: half my pl
half my plong the hand and then the hat she hand and then the hat she hand and then the hat she hand and then 
Iteration 3
Epoch 1/1
Generating from seed:  she remar
 she remars all the the hat sal cous and all the the hat sal cous and all the the hat sal cous and all the the
Iteration 4
Epoch 1/1
Generating from seed:  the hatte
 the hatter she was and theard thear sheard thear sheard thear sheard thear sheard thear sheard thear sheard t
Iteration 5
Epoch 1/1
Generating from seed: iness?' th
iness?' the she she said to she she said to she she said to she she said to she she said to she she said to sh
Iteration 6
Epoch 1/1
Generating from seed: head over 
head over hel she wald the gryphon, who sald the malt be which laster sout to the dolles in the dild an the do
Iter