In [156]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D
from keras.layers import LSTM
from keras.datasets import imdb, reuters
from keras.utils import to_categorical
import matplotlib.pyplot as plt
%matplotlib inline

In [157]:
words = []
with open('data/alice-in-wonderland.txt') as f:
    for line in f:
        line = line.strip().lower()
        if line:
            words = words + line.split(' ')

In [158]:
words[:10]

['i--down',
 'the',
 'rabbit-hole',
 'alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired']

In [159]:
word_to_index = {w:i for i, w in enumerate(set(words))}
index_to_word = {v:k for k,v in word_to_index.items()}

In [160]:
list(word_to_index.items())[:10]

[('', 0),
 ('turkey,', 1),
 ('party.', 2),
 ('trouble', 3),
 ('an', 4),
 ("'tis", 5),
 ('dead', 6),
 ("king's", 7),
 ('his', 8),
 ("we've", 9)]

In [161]:
list(index_to_word.items())[:10]

[(0, ''),
 (1, 'turkey,'),
 (2, 'party.'),
 (3, 'trouble'),
 (4, 'an'),
 (5, "'tis"),
 (6, 'dead'),
 (7, "king's"),
 (8, 'his'),
 (9, "we've")]

In [162]:
num_docs = int(len(words)/10)
num_docs

971

In [163]:
def doc_generator(docs, offsets):
    X_w = []
    Y_w = []
    for offset in range(offsets):
        for doc in range(docs):
            x = words[doc*10+offset:doc*10+offset+10]
            y = x.pop()
            X_w.append(x)
            Y_w.append(y)        
    return (X_w, Y_w,
        np.array([[word_to_index[word] for word in doc] for doc in X_w]), 
        to_categorical(np.array([word_to_index[word] for word in Y_w])))

In [164]:
X_w, Y_w, X_i, Y_i = doc_generator(950, 50)

In [165]:
len(X_w)

47500

In [166]:
X_w[:10]

[['i--down',
  'the',
  'rabbit-hole',
  'alice',
  'was',
  'beginning',
  'to',
  'get',
  'very'],
 ['of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank,', 'and'],
 ['having', 'nothing', 'to', 'do.', 'once', 'or', 'twice', 'she', 'had'],
 ['into', 'the', 'book', 'her', 'sister', 'was', 'reading,', 'but', 'it'],
 ['no', 'pictures', 'or', 'conversations', 'in', 'it,', '"and', 'what', 'is'],
 ['use',
  'of',
  'a',
  'book,"',
  'thought',
  'alice,',
  '"without',
  'pictures',
  'or'],
 ['so', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(as'],
 ['as', 'she', 'could,', 'for', 'the', 'day', 'made', 'her', 'feel'],
 ['sleepy',
  'and',
  'stupid),',
  'whether',
  'the',
  'pleasure',
  'of',
  'making',
  'a'],
 ['would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and']]

In [167]:
Y_w[:10]

['tired',
 'of',
 'peeped',
 'had',
 'the',
 'conversations?"',
 'well',
 'very',
 'daisy-chain',
 'picking']

In [168]:
X_i[:10]

array([[2419, 1346,  728, 1379, 1069, 1196, 1935, 2091,  909],
       [1565, 2284, 2300, 1434,  632,  804, 1346,  201, 1895],
       [2074,  666, 1935, 2224,  345,  379, 1573, 2257, 1113],
       [ 603, 1346,  812, 1434,  632, 1069,  735, 2447, 2340],
       [1241, 1463,  379, 2277,  318, 2096, 1490, 1078, 2444],
       [ 683, 1565,  283,  528,  537, 1744, 1493, 1463,  379],
       [ 320, 2257, 1069, 2073,  318, 1434,  885,  996, 1191],
       [ 552, 2257, 1738, 1739, 1346, 1121, 1997, 1434,  717],
       [ 482, 1895, 2330, 1238, 1346, 1267, 1565, 1642,  283],
       [1754, 1399,  380, 1346,    3, 1565, 1035,  293, 1895]])

In [169]:
Y_i[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [170]:
Y_i.shape

(47500, 2464)

In [171]:
unique_words = len(word_to_index.keys())
unique_words

2464

In [195]:
model = Sequential()
model.add(Embedding(unique_words, 128))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(unique_words, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 128)         315392    
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_5 (Dense)              (None, 2464)              317856    
Total params: 764,832
Trainable params: 764,832
Non-trainable params: 0
_________________________________________________________________


In [196]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_i, Y_i,
          batch_size=32,
          epochs=10,
          verbose=1, validation_split=0.3, shuffle=True)

Train on 33250 samples, validate on 14250 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [197]:
model.predict_classes([1,2,3])

array([1346, 1346, 1346])

In [198]:
nums = [1,2,3,4,5,6,7,8,9,10,11,12]
preds = model.predict_classes(nums)
print([index_to_word[n] for n in nums])
print([index_to_word[n] for n in preds])

['turkey,', 'party.', 'trouble', 'an', "'tis", 'dead', "king's", 'his', "we've", 'dear?"', 'sulky', 'off,']
['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']


In [194]:
model.input

<tf.Tensor 'embedding_4_input:0' shape=(?, ?) dtype=float32>