In [None]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D
from keras.layers import LSTM
from keras.datasets import imdb, reuters
from keras.utils import to_categorical, multi_gpu_model
import matplotlib.pyplot as plt
%matplotlib inline

# Getting the data

***
***
***

In [None]:
# Read the whole book into one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (open() otherwise falls back to it).
with open('alice-in-wonderland.txt', encoding='utf-8') as f:
    book = f.read()

In [None]:
# Tokenise the raw text into a flat list of words with Keras' helper;
# every later cell works on this `words` list.
words = text.text_to_word_sequence(book)

In [None]:
# Peek at the first 300 characters of the raw text.
book[:300]

In [None]:
# First ten tokens produced by the tokeniser.
words[:10]

In [None]:
# Corpus size and vocabulary size.
n_words = len(words)
unique_words = set(words)
n_unique_words = len(unique_words)

print('unique words:', n_unique_words)
print('total words:', n_words)

In [None]:
# Word <-> integer lookup tables.
# `unique_words` is a set, and the iteration order of a set of strings changes
# from one interpreter run to the next (string hashing is randomised), so
# enumerating it directly would give every kernel restart a different mapping.
# Sorting first makes the indices stable, which is required for a saved model
# or any recorded index to stay meaningful.
word_to_index = {w: i for i, w in enumerate(sorted(unique_words))}
index_to_word = {i: w for w, i in word_to_index.items()}

In [None]:
# Sample of (word, index) pairs.
list(word_to_index.items())[:10]

In [None]:
# Sample of (index, word) pairs from the reverse mapping.
list(index_to_word.items())[:10]

In [None]:
def doc_generator(docs, offsets):
    """Build (context, next word) training pairs from the global ``words`` list.

    For every ``offset`` in ``range(offsets)`` and every ``doc`` in
    ``range(docs)`` a 10-word window starting at ``doc*10 + offset`` is cut
    from ``words``. The first 9 words are the context and the 10th is the
    word to predict. Windows that would run past the end of ``words`` are
    skipped, so every context has exactly 9 words.

    Relies on the notebook-level globals ``words`` and ``word_to_index``.

    Args:
        docs: number of consecutive 10-word windows to take per offset.
        offsets: number of start offsets (0 .. offsets-1) to use.

    Returns:
        A 4-tuple ``(X_w, Y_w, X_i, Y_i)``:
        ``X_w`` list of 9-word context lists,
        ``Y_w`` list of target words,
        ``X_i`` array of the contexts as word indices,
        ``Y_i`` one-hot encoded targets with one column per entry of
        ``word_to_index``.
    """
    window = 10
    X_w = []
    Y_w = []
    for offset in range(offsets):
        for doc in range(docs):
            start = doc * window + offset
            x = words[start:start + window]
            if len(x) < window:
                # Past the end of the text: later windows for this offset
                # would be short (or empty) as well.
                break
            y = x.pop()
            X_w.append(x)
            Y_w.append(y)
    return (
        X_w,
        Y_w,
        np.array([[word_to_index[word] for word in doc] for doc in X_w]),
        # Fix the one-hot width to the full vocabulary; otherwise it is
        # inferred from the largest target index that happens to occur and
        # may be narrower than the model's output layer.
        to_categorical(
            np.array([word_to_index[word] for word in Y_w]),
            num_classes=len(word_to_index),
        ),
    )

In [None]:
# Generate many 9-word contexts together with the word that follows each one.
# Windows start every 10 words, so offsets 0-9 give distinct windows; raising
# the "4" beyond 10 mostly re-creates windows already produced by smaller offsets.
X_w, Y_w, X_i, Y_i = doc_generator(950, 4)

In [None]:
# Total number of generated documents (context windows).
len(X_w)

In [None]:
# First ten contexts, as words.
X_w[:10]

In [None]:
# The word that follows each of those first ten contexts.
Y_w[:10]

In [None]:
# The same first ten contexts, encoded as word indices.
X_i[:10]

In [None]:
# First ten targets, one-hot encoded.
Y_i[:10]

In [None]:
# (number of samples, number of classes).
# NOTE(review): the class dimension should equal n_unique_words, the width of
# the model's softmax layer -- confirm here before training.
Y_i.shape

# Building the model

***
***
***

In [None]:
# Next-word model: embed each of the 9 context word indices into 128
# dimensions, feed the sequence through one LSTM layer, then score every
# vocabulary word with a softmax.
model = Sequential([
    Embedding(n_unique_words, 128, input_length=9),
    LSTM(128, dropout=0.5, recurrent_dropout=0.5),
    Dense(n_unique_words, activation='softmax'),
])
model.summary()

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# ---------------------------------------------------
# Raise `epochs` below to reach a higher accuracy.
# ---------------------------------------------------
# 30% of the samples are held out for validation.
history = model.fit(
    X_i,
    Y_i,
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.3,
    shuffle=True,
)

In [None]:
# Training curves: loss on the left y-axis, accuracy on the right y-axis.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 7))
ax1.plot(history.epoch, history.history['loss'], marker='^', color='purple', label='train loss')
ax1.set_xlabel('epochs')
ax1.set_ylabel('loss', color='purple')
ax1.tick_params('y', colors='purple')

# Older Keras releases record metrics=['accuracy'] under 'acc'/'val_acc',
# newer ones under 'accuracy'/'val_accuracy'; accept whichever is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Both accuracy curves share one twin axis, drawn explicitly on that axis
# rather than through the implicit "current axes" of plt.plot.
ax2 = ax1.twinx()
ax2.plot(history.epoch, history.history[acc_key], marker='+', color='green', label='train')
ax2.plot(history.epoch, history.history['val_' + acc_key], marker='*', color='red', label='validation')
ax2.set_ylim(0, 1)
ax2.set_ylabel('accuracy')

fig.suptitle('alice in wonderland');
fig.legend();

In [None]:
# Persist the trained model to disk.
# NOTE(review): the file stores only the network; reusing it also needs the
# same word_to_index mapping that was used for training.
model.save('alice.h5')

# Model predictions

***
***
***

In [None]:
# Use the first training context as the example input.
# Hard-coded indices copied from an earlier run are only meaningful for the
# word_to_index mapping of that run, so the sample is taken from the data
# built in this session instead.
sample = X_i[0].tolist()

In [None]:
# Decode the sample indices back into a readable phrase.
' '.join(index_to_word[idx] for idx in sample)

In [None]:
# The model expects a batch, so the single context gets a leading batch axis.
pred = model.predict(np.asarray(sample)[np.newaxis, :])
pred

In [None]:
# Index of the highest-scoring word (look it up in index_to_word to read it).
pred.argmax()

In [None]:
# Indices of the five highest-scoring words for the single sample, best first.
top5 = np.argsort(pred[0])[-1:-6:-1]
top5

In [None]:
# Translate the top-5 indices back into words.
list(map(index_to_word.__getitem__, top5))