# Recurrent Neural Network to generate (predict) text data using Keras
* LSTM (Long Short-Term Memory) Network
* Code based on this article https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470
* Data: full text of Alice in Wonderland taken from https://archive.org/stream/alicesadventures19033gut/19033.txt

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# SOURCE: https://archive.org/stream/alicesadventures19033gut/19033.txt
# Read the whole book and split it on single spaces. The pieces may still contain
# newlines/punctuation; the Tokenizer below takes care of those.
# The context manager closes the file handle as soon as the text has been read.
with open('Alice_in_Wonderland.txt') as book_file:
    text = book_file.read().split(' ')

# Use Tokenizer class to turn text into numeric data

In [13]:
# num_words=None keeps every word. Setting it to an int limits which word indices
# texts_to_sequences emits (most frequent words only), which allows a smaller
# label array and output layer.
tokenizer = Tokenizer(num_words=None, char_level=False, split=' ')
tokenizer.fit_on_texts(text)

# `text` is a list of strings, so the tokenizer returns one list of word indices
# per string (a piece can hold zero or several words once filtered characters
# such as newlines are turned into separators).
sequences = tokenizer.texts_to_sequences(text)

# Flatten the per-piece lists into a single stream of word indices.
flat_seq = [word_id for piece_ids in sequences for word_id in piece_ids]
seq = np.array(flat_seq, dtype=np.int32)

print(seq[:10])
print(len(seq), 'total words')

[ 10   8 336   4  81  28 337   6 246  58]
9732 total words


In [14]:
print(len(tokenizer.index_word), ' words in the dictionary') # number of unique words
print([tokenizer.index_word[i] for i in range(1,10)]) # first nine words in dictionary (indexing starts at 1; range(1,10) stops at index 9)

1522  words in the dictionary
['the', 'and', 'a', 'to', 'she', 'of', 'it', 'was', 'in']


# Prepare training data and labels
* Features (model input): 50 consecutive words from the text
* Labels (model output): The next word in that sequence

In [15]:
training_len = 50

# Slide a window over the text one word at a time: the training_len words
# before position i are the model input, the word at position i is the label.
window_ends = range(training_len, len(seq))
features = [seq[i-training_len : i] for i in window_ends]
labels = [seq[i] for i in window_ends]

# hold on to sorted data/labels for text generation after we train the model
X_sorted, y_sorted = np.array(features), np.array(labels)
# use shuffled data for training and testing
# (fixed random_state so the ordering is the same on every Restart & Run All)
X,y = shuffle(X_sorted, y_sorted, random_state=42)

print(X.shape, y.shape)
print(X[0], y[0])

# same sample again, decoded back into words
print([tokenizer.index_word[i] for i in X[0]], 
      tokenizer.index_word[y[0]])

(9682, 50) (9682,)
[  65  415  769  199    3  766   32    5  206    1   22  529   24    2
  222   91 1312    4   83    7 1313  106  724   33    1  281   10    8
    3   22  770   58 1314    1  333  114  246   20    3 1315    6    3
  283    3  471 1316   46    1  114  127] 759
['more', 'nor', 'less', 'than', 'a', 'pig', 'so', 'she', 'set', 'the', 'little', 'creature', 'down', 'and', 'felt', 'quite', 'relieved', 'to', 'see', 'it', 'trot', 'away', 'quietly', 'into', 'the', 'wood', 'alice', 'was', 'a', 'little', 'startled', 'by', 'seeing', 'the', 'cheshire', 'cat', 'sitting', 'on', 'a', 'bough', 'of', 'a', 'tree', 'a', 'few', 'yards', 'off', 'the', 'cat', 'only'] grinned


# One-Hot encoding of labels, Train/Test split
* One-hot scheme: 3 == [0 0 0 1 0 ... 0] 
* Binary array represents class probabilities (each unique word is a class)
* NOTE: the one-hot encoding will take up significantly more memory than the integer representation of labels. To reduce the size of the encoded labels, set the `num_words` parameter in the Tokenizer object and use that value as the number of classes (`tokenizer.index_word` always contains every word, regardless of `num_words`). This will also reduce the number of parameters in the final layer.

In [16]:
# + 1 because word dictionary indexing starts at 1
n_words = len(tokenizer.index_word) + 1

# one-hot encoding for labels: row k is all zeros except a 1 at column y[k]
y_onehot = to_categorical(y, n_words)
# sanity check: the hot column of every row must be the original integer label
assert (np.argmax(y_onehot, axis=1) == y).all()

# 80/20 train/test split; fixed random_state so the split (and therefore the
# reported test score) is comparable between runs
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7745, 50) (7745, 1523)
(1937, 50) (1937, 1523)


# Create Model: LSTM (Long Short-Term Memory) Network
* Wikipedia: https://en.wikipedia.org/wiki/Long_short-term_memory
* Mathematical explanation of LSTM: https://colah.github.io/posts/2015-08-Understanding-LSTMs/
* LSTMs by example: https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470

In [17]:
model = Sequential()

# https://keras.io/layers/embeddings/
# Maps each word index to a 100-dimensional vector that is learned during training.
# can reduce model complexity by pre-computing embedding matrix, setting trainable=False
# mask_zero=True tells the following layers to skip index 0 (no word uses it, the
# dictionary starts at 1), so no separate Masking layer is needed.
model.add(Embedding(input_dim=n_words, input_length=training_len, output_dim=100,
                    trainable=True, mask_zero=True))

# https://keras.io/layers/recurrent/
# return_sequences=False: only the state after the last word is passed on
model.add(LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))

# fully-connected layer with dropout
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# output layer - softmax activation for predicted class probabilities
model.add(Dense(n_words, activation='softmax'))


# Compile the model. Same loss function as with MNIST model (multi-class classification score)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" to the output
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           152300    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1523)              98995     
Total params: 297,695
Trainable params: 297,695
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# 20 epochs is only a start; good results need far more training.
# Pre-trained embeddings would speed training up, provided they were trained
# on a task similar to this one.
# Baseline for comparison: random guessing accuracy = 1/1523 = 0.00065659
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=20,
)

Train on 7745 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20

In [19]:
# Score the trained model on the held-out test split.
# NOTE: the progress bar printed by this call may show far too many '=' characters;
# that looks like a display glitch and does not affect the returned scores.
scores = model.evaluate(x=X_test, y=y_test, batch_size=64)



In [20]:
# scores is [loss, accuracy], matching the loss and metrics passed to model.compile
print(scores)

[6.034136129644154, 0.0588539]


In [30]:
# Spot check on one window: index of the most probable next word.
# The 7:8 slice keeps the leading batch axis that model.predict expects.
model.predict(X_sorted[7:8])[0].argmax()

1

# Use trained model to predict next word

In [24]:
# Random starting window, leaving room for training_len follow-up windows.
rand_ind = np.random.randint(len(X_sorted)-training_len)

seed_seq = X_sorted[rand_ind]
actual_seq = y_sorted[rand_ind : rand_ind+training_len]

# Window rand_ind+i ends right before the word actual_seq[i], so predicting on the
# training_len consecutive windows starting at rand_ind yields one next-word guess
# per position. Each guess is made from the true preceding words, not from earlier
# guesses. All windows go through the model in a single batch, and argmax keeps
# the result as integer word indices.
next_windows = X_sorted[rand_ind : rand_ind+training_len]
pred_seq = np.argmax(model.predict(next_windows), axis=1)

# Class 0 exists in the softmax output but has no dictionary entry (word indices
# start at 1), so fall back to a placeholder instead of raising KeyError.
pred_words = [tokenizer.index_word.get(int(i), '<unknown>') for i in pred_seq]

print('Seed Sequence: ')
print('\"' + ' '.join([tokenizer.index_word[i] for i in seed_seq]) + '\"')
print('\nPredicted next 50 words: ')
print('\"' + ' '.join(pred_words) + '\"')
print('\nActual next 50 words: ')
print('\"' + ' '.join([tokenizer.index_word[i] for i in actual_seq]) + '\"')

Seed Sequence: 
"it turned a corner oh my ears and whiskers how late it's getting she was close behind it when she turned the corner but the rabbit was no longer to be seen she found herself in a long low hall which was lit up by a row of lamps hanging"

Predicted next 50 words: 
"the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the"

Actual next 50 words: 
"from the roof there were doors all 'round the hall but they were all locked and when alice had been all the way down one side and up the other trying every door she walked sadly down the middle wondering how she was ever to get out again suddenly she"
