In [20]:
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from nltk import ngrams
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def tokenizeFile(fileName: str, encoding: str = "ISO-8859-1") -> list:
    '''Read a text file and split its contents into word tokens.

    The whole file is read into memory and handed to NLTK's
    ``word_tokenize``.

    Args:
        fileName: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            "ISO-8859-1", the encoding this function previously hard-coded.

    Returns:
        A list with the tokens produced by ``word_tokenize``, in file order.
    '''
    with open(fileName, 'r', encoding=encoding) as file:
        text = file.read()
    return list(word_tokenize(text))

In [34]:
# Keep only 1/SUBSAMPLE_DIVISOR of the book so everything fits in memory.
SUBSAMPLE_DIVISOR = 50
# Fixed seed so the shuffle below produces the same sample order on every run;
# without it the data split changes each time the notebook is executed.
SEED = 42

tokens_all = tokenizeFile("TheLordOfTheRings_Book1.txt")
tokens = tokens_all[:len(tokens_all) // SUBSAMPLE_DIVISOR]
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# Change this parameter to decrease or increase the size of the training samples
N = 30
# Every run of N consecutive tokens becomes one sample (as a tuple of tokens).
ngrams_tokens = list(ngrams(tokens, N))
np.random.seed(SEED)
np.random.shuffle(ngrams_tokens)
print('Total ngrams: %d' % len(ngrams_tokens))

# Combine tokens into training sample phrases
ngrams_samples = [" ".join(x) for x in ngrams_tokens]
print("Total Samples: %d" % len(ngrams_samples))

Total Tokens: 4435
Unique Tokens: 1261
Total ngrams: 4406
Total Samples: 4406


Assign a unique ID to each token and make an inverted index so we can look up terms easily later.

Determine the vocabulary size for the Embedding layer.

In [35]:
# Build the vocabulary from the sample phrases. Only tabs and newlines are
# filtered out, so punctuation tokens are kept as vocabulary entries.
tokenizer = Tokenizer(filters='\t\n')
tokenizer.fit_on_texts(ngrams_samples)

# Encode every sample phrase as a row of integer word ids.
encoded_samples = tokenizer.texts_to_sequences(ngrams_samples)
sequencies = np.array(encoded_samples)

# One slot more than the number of known words, leaving room for index 0.
vocab_size = 1 + len(tokenizer.word_index)

print("Sequences Shape: %s" % (sequencies.shape,))
print("Vocabulary size: %d" % vocab_size)

Sequences Shape: (4406, 30)
Vocabulary size: 1190


Split the samples into training, validation, and testing sets.

Split each sample into an input and an expected output: the input is every token except the last, and the output is the last token.

In [36]:
from numpy import array as ar


def _split_input_output(samples, num_classes):
    '''Split encoded samples into model inputs and one-hot targets.

    Args:
        samples: 2-D integer array with one encoded sample per row.
        num_classes: Width of the one-hot target vectors.

    Returns:
        (inputs, targets): ``inputs`` holds every id of each row except the
        last one; ``targets`` is the last id of each row, one-hot encoded.
    '''
    inputs = ar(samples[:, :-1])
    targets = to_categorical(samples[:, -1], num_classes=num_classes)
    return inputs, targets


# Cut indices for a 90% / 5% / 5% train / validation / test split.
tr = int(len(sequencies) * 0.9)   # training is [0, tr): the first 90%
va = int(len(sequencies) * 0.95)  # validation is [tr, va): the next 5%
te = len(sequencies)              # testing is [va, te): the last 5%
training, validation, testing = sequencies[:tr], sequencies[tr:va], sequencies[va:te]

tr_input, tr_output = _split_input_output(training, vocab_size)
va_input, va_output = _split_input_output(validation, vocab_size)
te_input, te_output = _split_input_output(testing, vocab_size)

In [37]:
# Number of word ids in each input row (the sample length minus the target).
seq_length = np.shape(tr_input)[1]
print(seq_length)

29


In [38]:
# Width shared by the embedding vectors, both LSTM layers and the hidden
# dense layer.
HIDDEN_UNITS = 300

model = Sequential()
# Map each word id of the input row to a dense vector.
model.add(Embedding(vocab_size, HIDDEN_UNITS, input_length=seq_length))
# The first LSTM returns its full output sequence so the second can consume it.
model.add(LSTM(HIDDEN_UNITS, return_sequences=True))
model.add(LSTM(HIDDEN_UNITS))
model.add(Dense(HIDDEN_UNITS, activation='relu'))
# One softmax output per vocabulary entry: the distribution of the next word.
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 29, 300)           357000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 29, 300)           721200    
_________________________________________________________________
lstm_8 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_7 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_8 (Dense)              (None, 1190)              358190    
Total params: 2,247,890
Trainable params: 2,247,890
Non-trainable params: 0
_________________________________________________________________
None


In [39]:
# Train the network as a multi-class classifier over the vocabulary and keep
# the returned training history for later inspection.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history = model.fit(
    x=tr_input,
    y=tr_output,
    batch_size=32,
    epochs=100,
    validation_data=(va_input, va_output),
)

Train on 3965 samples, validate on 220 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [40]:
from random import randint

# Pick a random sample phrase to prime the generator. randint() includes BOTH
# endpoints, so the upper bound has to be len - 1; passing len itself can
# select an index one past the end and raise IndexError.
seed_text = ngrams_samples[randint(0, len(ngrams_samples) - 1)]
print(seed_text)

of Dearth ( 1158-60 ) were at the time of this tale long past and the Hobbits had again become accustomed to plenty . The land was rich and kindly


In [41]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    '''Generate ``n_words`` words that continue ``seed_text``.

    Each step encodes the text produced so far, keeps the last
    ``seq_length`` word ids, asks the model for the most likely next word and
    appends that word to the text before the next step.

    Args:
        model: Trained model whose ``predict`` returns one row of per-word
            scores for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: Number of word ids the model expects per input row.
        seed_text: Text used to start the generation.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces. The seed text itself is
        not part of the returned string.
    '''
    # Build the id -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # keep only the most recent seq_length ids (shorter inputs get padded)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Sequential.predict_classes() no longer exists in current Keras
        # releases; the argmax over predict() yields the same class id.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # An id without a word (such as 0) maps to an empty string.
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# generate new text: continue the seed phrase by 50 words
generated = generate_seq(
    model=model,
    tokenizer=tokenizer,
    seq_length=seq_length,
    seed_text=seed_text,
    n_words=50,
)
print(generated)

, and though it had to been deserted when they entered it , it had before been well in order to provide the necessary background of 'history to its manner , and always plodded on , mostly by night , till i stood by balin 's tomb in moria .
