In [16]:
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from nltk import ngrams
from keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def tokenizeFile(fileName: str, encoding: str = "ISO-8859-1") -> list:
    '''Read a text file and split its whole contents into word tokens.

    Args:
        fileName: Path of the text file. It is handed to ``open`` unchanged,
            so a relative path is resolved against the current working
            directory.
        encoding: Text encoding used to decode the file. Defaults to
            "ISO-8859-1", the encoding this function previously hard-coded.

    Returns:
        A list with the tokens that ``word_tokenize`` produces for the
        complete file contents, in reading order.
    '''
    with open(fileName, 'r', encoding=encoding) as file:
        text = file.read()
    # Tokenize once the file handle is closed; list() guarantees callers get
    # a plain list they can slice and measure.
    return list(word_tokenize(text))

In [84]:
# Cut number of words so we can fit everything in memory:
# only the first 1/50 of the book's tokens is kept.
tokens_all = tokenizeFile("TheLordOfTheRings_Book1.txt")
tokens = tokens_all[:len(tokens_all)//50]
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# Change this parameter to decrease or increase the size of the training samples
N = 21
ngrams_tokens = list(ngrams(tokens, N))

# Shuffle with a fixed seed so the sample order (and therefore the later
# train/validation/test split) is identical on every Restart & Run All.
# A local RandomState is used so the global NumPy random state is untouched.
SHUFFLE_SEED = 42
np.random.RandomState(SHUFFLE_SEED).shuffle(ngrams_tokens)
# NOTE(review): consecutive n-grams overlap in all but one token, so shuffling
# before splitting puts near-duplicate windows in different splits. Consider
# splitting `tokens` first if held-out metrics need to be trustworthy.
print('Total ngrams: %d' % len(ngrams_tokens))

# Combine tokens into training sample phrases (one space-joined string per n-gram)
ngrams_samples = [" ".join(x) for x in ngrams_tokens]
print("Total Samples: %d" % len(ngrams_samples))

Total Tokens: 4435
Unique Tokens: 1261
Total ngrams: 4415
Total Samples: 4415


Assign a unique ID to each token and make an inverted index so we can look up terms easily later.

Determine the vocabulary size for the Embedding layer.

In [113]:
# Learn the word -> integer-id mapping from the n-gram sample phrases.
# Only tab and newline are passed as filter characters.
tokenizer = Tokenizer(filters='\t\n')
tokenizer.fit_on_texts(ngrams_samples)

# One more than the number of known words (Keras word ids presumably start
# at 1, leaving index 0 unused -- confirm against the Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)

# Encode every sample phrase as a row of integer ids.
sequencies = np.array(tokenizer.texts_to_sequences(ngrams_samples))

print("Sequences Shape: " + str(sequencies.shape))
print("Vocabulary size: %d" % vocab_size)

Sequences Shape: (4415, 21)
Vocabulary size: 1190


Split the sequences into training (first 90%), validation (next 5%), and testing (last 5%) sets.

Then split each sample into an input and an expected output: the input is every token except the last, and the output is the last token.

In [119]:
from numpy import array as ar  # alias kept: cells outside this view may still use `ar`

# Slice boundaries (end indices), not sizes: rows [0, tr) are training,
# [tr, va) are validation and [va, te) are testing.
tr = int(len(sequencies) * 0.9)   # end of the training slice (first 90% of data)
va = int(len(sequencies) * 0.95)  # end of the validation slice (next 5% of data)
te = len(sequencies)              # end of the testing slice (last 5% of data)
training, validation, testing = sequencies[:tr], sequencies[tr:va], sequencies[va:te]

# `sequencies` is a 2-D integer array (samples x N), so column slicing gives
# every split the same ndarray type: all columns but the last are the input,
# the last column is the id of the word to predict (one-hot encoded below).
tr_input = training[:, :-1]
tr_output = to_categorical(training[:, -1], num_classes=vocab_size)

va_input = validation[:, :-1]
va_output = to_categorical(validation[:, -1], num_classes=vocab_size)

te_input = testing[:, :-1]
te_output = to_categorical(testing[:, -1], num_classes=vocab_size)

In [123]:
# Sanity check: show the shape of the training inputs, then of the one-hot
# training targets, one per line.
print(tr_input.shape, tr_output.shape, sep="\n")

# Debugging aid (removed): a loop over `sequencies` that printed the index,
# length and contents of any row whose length differed from the first row's.

(3973, 20)
(3973, 1190)


In [125]:
# Number of tokens in each training input row (second axis of tr_input),
# i.e. one less than the n-gram size N because the last token is the target.
seq_length = tr_input.shape[1]
print(seq_length)

20


In [126]:
# Next-word prediction network: embedding -> two stacked LSTMs -> dense head.
model = Sequential()
# Map each word id to a 50-dimensional vector; inputs are seq_length ids long.
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# One output unit per vocabulary entry; softmax matches the one-hot targets.
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 50)            59500     
_________________________________________________________________
lstm_5 (LSTM)                (None, 20, 100)           60400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_6 (Dense)              (None, 1190)              120190    
Total params: 330,590
Trainable params: 330,590
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Multi-class next-word prediction over one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Pass the held-out validation split so each epoch also reports val_loss and
# val_accuracy; previously the split was built but never used during training.
# np.asarray makes this work whether va_input is a list of rows or an ndarray.
model.fit(
    tr_input,
    tr_output,
    batch_size=128,
    epochs=10,
    validation_data=(np.asarray(va_input), va_output),
)

Epoch 1/10
