In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus: two lines of text separated by a newline.
data = 'In the town of Athy one Jeremy Lanigan \n Battered away'

# Lowercase everything and treat each newline-delimited line as one sentence.
corpus = data.lower().split('\n')

# Build the word -> integer index vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the vocabulary size: index 0 is not assigned to any
# word and is used as the padding value further down.
total_words = 1 + len(tokenizer.word_index)

In [None]:
# Collect every prefix of length >= 2 of each tokenized line. For a line
# encoded as [4, 2, 66, 8] this yields:
#   [4, 2]
#   [4, 2, 66]
#   [4, 2, 66, 8]
input_sequences = []

for sentence in corpus:
  # texts_to_sequences works on a batch and returns one list per text;
  # a single text is passed in, so take the only entry.
  token_list = tokenizer.texts_to_sequences([sentence])[0]
  input_sequences.extend(token_list[:stop] for stop in range(2, len(token_list) + 1))

In [None]:
# length of the longest n-gram sequence collected above
max_sequence_len = max(map(len, input_sequences))

In [None]:
# Left-pad every n-gram sequence with zeros to a common length so that they
# stack into a single 2-D array.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [None]:
# show the padded n-gram matrix (one row per sequence, zeros are padding)
print(input_sequences)

[[ 0  0  0  0  0  0  1  2]
 [ 0  0  0  0  0  1  2  3]
 [ 0  0  0  0  1  2  3  4]
 [ 0  0  0  1  2  3  4  5]
 [ 0  0  1  2  3  4  5  6]
 [ 0  1  2  3  4  5  6  7]
 [ 1  2  3  4  5  6  7  8]
 [ 0  0  0  0  0  0  9 10]]


In [None]:
# Split each padded row: every token except the final one is the model input
# (X); the final token is the label to predict (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(X)
print(y)

[[0 0 0 0 0 0 1]
 [0 0 0 0 0 1 2]
 [0 0 0 0 1 2 3]
 [0 0 0 1 2 3 4]
 [0 0 1 2 3 4 5]
 [0 1 2 3 4 5 6]
 [1 2 3 4 5 6 7]
 [0 0 0 0 0 0 9]]
[ 2  3  4  5  6  7  8 10]


In [None]:
# Next-word prediction is framed as multi-class classification with one class
# per vocabulary index. Each integer label becomes a one-hot row of length
# total_words, with a 1 at the position given by the token value.
y = tf.keras.utils.to_categorical(
    y,
    num_classes=total_words,
)

In [None]:
model = tf.keras.models.Sequential([
    # the last token of every sequence is the label, so inputs are one shorter
    tf.keras.layers.Embedding(total_words, 64, input_length=max_sequence_len - 1),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    # one softmax output per vocabulary index
    tf.keras.layers.Dense(total_words, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=500, verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7f50f5576670>

In [None]:
# Seed phrase whose next word will be predicted.
# NOTE(review): the tokenizer was created without an oov_token, so seed words
# missing from the fitted vocabulary are presumably dropped here — confirm that
# token_list is not empty for this seed.
seed_text = 'Laurence went to Dublin'

# texts_to_sequences returns one list per input text; unpack the only one.
(token_list,) = tokenizer.texts_to_sequences([seed_text])

In [None]:
# Wrap the seed tokens in a batch of one and left-pad with zeros to the
# model's input length (max_sequence_len - 1).
token_list = pad_sequences(
    [token_list],
    padding='pre',
    maxlen=max_sequence_len - 1,
)

In [None]:
# Run the model on the one-row batch and take the index of the largest value
# along the last axis for that row.
predicted = model.predict(token_list).argmax(axis=-1)[0]



In [None]:
# Map the predicted class index back to its word. Index 0 is the padding value
# and has no entry in index_word, so look it up with .get() and fall back to an
# empty string rather than raising KeyError if the model ever predicts it.
output_word = tokenizer.index_word.get(predicted, '')
print(f'{seed_text} {output_word}')

Laurence went to Dublin away


In [None]:
# predict 10 words ahead, feeding each predicted word back in as context
# NOTE: seed_text is extended in place, so re-running this cell appends ten
# more words to the already extended text.
for _ in range(10):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  # left-pad (or trim) to the model's input length of max_sequence_len - 1
  token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
  predicted = model.predict(token_list, verbose=0)
  predicted = np.argmax(predicted, axis=-1)[0]
  # Index 0 is the padding value and has no entry in index_word; stop
  # generating instead of raising KeyError if the model predicts it.
  output_word = tokenizer.index_word.get(predicted)
  if output_word is None:
    break
  seed_text += f' {output_word}'

print(seed_text)

Laurence went to Dublin away away away the town athy one jeremy lanigan lanigan


In [None]:
# model for a larger corpus: wider embedding (100) and LSTM (150) than the
# small demo model, trained with an explicit Adam learning rate
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_len-1),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150)),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

# `metrics` was misspelled as `metrcis`, which compile() rejects as an unknown
# keyword; `lr` is the deprecated spelling of Adam's `learning_rate` argument.
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=['accuracy'],
)
# keep the History object so the training curves can be inspected afterwards
history = model.fit(X, y, epochs=100, verbose=1)