In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# Toy corpus: a single string whose embedded newline separates the lines.
data = 'In the town of Athy one Jeremy Lanigan \n Battered away'
corpus = data.lower().split('\n')

# Learn the word -> integer index vocabulary from the corpus lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words, leaving room for index 0
# (0 shows up only as the padding value in the padded sequences).
total_words = len(tokenizer.word_index) + 1

In [17]:
# Build the training examples: for every corpus line, collect each prefix of
# its token list that holds at least two tokens.
# Example: tokens [1, 2, 3, 4] produce
#   [1, 2]
#   [1, 2, 3]
#   [1, 2, 3, 4]
input_sequences = []

# texts_to_sequences yields one token list per input text, so the whole
# corpus can be converted in a single call.
for token_ids in tokenizer.texts_to_sequences(corpus):
    prefixes = [token_ids[:stop] for stop in range(2, len(token_ids) + 1)]
    input_sequences.extend(prefixes)

In [18]:
# length of the longest n-gram sequence generated from the corpus
max_sequence_len = max(map(len, input_sequences))

In [19]:
# Pad at the front ('pre') so every sequence becomes max_sequence_len long
# and the list of sequences turns into one rectangular array.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [21]:
# Inspect the padded array: one row per n-gram sequence.
print(input_sequences)

[[ 0  0  0  0  0  0  1  2]
 [ 0  0  0  0  0  1  2  3]
 [ 0  0  0  0  1  2  3  4]
 [ 0  0  0  1  2  3  4  5]
 [ 0  0  1  2  3  4  5  6]
 [ 0  1  2  3  4  5  6  7]
 [ 1  2  3  4  5  6  7  8]
 [ 0  0  0  0  0  0  9 10]]


In [22]:
# Split every padded row into features and label:
#   X -> all columns except the last one (the preceding tokens)
#   y -> the last column (the token that follows them)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(X)
print(y)

[[0 0 0 0 0 0 1]
 [0 0 0 0 0 1 2]
 [0 0 0 0 1 2 3]
 [0 0 0 1 2 3 4]
 [0 0 1 2 3 4 5]
 [0 1 2 3 4 5 6]
 [1 2 3 4 5 6 7]
 [0 0 0 0 0 0 9]]
[ 2  3  4  5  6  7  8 10]


In [24]:
# One-hot encode the labels so that next-word prediction can be treated as
# classification over the vocabulary: each label becomes a vector of
# length total_words in which the position given by the token value is 1.
#
# The integer labels are read from the last column of input_sequences
# instead of from `y` itself. That keeps this cell idempotent: re-running it
# would otherwise pass the already one-hot `y` back into to_categorical and
# silently produce an array with an extra dimension.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)