In [4]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.utils import to_categorical

In [176]:
# Sample dataset
data = [
    "The cat sat on the mat",
    "The dog sat on the log",
    "Dogs and cats are friends"
]

In [177]:
# Tokenization of the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
tokenizer.word_index

{'the': 1,
 'sat': 2,
 'on': 3,
 'cat': 4,
 'mat': 5,
 'dog': 6,
 'log': 7,
 'dogs': 8,
 'and': 9,
 'cats': 10,
 'are': 11,
 'friends': 12}

In [178]:
# Preparing the input sequence

input_sequences = list()
for line in data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [179]:
tokenizer.texts_to_sequences(data)

[[1, 4, 2, 3, 1, 5], [1, 6, 2, 3, 1, 7], [8, 9, 10, 11, 12]]

In [180]:
input_sequences

[[1, 4],
 [1, 4, 2],
 [1, 4, 2, 3],
 [1, 4, 2, 3, 1],
 [1, 4, 2, 3, 1, 5],
 [1, 6],
 [1, 6, 2],
 [1, 6, 2, 3],
 [1, 6, 2, 3, 1],
 [1, 6, 2, 3, 1, 7],
 [8, 9],
 [8, 9, 10],
 [8, 9, 10, 11],
 [8, 9, 10, 11, 12]]

In [181]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
input_sequences

array([[ 0,  0,  0,  0,  1,  4],
       [ 0,  0,  0,  1,  4,  2],
       [ 0,  0,  1,  4,  2,  3],
       [ 0,  1,  4,  2,  3,  1],
       [ 1,  4,  2,  3,  1,  5],
       [ 0,  0,  0,  0,  1,  6],
       [ 0,  0,  0,  1,  6,  2],
       [ 0,  0,  1,  6,  2,  3],
       [ 0,  1,  6,  2,  3,  1],
       [ 1,  6,  2,  3,  1,  7],
       [ 0,  0,  0,  0,  8,  9],
       [ 0,  0,  0,  8,  9, 10],
       [ 0,  0,  8,  9, 10, 11],
       [ 0,  8,  9, 10, 11, 12]], dtype=int32)

In [182]:
X, labels = input_sequences[:,:-1], input_sequences[:,-1]
y = to_categorical(labels, num_classes=13)

In [183]:
X

array([[ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  1,  4],
       [ 0,  0,  1,  4,  2],
       [ 0,  1,  4,  2,  3],
       [ 1,  4,  2,  3,  1],
       [ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  1,  6],
       [ 0,  0,  1,  6,  2],
       [ 0,  1,  6,  2,  3],
       [ 1,  6,  2,  3,  1],
       [ 0,  0,  0,  0,  8],
       [ 0,  0,  0,  8,  9],
       [ 0,  0,  8,  9, 10],
       [ 0,  8,  9, 10, 11]], dtype=int32)

In [184]:
labels

array([ 4,  2,  3,  1,  5,  6,  2,  3,  1,  7,  9, 10, 11, 12],
      dtype=int32)

In [185]:
y

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [186]:
model = Sequential([
    Embedding(13, 10),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(13, activation='softmax')
])

In [187]:
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [188]:
hist = model.fit(X, y, epochs=200)

Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2143 - loss: 2.5643
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.1429 - loss: 2.5623
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1429 - loss: 2.5603
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1429 - loss: 2.5582
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.1429 - loss: 2.5559
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1429 - loss: 2.5536
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.1429 - loss: 2.5511
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.1429 - loss: 2.5484
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.5000 - loss: 1.4771
Epoch 70/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.5000 - loss: 1.4368
Epoch 71/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.5000 - loss: 1.3965
Epoch 72/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5000 - loss: 1.3567
Epoch 73/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.5714 - loss: 1.3177
Epoch 74/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.5714 - loss: 1.2799
Epoch 75/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.7143 - loss: 1.2439
Epoch 76/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.7143 - loss: 1.2094
Epoch 77/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8571 - loss: 0.5224
Epoch 138/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.8571 - loss: 0.5175
Epoch 139/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.8571 - loss: 0.5149
Epoch 140/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.8571 - loss: 0.5111
Epoch 141/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.8571 - loss: 0.5058
Epoch 142/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.8571 - loss: 0.5019
Epoch 143/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.8571 - loss: 0.4990
Epoch 144/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.8571 - loss: 0.4947
Epoch 145/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [215]:
data

['The cat sat on the mat',
 'The dog sat on the log',
 'Dogs and cats are friends']

In [212]:
def predict_next_word(text):
    sequence = tokenizer.texts_to_sequences([text])[0]
    sequence = pad_sequences([sequence], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(sequence)
    idx = np.argmax(predicted)
    return tokenizer.index_word[idx]

In [228]:
predict_next_word("the dog sat on the log")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


'log'