In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import numpy as np

In [15]:
# 1. Collect Data
# Toy training corpus: one short sentence per line.
data = """I like to eat apples
i like to eat bananas
i like to eat oranges
he likes to eat grapes
she likes to eat watermelon
they like to eat peaches
we enjoy eating mangoes
they enjoy eating pineapples
i enjoy reading books
she enjoys reading novels
he enjoys watching movies
they enjoy watching football
she likes to play soccer
he likes to play cricket
we like to play basketball
they like playing games
They enjoy watching movies"""

# 2. Prepare Data
# Each line is one training sentence; lower-case it so "They" and "they"
# share a single vocabulary entry.
corpus = [sentence.lower() for sentence in data.split("\n")]

# Learn the word -> integer index mapping from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One slot more than the number of distinct words: this leaves room for
# index 0, which the padding step later in the notebook fills sequences with.
total_words = 1 + len(tokenizer.word_index)

In [16]:
# Create input sequences
# Every prefix of length >= 2 of each tokenised sentence becomes one training
# example: its last token is the label, the tokens before it are the context.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):  # tokenise once, in bulk
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Pad sequences
# Left-pad with zeros so every example has the same length and the label
# always sits in the final column.
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Split data
# All columns but the last are the model input; the last column is the target
# word index, one-hot encoded for categorical cross-entropy.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# 3. Build Model
# Seed Python, NumPy and TensorFlow so weight initialisation (and the training
# that follows) is repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# The input length is declared with an explicit Input layer; the Embedding
# `input_length` argument is deprecated in Keras 3 and only triggers a warning.
model = Sequential([
    tf.keras.Input(shape=(max_seq_len - 1,), dtype="int32"),
    Embedding(total_words, 10),
    SimpleRNN(64),
    Dense(total_words, activation='softmax'),
])

In [17]:
# Categorical cross-entropy matches the one-hot targets in `ys`; accuracy is
# tracked only for monitoring.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the full (tiny) dataset. As the last expression of the cell, the
# returned History object is also displayed.
model.fit(x=xs, y=ys, epochs=300, verbose=1)

Epoch 1/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.0326 - loss: 3.4864
Epoch 2/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.0875 - loss: 3.4648
Epoch 3/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1090 - loss: 3.4363
Epoch 4/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1403 - loss: 3.3999
Epoch 5/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1083 - loss: 3.3627
Epoch 6/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0979 - loss: 3.3192
Epoch 7/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.0979 - loss: 3.2576
Epoch 8/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0979 - loss: 3.2166
Epoch 9/300
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7fc123b36e10>

In [18]:
# 4. Generate Text
def predict_next_words(model, tokenizer, text, num_words=3, max_len=None):
    """Greedily extend ``text`` with up to ``num_words`` predicted words.

    At each step the current text is tokenised, left-padded to the model's
    context length, and the highest-probability word index is appended.

    Args:
        model: Object with ``predict(x, verbose=0)`` returning one row of
            per-word probabilities for a batch of one padded sequence.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        text: Seed text to continue.
        num_words: Maximum number of words to append.
        max_len: Length to pad/truncate the context to. When ``None`` it is
            taken from ``model.input_shape[1]`` if that is a fixed integer,
            otherwise from the notebook-level ``max_seq_len - 1``.

    Returns:
        The seed text followed by the predicted words, space-separated.
        Generation stops early if the predicted index has no word in
        ``tokenizer.index_word`` (for example the padding index 0).
    """
    if max_len is None:
        # Prefer the model's own input length over a notebook global so the
        # function keeps working when the global is stale or missing.
        try:
            input_shape = model.input_shape
        except (AttributeError, ValueError, RuntimeError):
            input_shape = None  # e.g. model not built yet
        if (
            isinstance(input_shape, (tuple, list))
            and len(input_shape) >= 2
            and isinstance(input_shape[1], int)
        ):
            max_len = input_shape[1]
        else:
            max_len = max_seq_len - 1

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        padded = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted_probs = model.predict(padded, verbose=0)
        # Arg-max over the vocabulary axis of the single batch row; cast to a
        # plain int so the dictionary lookup does not depend on NumPy scalars.
        predicted_index = int(np.argmax(predicted_probs, axis=-1)[0])
        predicted_word = tokenizer.index_word.get(predicted_index, '')
        if not predicted_word:
            break
        text = f"{text} {predicted_word}"
    return text

In [21]:

# Example usage
# Continue a short prompt with up to five greedily predicted words.
seed_text = "They enjoy"
generated_text = predict_next_words(
    model=model,
    tokenizer=tokenizer,
    text=seed_text,
    num_words=5,
)
print("Generated:", generated_text)

Generated: They enjoy watching movies peaches soccer watermelon
