In [1]:
# ==========================
# STEP 1: Install & Import
# ==========================
!pip install tensorflow nltk

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU
import nltk
import re
nltk.download('punkt')

# ==========================
# STEP 2: Load Dataset
# ==========================

# Small inline corpus; it is the only data the tokenizer is fitted on and the
# only source of training sequences below.
text = """
Once upon a time there was a young girl who loved to read books.
She spent hours every day in the library, dreaming of adventures and magic.
One day, she found a secret passage hidden behind a shelf.
The passage led her to a mysterious world filled with wonders she had never imagined.
"""

# ==========================
# STEP 3: Data Preprocessing
# ==========================
def clean_text(txt):
    """Normalize raw text before tokenization.

    The input is lowercased, then every run of characters outside
    ``[a-zA-Z0-9]`` (punctuation, whitespace, newlines, ...) is replaced by
    a single space.  Runs at the very start or end are replaced too, so the
    result may begin or end with a space.
    """
    lowered = txt.lower()
    return re.sub(r"[^a-zA-Z0-9]+", " ", lowered)

clean_corpus = clean_text(text)

# Tokenization: build the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([clean_corpus])
# +1 because the Keras Tokenizer assigns ids starting at 1; id 0 is never
# given to a word and doubles as the padding value below.
total_words = len(tokenizer.word_index) + 1

# Convert text to sequences.
# The corpus is encoded once and the growing prefixes are sliced from that
# single id list, rather than re-joining and re-tokenizing every prefix
# (which repeated the same tokenization work once per word).
words = clean_corpus.split()
token_ids = tokenizer.texts_to_sequences([clean_corpus])[0]
if len(token_ids) < 2:
    raise ValueError(
        "corpus must contain at least two words to build training sequences"
    )

# Prefixes of length 2..N; the last id in each prefix is the word to predict
# from the ids before it.
input_sequences = [token_ids[:end] for end in range(2, len(token_ids) + 1)]

# Padding sequences: left-pad with zeros so every row has the same length.
# pad_sequences already returns a NumPy array, so no extra np.array() wrap.
max_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# ==========================
# STEP 4: Input & Output
# ==========================
# Every column except the last is the model input (the context words); the
# last column holds the id of the word to predict.
X, next_word_ids = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the target ids across the whole vocabulary.
y = tf.keras.utils.to_categorical(next_word_ids, num_classes=total_words)

# ==========================
# STEP 5: Build Model
# ==========================
# Next-word classifier: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# Declare the input shape with an explicit Input instead of Embedding's
# `input_length` argument, which is deprecated in Keras 3 and triggers a
# warning there. Each sample is a row of max_len - 1 context-word ids.
model.add(tf.keras.Input(shape=(max_len - 1,)))
model.add(Embedding(total_words, 100))
model.add(LSTM(150))
# One output unit per vocabulary id (including the unused padding id 0).
model.add(Dense(total_words, activation='softmax'))

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# ==========================
# STEP 6: Train Model
# ==========================
history = model.fit(X, y, epochs=200, verbose=1)

# ==========================
# STEP 7: Generate Text
# ==========================
def generate_text(seed_text, next_words=20):
    """Extend *seed_text* by greedily predicting one word at a time.

    Each step encodes the text generated so far with the module-level
    ``tokenizer``, left-pads it to ``max_len - 1`` ids, asks ``model`` for
    the next-word distribution and appends the highest-probability word.

    Args:
        seed_text: Starting text. It is passed to the tokenizer as-is.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an id that maps to no word (see below).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len - 1, padding='pre')
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Direct id -> word lookup instead of scanning word_index linearly.
        output_word = tokenizer.index_word.get(predicted)
        if not output_word:
            # The predicted id (e.g. padding id 0) maps to no word. Appending
            # nothing would leave the encoded input unchanged, so every
            # remaining step would repeat this prediction and only add stray
            # spaces; stop instead.
            break
        seed_text += " " + output_word
    return seed_text

# ==========================
# STEP 8: Test Generator
# ==========================
# Smoke-test the generator on two seeds taken from the training corpus.
for seed in ("once upon a", "the passage led"):
    print(generate_text(seed, next_words=15))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Epoch 1/200




2/2 ━━━━━━━━━━━━━━━━━━━━ 3s 89ms/step - accuracy: 0.0000e+00 - loss: 3.7856
Epoch 2/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 80ms/step - accuracy: 0.1522 - loss: 3.7729
Epoch 3/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 83ms/step - accuracy: 0.1162 - loss: 3.7593
Epoch 4/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 77ms/step - accuracy: 0.1162 - loss: 3.7452
Epoch 5/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 85ms/step - accuracy: 0.1058 - loss: 3.7239
Epoch 6/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 88ms/step - accuracy: 0.0954 - loss: 3.6871
Epoch 7/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 84ms/step - accuracy: 0.1058 - loss: 3.6098
Epoch 8/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 88ms/step - accuracy: 0.0849 - loss: 3.5606
Epoch 9/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 