In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
import string

# Sample text data for illustration (replace with your text corpus)
text = """In the beginning God created the heavens and the earth. Now the earth was formless and empty, 
darkness was over the surface of the deep, and the Spirit of God was hovering over the waters. 
And God said, “Let there be light,” and there was light. God saw that the light was good, 
and he separated the light from the darkness."""

### Step 1: Data Preprocessing ###
# Clean text: Convert text to lowercase and remove punctuation
def clean_text(txt):
    txt = txt.lower()
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    return txt

cleaned_text = clean_text(text)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])
total_words = len(tokenizer.word_index) + 1  # Total number of unique words
print(f"Total words: {total_words}")

# Create sequences of words
input_sequences = []
for line in cleaned_text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Pad sequences to ensure uniform length
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split the data into input (X) and output (y)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the output labels
y = to_categorical(y, num_classes=total_words)

### Step 2: Build the RNN Model ###
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))  # Embedding layer
model.add(LSTM(150, return_sequences=True))  # First LSTM layer
model.add(LSTM(100))  # Second LSTM layer
model.add(Dense(total_words, activation='softmax'))  # Output layer with softmax activation

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

### Step 3: Train the Model ###
history = model.fit(X, y, epochs=100, verbose=1)

### Step 4: Generate New Text Using the Model ###
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Example of generating text
seed_text = "God said"
generated_text = generate_text(seed_text, next_words=10, model=model, max_sequence_len=max_sequence_len)
print(generated_text)


Total words: 33


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 346ms/step - accuracy: 0.0333 - loss: 3.4966
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 411ms/step - accuracy: 0.1743 - loss: 3.4774
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 341ms/step - accuracy: 0.1847 - loss: 3.4432
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 312ms/step - accuracy: 0.1951 - loss: 3.3453
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 372ms/step - accuracy: 0.1743 - loss: 3.2144
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 334ms/step - accuracy: 0.1951 - loss: 3.1543
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 329ms/step - accuracy: 0.1639 - loss: 3.1872
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 403ms/step - accuracy: 0.1743 - loss: 3.1279
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━