In [1]:
import numpy as np
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed, to_categorical



In [2]:
# Sample text: the toy corpus that is passed to preprocess_text() and turned into bigram pairs
text = "The quick brown fox jumps over the lazy dog. The quick brown fox is very quick."



In [3]:
# Step 1: Tokenize and preprocess the text
def preprocess_text(text):
    """Fit a fresh ``Tokenizer`` on ``text`` and encode the text with it.

    Args:
        text: Raw string to tokenize. It is lower-cased before fitting.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the encoded form of the single
        input text (first element of ``texts_to_sequences``) and the fitted
        ``Tokenizer`` instance.
    """
    corpus = [text.lower()]  # single-document corpus, lower-cased

    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(corpus)  # build the vocabulary

    # Only one document was encoded, so unwrap the outer list.
    encoded = word_tokenizer.texts_to_sequences(corpus)
    return encoded[0], word_tokenizer



In [4]:
# Encode the sample text and keep the fitted tokenizer for decoding later
sequences, tokenizer = preprocess_text(text)
# One extra slot on top of the known words (reserved for padding)
vocab_size = 1 + len(tokenizer.word_index)

# Step 2: Create bigram input-output pairs
def create_bigram_pairs(sequences):
    """Build (current word, next word) training pairs from a token sequence.

    Args:
        sequences: 1-D sequence (list or array) of integer word indices.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays:
          * ``X`` has shape ``(len(sequences) - 1, 1)``: one input word per row.
          * ``y`` has shape ``(len(sequences) - 1,)``: the word that follows it.
        When ``sequences`` has fewer than two tokens there are no pairs and the
        arrays are empty, but they keep the same rank (``(0, 1)`` and ``(0,)``)
        and an integer dtype, so downstream code sees a consistent layout.
    """
    tokens = np.array(sequences)  # always a copy; the caller's data is never aliased
    if tokens.size == 0:
        # np.array([]) defaults to float64; token ids are integers.
        tokens = tokens.astype(int)

    # Every token except the last is an input; every token except the first is
    # a target. Copy so X and y do not share memory with each other.
    X = tokens[:-1].reshape(-1, 1).copy()
    y = tokens[1:].copy()
    return X, y

# Generate bigram pairs: each row of X holds one word index, y holds the index of the word that follows it
X, y = create_bigram_pairs(sequences)



In [5]:
# Step 3: Build the GRU model
# Seed the Python, NumPy and backend RNGs before any layer is created so that
# weight initialisation (and therefore training and the generated text) is
# repeatable across Restart & Run All.
set_random_seed(42)

model = Sequential()
# `input_length` is deprecated in Keras 3 and only triggers a warning there;
# the length-1 input shape is inferred from X the first time the model is fit.
model.add(Embedding(input_dim=vocab_size, output_dim=50))  # Embedding layer
model.add(GRU(64, return_sequences=False))  # GRU layer: keep only the final state
model.add(Dense(vocab_size, activation='softmax'))  # Output layer: distribution over the vocabulary






In [6]:
# Compile the model
# Sparse loss variant: the targets in y are integer word indices, not one-hot rows
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)



In [7]:
# Step 4: Train the model
# Keep the returned History object so the per-epoch metrics can be inspected
# afterwards (e.g. history.history['loss']); binding it to a name also stops the
# bare object repr from being echoed as the cell's output.
history = model.fit(X, y, epochs=100, verbose=1)



Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2000 - loss: 2.3997
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.2000 - loss: 2.3933
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.2667 - loss: 2.3869
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2000 - loss: 2.3806
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2000 - loss: 2.3742
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.2000 - loss: 2.3677
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4000 - loss: 2.3612
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.5333 - loss: 2.3546
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7e674fc49ae0>

In [8]:
# Step 5: Function for generating text
def generate_text(seed_text, next_words, model, tokenizer):
    """Greedily extend ``seed_text`` one word at a time with a bigram model.

    Only the last in-vocabulary word of the seed conditions the first
    prediction; after that each predicted word becomes the next input.

    Args:
        seed_text: Text to continue.
        next_words: Maximum number of words to append. Values <= 0 append
            nothing.
        model: Object with ``predict(x, verbose=0)`` that returns, for an input
            of shape ``(1, 1)``, an array of scores whose last axis indexes the
            vocabulary.
        tokenizer: Object exposing ``texts_to_sequences`` and an
            ``index_word`` mapping (index -> word).

    Returns:
        ``seed_text`` followed by the generated words, space-separated.
        The seed is returned unchanged when it contains no word known to the
        tokenizer (there is nothing to condition on). Generation stops early
        if the model predicts an index with no associated word, such as the
        reserved index 0.
    """
    seed_tokens = tokenizer.texts_to_sequences([seed_text])[0]
    if not seed_tokens:
        # No known word in the seed: nothing to feed the model.
        return seed_text

    # Tokenize once, then carry the predicted index forward. This assumes
    # re-encoding a generated word yields the index it was decoded from
    # (index_word being the inverse of the tokenizer's vocabulary).
    current_token = seed_tokens[-1]
    generated_words = []
    for _ in range(next_words):
        # Explicit (batch=1, timesteps=1) input instead of a rank-1 array.
        scores = model.predict(np.array([[current_token]]), verbose=0)
        next_token = int(np.argmax(scores, axis=-1)[0])  # greedy choice

        next_word = tokenizer.index_word.get(next_token)
        if next_word is None:
            # Predicted an index without a word (e.g. padding): stop here.
            break

        generated_words.append(next_word)
        current_token = next_token

    return " ".join([seed_text, *generated_words])



In [9]:
# Demo: extend the seed phrase by five predicted words
seed_text = "the quick"
print(
    generate_text(seed_text, next_words=5, model=model, tokenizer=tokenizer)
)

the quick brown fox jumps over the


In [10]:


# Demo (same seed as the previous example): five more words via argmax decoding
seed_text = "the quick"
print(
    generate_text(seed_text, next_words=5, model=model, tokenizer=tokenizer)
)

the quick brown fox jumps over the


In [11]:


# Demo with a different seed: only its last word ("fox") drives the first prediction
seed_text = "brown fox"
print(
    generate_text(seed_text, next_words=5, model=model, tokenizer=tokenizer)
)

brown fox jumps over the quick brown
