In [3]:
# LSTM TEXT GENERATION USING SHAKESPEARE DATA

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [5]:
# 1. Load and Preprocess Data

# Load text file
with open(r"C:\Users\DHRUV\Downloads\shakespeare.txt", "r", encoding="utf-8") as file:
    text = file.read().lower()

# Remove punctuation
import string
text = text.translate(str.maketrans("", "", string.punctuation))

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

total_words = len(tokenizer.word_index) + 1

# Create input sequences
input_sequences = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Padding sequences
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(
    input_sequences, maxlen=max_sequence_len, padding="pre"
)

# Split predictors and label
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# One-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=total_words)


In [6]:
# 2. Build LSTM Model


model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len - 1),
    LSTM(150, return_sequences=True),
    LSTM(100),
    Dense(total_words, activation="softmax")
])

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()



In [7]:
# 3. Train Model


early_stop = EarlyStopping(
    monitor="loss",
    patience=3,
    restore_best_weights=True
)

model.fit(
    X,
    y,
    epochs=20,
    batch_size=128,
    callbacks=[early_stop]
)

Epoch 1/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 82ms/step - accuracy: 0.0293 - loss: 7.2260
Epoch 2/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 100ms/step - accuracy: 0.0498 - loss: 6.5363
Epoch 3/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 86ms/step - accuracy: 0.0762 - loss: 6.2333
Epoch 4/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 87ms/step - accuracy: 0.0906 - loss: 6.0265
Epoch 5/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 90ms/step - accuracy: 0.0973 - loss: 5.8791
Epoch 6/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 84ms/step - accuracy: 0.1052 - loss: 5.7306
Epoch 7/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 91ms/step - accuracy: 0.1101 - loss: 5.6023
Epoch 8/20
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 87ms/step - accuracy: 0.1151 - loss: 5.4664

<keras.src.callbacks.history.History at 0x197b6bfdd00>

In [8]:
# 4. Text Generation Function

def generate_text(seed_text, next_words=30):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding="pre"
        )
        predicted = np.argmax(model.predict(token_list, verbose=0))

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += " " + output_word
    return seed_text


In [10]:
# 5. Generate Sample Text
print("Seed: 'to be or not to be'")
print(generate_text("to be or not to be", 40))
print("\n")
print("Seed: 'love looks not with the eyes'")
print(generate_text("love looks not with the eyes", 40))

Seed: 'to be or not to be'
to be or not to be a man that i have been a man to be a man of the a a roman a a a a a one i the priests sable the a the a the the a the a the a the the


Seed: 'love looks not with the eyes'
love looks not with the eyes of the world and i am not the man that i have not a a a a a a the a the a the a the a the my the the delivered long’st belike unpaid sextus homely a a a


Bonus Experiments 

Increased LSTM depth improved grammatical flow

Larger sequence length improved context understanding

EarlyStopping reduced overfitting

Word-level modeling gives more semantic meaning than character-level
