In [16]:
# Import libraries
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [17]:
# 1. Load and clean text
def load_and_clean_text(csv_path):
    """Read the play dataset and return its dialogue as one normalized string.

    Every non-null value of the ``PlayerLine`` column is converted to ``str``,
    joined with single spaces and lower-cased. ASCII punctuation and runs of
    digits are then deleted, and each run of whitespace is collapsed to a
    single space. The result is not stripped, so it may begin or end with a
    space.

    Args:
        csv_path: Location of a CSV file (anything ``pandas.read_csv``
            accepts) that contains a ``PlayerLine`` column.

    Returns:
        str: The cleaned corpus text.
    """
    frame = pd.read_csv(csv_path)
    spoken_lines = frame['PlayerLine'].dropna().astype(str).tolist()
    corpus = ' '.join(spoken_lines).lower()

    # Applied in order: deleting punctuation and digits can leave several
    # spaces side by side, so whitespace is normalised last.
    substitutions = (
        (r'[%s]' % re.escape(string.punctuation), ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        corpus = re.sub(pattern, replacement, corpus)
    return corpus


In [18]:
# 2. Tokenize and prepare sequences
def tokenize_text(text, sequence_length=5, max_sequences=20000, one_hot=True):
    """Build next-word training examples from ``text`` with a freshly fitted tokenizer.

    A Keras ``Tokenizer`` is fitted on ``text`` and the corpus is encoded to
    integer word ids once. A window of ``sequence_length + 1`` ids is then slid
    over the encoded corpus one position at a time: the first
    ``sequence_length`` ids of a window are the model input and the final id
    is the word to predict.

    Args:
        text: The corpus as a single string.
        sequence_length: Number of context words in each input row.
        max_sequences: Upper bound on the number of windows, taken from the
            start of the corpus. ``None`` uses every available window.
        one_hot: When ``True`` (the default) the targets are one-hot encoded
            with ``to_categorical`` into an array of shape
            ``(n_examples, total_words)``, which grows quickly with the
            vocabulary size. Pass ``False`` to receive the integer word ids
            instead (e.g. for a sparse categorical loss).

    Returns:
        tuple: ``(X, y, tokenizer, total_words)`` where ``X`` is an integer
        array of shape ``(n_examples, sequence_length)``, ``y`` holds the
        targets as described for ``one_hot``, ``tokenizer`` is the fitted
        tokenizer and ``total_words`` is ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If the corpus is too short (or ``max_sequences`` too
            small) to produce a single complete window.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    # Keras numbers words from 1 and keeps index 0 reserved, hence the +1.
    total_words = len(tokenizer.word_index) + 1

    # Encode the whole corpus once. Re-joining and re-encoding each window
    # repeats the same work, and drops a window whenever the tokenizer splits
    # a word differently from str.split().
    token_ids = tokenizer.texts_to_sequences([text])[0]

    window = sequence_length + 1
    available = max(len(token_ids) - sequence_length, 0)
    n_windows = available if max_sequences is None else min(available, max_sequences)
    if n_windows <= 0:
        raise ValueError(
            "Cannot build training sequences: need at least %d tokens and "
            "max_sequences >= 1, got %d tokens and max_sequences=%r"
            % (window, len(token_ids), max_sequences)
        )

    input_sequences = np.array(
        [token_ids[start:start + window] for start in range(n_windows)]
    )
    X, y = input_sequences[:, :-1], input_sequences[:, -1]
    if one_hot:
        y = to_categorical(y, num_classes=total_words)

    return X, y, tokenizer, total_words


In [19]:
# 3. Build LSTM model
def build_model(total_words, seq_length, embedding_dim=32, lstm_units=(32, 16)):
    """Create and compile the stacked-LSTM next-word classifier.

    The network is: integer input of length ``seq_length`` -> ``Embedding`` ->
    one ``LSTM`` layer per entry of ``lstm_units`` -> ``Dense`` softmax over
    the vocabulary. It is compiled with categorical cross-entropy, the Adam
    optimizer and an accuracy metric, so it expects one-hot targets.

    Args:
        total_words: Vocabulary size; used as the embedding's ``input_dim``
            and as the width of the softmax output.
        seq_length: Number of word ids in each input row.
        embedding_dim: Size of each embedding vector.
        lstm_units: Iterable with the unit count of each LSTM layer, in order.
            Every layer except the last returns its full output sequence so
            the next LSTM receives one vector per time step.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``lstm_units`` is empty.
    """
    lstm_units = tuple(lstm_units)
    if not lstm_units:
        raise ValueError("lstm_units must contain at least one layer size")

    model = Sequential()
    # Declare the input shape with an explicit Input instead of
    # Embedding(input_length=...), which newer Keras releases deprecate.
    model.add(tf.keras.Input(shape=(seq_length,), dtype='int32'))
    model.add(Embedding(input_dim=total_words, output_dim=embedding_dim))

    final_position = len(lstm_units) - 1
    for position, units in enumerate(lstm_units):
        model.add(LSTM(units, return_sequences=position < final_position))

    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [20]:
# 4. Generate text from a seed
def generate_text(seed_text, next_words, model, tokenizer, seq_length):
    """Extend ``seed_text`` by greedily appending the model's most likely next word.

    On every step the text generated so far is encoded with ``tokenizer``,
    padded (or cut) to ``seq_length`` ids by ``pad_sequences`` with
    left-padding, and passed to ``model.predict``. The index with the highest
    score is mapped back to a word and appended after a single space.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.
        model: Object whose ``predict(batch, verbose=0)`` returns one score
            vector per input row.
        tokenizer: Fitted tokenizer providing ``word_index`` and
            ``texts_to_sequences``.
        seq_length: Input length the model expects.

    Returns:
        str: ``seed_text`` followed by the generated words. Generation stops
        early if the predicted index has no word in ``tokenizer.word_index``
        (for example the padding index 0).
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])

        output_word = index_to_word.get(predicted)
        if output_word is None:
            # Nothing to append, so the text would stay unchanged and every
            # remaining step would feed the model the same input again.
            break
        seed_text += ' ' + output_word
    return seed_text


In [21]:
# 5. Main execution
if __name__ == "__main__":
    # Settings a reader is likely to tune, named instead of inlined.
    SEED = 42
    EPOCHS = 5
    BATCH_SIZE = 64
    DEFAULT_DATASET_PATH = "/kaggle/input/shakespeare-plays/Shakespeare_data.csv"

    # Seed the Python, NumPy and TensorFlow generators together so weight
    # initialisation and batch shuffling repeat across a fresh run
    # (hardware-level nondeterminism may still cause small differences).
    tf.keras.utils.set_random_seed(SEED)

    print(" Loading and cleaning text...")
    # The Kaggle location stays the default; set the SHAKESPEARE_CSV
    # environment variable to point at the file when running elsewhere.
    dataset_path = os.environ.get("SHAKESPEARE_CSV", DEFAULT_DATASET_PATH)
    raw_text = load_and_clean_text(dataset_path)
    print("Text cleaned. Length:", len(raw_text))

    print("\n Tokenizing...")
    sequence_length = 5
    X, y, tokenizer, total_words = tokenize_text(raw_text, sequence_length)
    print(" Tokenized. X shape:", X.shape, "| y shape:", y.shape)
    print(" Vocabulary size:", total_words)

    print("\n Building model...")
    model = build_model(total_words, sequence_length)

    print("\n Training model...")
    # No validation data is passed to fit(), so early stopping watches the
    # training loss and stops after one epoch without improvement.
    es = EarlyStopping(monitor='loss', patience=1, restore_best_weights=True)
    model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

    # 6. Generate and print text
    seed = "to be or not"
    generated = generate_text(seed, next_words=15, model=model, tokenizer=tokenizer, seq_length=sequence_length)
    print("\n Generated Output:")
    print(f'Seed: "{seed}"')
    print(f'Generated: "{generated}"')


 Loading and cleaning text...
Text cleaned. Length: 4164192

 Tokenizing...
 Tokenized. X shape: (20000, 5) | y shape: (20000, 27366)
 Vocabulary size: 27366

 Building model...

 Training model...




Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.0288 - loss: 9.0491
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.0321 - loss: 6.6044
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.0357 - loss: 6.4808
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.0358 - loss: 6.4328
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.0336 - loss: 6.4025

 Generated Output:
Seed: "to be or not"
Generated: "to be or not the the the the the the the the the the the the the the the"
