### 📌 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import string
import re


###  📌 2. Load Dataset

In [2]:
with open('sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as file:
    data = file.read()

### 📌 3. Preprocessing

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[[^]]*\]', '', text)  # remove [text]
    text = re.sub(r'\([^)]*\)', '', text)  # remove (text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\d', '', text)  # remove digits
    text = re.sub(r'\s+', ' ', text)  # normalize whitespace
    return text.strip()

cleaned_data = clean_text(data)


### 📌 4. Tokenization & Sequence Creation

In [4]:
"""tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_data])
word_index = tokenizer.word_index
total_words = len(word_index) + 1  # +1 because indexing starts from 1

# Convert text to sequences
input_sequences = []
tokens = tokenizer.texts_to_sequences([cleaned_data])[0]

# Creating n-gram sequences
for i in range(1, len(tokens)):
    n_gram_sequence = tokens[:i+1]
    input_sequences.append(n_gram_sequence)
"""
from tensorflow.keras.preprocessing.text import Tokenizer

# Ensure cleaned_data is a list of sentences
if isinstance(cleaned_data, str):
    cleaned_data = cleaned_data.split('.')  # Split long text into sentences

# Initialize Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_data)  # Tokenizing correctly

# Extract word index and total words count
word_index = tokenizer.word_index
total_words = len(word_index) + 1  # +1 for padding or unknown token

# Convert text into sequences
tokens = tokenizer.texts_to_sequences(cleaned_data)

# Creating n-gram sequences
input_sequences = []
max_tokens = 5000  # Limit to avoid memory overload

for seq in tokens:
    for i in range(1, min(len(seq), max_tokens)):  # Avoid large memory allocation
        n_gram_sequence = seq[:i+1]  # Build sequence progressively
        input_sequences.append(n_gram_sequence)

# Print sample sequences to check correctness
print("Sample sequences:", input_sequences[:5])
print("Total sequences generated:", len(input_sequences))


Sample sequences: [[1, 1513], [1, 1513, 5], [1, 1513, 5, 128], [1, 1513, 5, 128, 33], [1, 1513, 5, 128, 33, 601]]
Total sequences generated: 4999


### 📌 5. Padding Sequences & Preparing Labels

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_seq_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)


### 📌 6. Model Architecture

In [6]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_seq_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




### ✅ 7. Model Training

In [None]:
history = model.fit(X, y, epochs=20, verbose=1)


Epoch 1/20
[1m 14/157[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m49:35[0m 21s/step - accuracy: 0.0408 - loss: 9.0251

### ✅ 8. Save the Trained Model

In [None]:
# Save the model in HDF5 format
model.save("model2.h5")


### ✅ 9. Create a Text Generation Function

In [None]:
def generate_phrase(seed_text, next_words=20):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=1)[0]

        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                seed_text += " " + word
                break
    return seed_text


### ✅ 10. Example Prediction

In [None]:
# Example input
print(generate_phrase("the case of the"))


### 📊 11. Optional: Visualize Accuracy or Loss



In [None]:
import matplotlib.pyplot as plt

# Plot training accuracy
plt.plot(history.history['accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.grid()
plt.show()

# Plot training loss
plt.plot(history.history['loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.grid()
plt.show()
