# Title:  Next Word Prediction with Sherlock Holmes Text using LSTM

# Author:  Abd Ur Rehman

# Date:  12_01_2024

# Project Purpose
## Objective:
### Develop a Next Word Prediction model using Natural Language Processing (NLP) techniques, specifically Long Short-Term Memory (LSTM) networks, to anticipate the next word in sentences from the Sherlock Holmes text dataset.

## Dataset:

### The dataset consists of plain-text versions of the renowned Sherlock Holmes stories by Sir Arthur Conan Doyle. This collection provides a rich source of textual data for training the model.

## Key Tasks:

### 1. Data Preprocessing: Clean and tokenize the text, create sequences of words.
### 2. Model Building: Construct an LSTM-based neural network for sequence prediction.
### 3. Model Training: Train the model on the processed dataset to learn language patterns.
### 4. Evaluation: Assess the model's accuracy in predicting the next word.
### 5. Application: Implement a prediction function and a Flask web application for interactive use.

## Expected Outcome:
### By the end of this project, we aim to have a functional Next Word Prediction model capable of suggesting the most probable word following a given sequence of words from the Sherlock Holmes stories. This model can serve as a foundation for applications such as predictive text input and content generation.

# Data Preprocessing

In [None]:
import tensorflow as tf

# Reset the global state Keras has accumulated so far (e.g. from an earlier
# run of this notebook) before anything new is built.
tf.keras.backend.clear_session()


In [None]:
# # Import necessary libraries
# import numpy as np
# import nltk
# import re
# import tensorflow as tf
# import matplotlib.pyplot as plt
# from keras.preprocessing.text import Tokenizer
# from keras.utils import to_categorical
# from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Import necessary libraries
import numpy as np
import nltk
import re
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Enable GPU memory growth so TensorFlow allocates GPU memory on demand
# instead of reserving the whole device up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Raised when memory growth is changed after the GPUs have already
        # been initialized; report it and carry on with the default behavior.
        print(e)

# Reset Keras' global state before any model is built
tf.keras.backend.clear_session()

# Use mixed precision only when a GPU is present. `tf.test.is_gpu_available()`
# is deprecated, so reuse the device list queried above instead.
if gpus:
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Your code continues here

In [None]:
# Download the NLTK tokenizer data (uncomment on first run; word_tokenize
# needs it)
#nltk.download('punkt')

# Load the dataset. The encoding is given explicitly so the result does not
# depend on the platform default; undecodable bytes are dropped, which is safe
# here because every non a-z character is stripped right below anyway.
with open('sherlock-holm.es_stories_plain-text_advs.txt', 'r',
          encoding='utf-8', errors='ignore') as file:
    text = file.read()

# Preprocess the text
text = text.lower()  # Convert to lowercase
text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuations and non-alphabetic characters

# Tokenize the text
tokens = nltk.word_tokenize(text)

# Create sequences of words: each sequence is a sliding window of
# `sequence_length` consecutive tokens (the first sequence_length-1 words are
# the model input, the last word is the prediction target).
sequence_length = 5

if len(tokens) < sequence_length:
    raise ValueError(
        f'The corpus has only {len(tokens)} tokens; at least '
        f'{sequence_length} are needed to build one training sequence.'
    )

# The upper bound is len(tokens) + 1 so that the final window, which ends on
# the very last token, is included as well.
sequences = [
    ' '.join(tokens[i - sequence_length:i])
    for i in range(sequence_length, len(tokens) + 1)
]

# Map every word to an integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)

# Convert to numpy array (one row per window, one column per word)
sequences = np.array(sequences)

# Split into input (X) and output (y)
X = sequences[:, :-1]
y = sequences[:, -1]

# One-hot encode the output words. Word ids start at 1, so one extra class is
# reserved for id 0 (used for padding).
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)

# Pad the input sequences
X = pad_sequences(X, maxlen=sequence_length-1, padding='pre')

In [None]:
# Sanity check: show the dimensions of the input and target arrays
print(
    f'Shape of X: {X.shape}',
    f'Shape of y: {y.shape}',
    sep='\n',
)

# Model Building

In [None]:
# Import from tensorflow.keras, like the rest of this notebook, so the layers
# come from the same Keras implementation that the preprocessing utilities and
# the global mixed-precision policy belong to.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define a simpler model for debugging:
# embedding -> two stacked bidirectional LSTMs with dropout -> softmax over
# the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=sequence_length-1))
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(50)))
model.add(Dropout(0.2))
# Keep the output layer in float32: when the global policy is mixed_float16, a
# float16 softmax over a large vocabulary can be numerically unstable. With the
# default float32 policy this argument changes nothing.
model.add(Dense(vocab_size, activation='softmax', dtype='float32'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])


In [None]:
# from keras.models import Sequential
# from keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
# from keras.optimizers import Adam

# # Define the model
# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=sequence_length-1))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(150)))
# model.add(Dropout(0.2))
# model.add(Dense(vocab_size, activation='softmax'))

# # Compile the model
# model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])


# Model Training

In [None]:
# Training hyper-parameters
epochs = 100
batch_size = 32

# Fit the model, holding out 20% of the samples for validation; the returned
# history is kept for later inspection.
history = model.fit(
    x=X,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)


# Evaluation

In [None]:
# Score the trained model on X and y and report the resulting loss/accuracy.
# NOTE(review): X and y are the complete dataset passed to model.fit, so these
# numbers include samples the model was trained on.
loss, accuracy = model.evaluate(X, y)
print(
    f'Loss: {loss}',
    f'Accuracy: {accuracy}',
    sep='\n',
)

# Plotting Accuracy and Loss

In [None]:
# Plotting the accuracy and loss
# Retrieve the per-epoch training and validation metrics recorded by model.fit
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Build the x-axis from the number of epochs actually recorded rather than the
# requested `epochs`, so the plot still works when the history is shorter
# (e.g. training was interrupted or stopped early).
epochs_range = range(len(accuracy))

plt.figure(figsize=(14, 6))

# One panel per data split: training on the left, validation on the right.
panels = (
    ('Training', accuracy, loss),
    ('Validation', val_accuracy, val_loss),
)
for position, (split_name, acc_values, loss_values) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, acc_values, label=f'{split_name} Accuracy')
    plt.plot(epochs_range, loss_values, label=f'{split_name} Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy/Loss')
    plt.legend(loc='best')
    plt.title(f'{split_name} Accuracy and Loss')

plt.show()

# Prediction Function

In [None]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text_seq, seq_length):
    """Predict the word most likely to follow ``text_seq``.

    The input is cleaned the same way as the training text (lower-cased, every
    character other than a-z and whitespace removed), tokenized, mapped to
    word ids and pre-padded/truncated to ``seq_length - 1`` ids before being
    passed to the model.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            vocabulary id for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` mapping.
        text_seq: Input text to continue.
        seq_length: Length of the training windows; the model input is
            ``seq_length - 1`` ids long.

    Returns:
        The predicted next word as a string.
    """
    text_seq = text_seq.lower()
    text_seq = re.sub(r'[^a-z\s]', '', text_seq)  # Preprocess input text
    tokens = nltk.word_tokenize(text_seq)
    encoded = tokenizer.texts_to_sequences([tokens])
    encoded = pad_sequences(encoded, maxlen=seq_length-1, padding='pre')

    pred_prob = model.predict(encoded, verbose=0)

    # Id 0 is the padding id and has no entry in ``index_word``, so looking it
    # up would raise KeyError. Pick the most probable id among the real words
    # (ids 1 and up) instead.
    word_probs = np.asarray(pred_prob)[0]
    pred_index = int(np.argmax(word_probs[1:])) + 1
    pred_word = tokenizer.index_word[pred_index]

    return pred_word

# Example usage: ask the trained model for the word following a short prompt
input_text = "sherlock holmes is"
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text_seq=input_text,
    seq_length=sequence_length,
)
print(f'Next word prediction: {next_word}')
