<a href="https://colab.research.google.com/github/chidu19/Next-Word-Prediction/blob/main/Next_Word_Prediction_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📘 Next Word Prediction using LSTM

This project is inspired by the LGM Data Science Internship assignment. We build a simplified version of a GPT-like model that predicts the next word given a few input words.

In [22]:
!pip install tensorflow numpy




## 📥 Step 1: Load and Preprocess Text
We start by reading the eBook (`book.txt`) and cleaning it.

In [28]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Your sample or custom text
text = """
Once upon a time there was a king who ruled a vast kingdom.
The people of the kingdom were happy and prosperous.
One day, a stranger arrived in the kingdom.
He brought stories of a distant land full of magic and mystery.
"""

# Clean and split text
text = text.lower().replace("\n", " ")
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1


with open('book.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()

import re
words = re.sub(r"[^a-zA-Z\s]", "", text).split()

print(f"Sample words: {words[:20]}")
print(f"Total words: {len(words)}")

Sample words: ['project', 'gutenbergs', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'by', 'arthur', 'conan', 'doyle', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere']
Total words: 107430


## 📊 Step 2: EDA - Word Frequency & Bigrams
We analyze word frequency and common bigrams.

In [24]:
from collections import Counter
from nltk import bigrams
import nltk
nltk.download('punkt')

word_freq = Counter(words)
print("Most common words:", word_freq.most_common(10))
print("Least common words:", word_freq.most_common()[-10:])

bi_freq = Counter(bigrams(words))
print("\nCommon bigrams:")
for pair, count in bi_freq.most_common(10):
    print(f"{pair} -> {count}")

Most common words: [('the', 5805), ('and', 3070), ('i', 2995), ('of', 2778), ('to', 2762), ('a', 2683), ('in', 1818), ('that', 1750), ('it', 1710), ('you', 1545)]
Least common words: [('michael', 1), ('hart', 1), ('originator', 1), ('network', 1), ('necessarily', 1), ('edition', 1), ('pg', 1), ('includes', 1), ('subscribe', 1), ('newsletter', 1)]

Common bigrams:
('of', 'the') -> 743
('in', 'the') -> 522
('it', 'is') -> 336
('to', 'the') -> 319
('i', 'have') -> 299
('it', 'was') -> 276
('that', 'i') -> 256
('at', 'the') -> 240
('and', 'i') -> 214
('and', 'the') -> 203


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 🧠 Step 3: Prepare Sequences
We create a sliding window of 5 words as input and the 6th word as the label.

In [25]:
SEQUENCE_LENGTH = 5
input_sequences = []
output_words = []

for i in range(len(words) - SEQUENCE_LENGTH):
    input_sequences.append(words[i:i + SEQUENCE_LENGTH])
    output_words.append(words[i + SEQUENCE_LENGTH])

print("Sample input-output pair:")
print(input_sequences[0], "->", output_words[0])

Sample input-output pair:
['project', 'gutenbergs', 'the', 'adventures', 'of'] -> sherlock


## 🔤 Step 4: Tokenization & One-Hot Encoding
We map words to indices and one-hot encode them.

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)

vocab_size = len(tokenizer.word_index) + 1

# Integer encode input sequences
X_int = np.array([[tokenizer.word_index[word] for word in seq] for seq in input_sequences])
y_int = np.array([tokenizer.word_index[word] for word in output_words])

# Pad sequences (though they should already be fixed-length)
X = X_int
y = to_categorical(y_int, num_classes=vocab_size)

print(f"X shape: {X.shape}, y shape: {y.shape}")


X shape: (107425, 5), y shape: (107425, 8601)


## 🏗️ Step 5: Build and Train LSTM Model

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=SEQUENCE_LENGTH),
    LSTM(128),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [31]:
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)


Epoch 1/20
[1m756/756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 68ms/step - accuracy: 0.0503 - loss: 7.0069 - val_accuracy: 0.0643 - val_loss: 6.8449
Epoch 2/20
[1m756/756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 65ms/step - accuracy: 0.0677 - loss: 6.1861 - val_accuracy: 0.0823 - val_loss: 6.6431
Epoch 3/20
[1m756/756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 67ms/step - accuracy: 0.0929 - loss: 5.8396 - val_accuracy: 0.1019 - val_loss: 6.5519
Epoch 4/20
[1m756/756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 67ms/step - accuracy: 0.1156 - loss: 5.5599 - val_accuracy: 0.1109 - val_loss: 6.5544
Epoch 5/20
[1m756/756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 65ms/step - accuracy: 0.1308 - loss: 5.3423 - val_accuracy: 0.1169 - val_loss: 6.5909
Epoch 6/20
[1m756/756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 64ms/step - accuracy: 0.1409 - loss: 5.1424 - val_accuracy: 0.1205 - val_loss: 6.6472
Epoch 7/20
[1m7

<keras.src.callbacks.history.History at 0x791247128290>

## 🔮 Step 6: Predict the Next Word
We build a prediction function.

In [39]:
def predict_next_word(seed_text):
    input_words = seed_text.lower().split()
    if len(input_words) != SEQUENCE_LENGTH:
        raise ValueError(f"Expected {SEQUENCE_LENGTH} words.")

    # Convert words to indices
    input_seq = [word_to_index.get(w, 0) for w in input_words]

    # No one-hot encoding here — just word indices
    input_seq_encoded = np.array([input_seq])  # Shape: (1, SEQUENCE_LENGTH)

    # Predict next word
    prediction = model.predict(input_seq_encoded, verbose=0)

    # Get index of most probable word
    predicted_index = np.argmax(prediction)

    # Convert index back to word
    return index_to_word[predicted_index]

# Example usage
print(predict_next_word("sherlock holmes was very good"))



sidealley
