# Project: Next-Word Predictor Using an LSTM

In [None]:
!pip install nltk



## 1. Import all the libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

## 2. Data Loading and Preprocessing

In [None]:
# Mount Google Drive so the corpus file is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Location of the raw text corpus on the mounted drive
filename = '/content/drive/MyDrive/aaa.txt'

# Load the entire file into memory as a single string
with open(filename, mode='r', encoding='utf-8') as corpus_file:
    document = corpus_file.read()



# Strip the Project Gutenberg header/footer so only the story text remains.

# Locate where the story starts: try the opening sentence first, then the
# first story title, and fall back to the very beginning of the text.
start_idx = document.find('To Sherlock Holmes she is always')
if start_idx == -1:
    start_idx = document.find('A SCANDAL IN BOHEMIA')
if start_idx == -1:
    start_idx = 0

# Locate where the story ends. The search begins at start_idx so that a marker
# occurring before the story start cannot yield end_idx < start_idx, which
# would silently produce an empty document.
end_idx = document.find('End of the Project Gutenberg', start_idx)
if end_idx == -1:
    end_idx = document.find('END OF THE PROJECT GUTENBERG', start_idx)
if end_idx == -1:
    end_idx = len(document)

# Keep only the story content
document = document[start_idx:end_idx]

print(f"Extracted story: {len(document)} characters")
print(f"Preview: {document[:200]}")



# Drop every comma from the text
document = document.replace(",", "")

# Break the text on periods, trim surrounding whitespace from each piece,
# and discard the pieces that end up empty
input_sentences = list(filter(None, map(str.strip, document.split('.'))))



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracted story: 561606 characters
Preview: To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emot


## 3. Tokenization and Vocabulary Building

In [None]:
# Download the NLTK resources ('punkt' and 'punkt_tab') used for word
# tokenization later in this notebook. The value returned by the last call
# is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Lowercase the whole document and split it into word-level tokens with NLTK.
# `document` has already had its commas removed in the preprocessing cell.
tokens = word_tokenize(document.lower())

In [None]:
# Build the vocabulary: a mapping from each distinct token to an integer id.
# Id 0 is reserved for '<unk>', the fallback used for tokens missing from the vocab.
vocab = {'<unk>': 0}

# Tokens receive ids in order of first appearance. setdefault leaves an
# existing entry untouched, so no separate Counter/deduplication pass is needed.
for token in tokens:
  vocab.setdefault(token, len(vocab))

# Show only the first few entries; displaying the whole dict floods the output.
dict(list(vocab.items())[:10])

{'<unk>': 0,
 'to': 1,
 'sherlock': 2,
 'holmes': 3,
 'she': 4,
 'is': 5,
 'always': 6,
 '_the_': 7,
 'woman': 8,
 '.': 9,
 'i': 10,
 'have': 11,
 'seldom': 12,
 'heard': 13,
 'him': 14,
 'mention': 15,
 'her': 16,
 'under': 17,
 'any': 18,
 'other': 19,
 'name': 20,
 'in': 21,
 'his': 22,
 'eyes': 23,
 'eclipses': 24,
 'and': 25,
 'predominates': 26,
 'the': 27,
 'whole': 28,
 'of': 29,
 'sex': 30,
 'it': 31,
 'was': 32,
 'not': 33,
 'that': 34,
 'he': 35,
 'felt': 36,
 'emotion': 37,
 'akin': 38,
 'love': 39,
 'for': 40,
 'irene': 41,
 'adler': 42,
 'all': 43,
 'emotions': 44,
 'one': 45,
 'particularly': 46,
 'were': 47,
 'abhorrent': 48,
 'cold': 49,
 'precise': 50,
 'but': 51,
 'admirably': 52,
 'balanced': 53,
 'mind': 54,
 'take': 55,
 'most': 56,
 'perfect': 57,
 'reasoning': 58,
 'observing': 59,
 'machine': 60,
 'world': 61,
 'has': 62,
 'seen': 63,
 'as': 64,
 'a': 65,
 'lover': 66,
 'would': 67,
 'placed': 68,
 'himself': 69,
 'false': 70,
 'position': 71,
 'never': 72,
 's

In [None]:
len(vocab)

9058

In [None]:
# Re-split the text into one entry per line. This overwrites the period-based
# `input_sentences` built in the preprocessing cell, so the sequences created
# below are per line of the file rather than per sentence. Blank lines become
# empty strings here.
# NOTE(review): confirm that the line-based split (not the sentence-based one)
# is the intended unit for training sequences.
input_sentences = document.split('\n')

## 4. Convert Text to Numerical Sequences

In [None]:
def text_to_indices(sentence, vocab):
  """Map each token in `sentence` to its integer id in `vocab`.

  Tokens that are not keys of `vocab` are mapped to the id stored under '<unk>'.

  Args:
    sentence: iterable of tokens.
    vocab: dict mapping token -> integer id; needs an '<unk>' entry whenever
      a token may be missing.

  Returns:
    List of integer ids, one per token, in the original order.
  """
  return [
    vocab[token] if token in vocab else vocab['<unk>']
    for token in sentence
  ]


In [None]:
# Lowercase and tokenize every entry of input_sentences, then map its tokens
# to vocabulary ids
input_numerical_sentences = [
  text_to_indices(word_tokenize(line.lower()), vocab)
  for line in input_sentences
]

print(f"Total numerical sentences: {len(input_numerical_sentences)}")
print(f"Sample: {input_numerical_sentences[0][:10]}")

Total numerical sentences: 11880
Sample: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
len(input_numerical_sentences)

11880

## 5. Create Training Sequences

In [None]:
# Build the training examples: every prefix of length >= 2 of each sentence.
# Within a prefix, the last id is the word to predict and the ids before it
# form the input context.
training_sequence = [
  sentence[:prefix_len]
  for sentence in input_numerical_sentences
  for prefix_len in range(2, len(sentence) + 1)
]

In [None]:
len(training_sequence)

109352

In [None]:
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [None]:
# Length of every training sequence; the largest one is the padding target
len_list = [len(sequence) for sequence in training_sequence]

max_len_list = max(len_list)
max_len_list

26

In [None]:
# Left-pad each sequence with zeros so that all of them have length max_len_list
padded_training_sequence = [
  [0] * (max_len_list - len(sequence)) + sequence
  for sequence in training_sequence
]

In [None]:
len(padded_training_sequence[10])

26

In [None]:
# Convert the padded id lists to a PyTorch LongTensor.
# torch.as_tensor yields the same tensor as torch.tensor for a list, but if
# this cell is re-run (when the name already holds a LongTensor) it returns
# the existing tensor instead of copying it and emitting a copy-construct warning.
padded_training_sequence = torch.as_tensor(padded_training_sequence, dtype=torch.long)

In [None]:
padded_training_sequence

tensor([[   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        [   0,    0,    0,  ...,    2,    3,    4],
        ...,
        [   0,    0,    0,  ..., 2394,   77, 2187],
        [   0,    0,    0,  ...,   77, 2187, 1207],
        [   0,    0,    0,  ..., 2187, 1207,    9]])

## 6. Prepare Training Data


In [None]:
# Inputs are every column except the last; the target is the last column
# (the id of the word that follows the context).
X, y = padded_training_sequence[:, :-1], padded_training_sequence[:, -1]

In [None]:
X

tensor([[   0,    0,    0,  ...,    0,    0,    1],
        [   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        ...,
        [   0,    0,    0,  ...,   62, 2394,   77],
        [   0,    0,    0,  ..., 2394,   77, 2187],
        [   0,    0,    0,  ...,   77, 2187, 1207]])

In [None]:
y

tensor([   2,    3,    4,  ..., 2187, 1207,    9])

In [None]:
# Dataset wrapper pairing each input sequence with its target

class CustomDataset(Dataset):
  """Dataset of (input, target) pairs backed by two containers indexed in parallel."""

  def __init__(self, X, y):
    """Store the inputs `X` and the targets `y`."""
    self.X, self.y = X, y

  def __len__(self):
    """Number of samples, taken from the first dimension of `X`."""
    n_samples = self.X.shape[0]
    return n_samples

  def __getitem__(self, idx):
    """Return the `(X[idx], y[idx])` pair for position `idx`."""
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [None]:
dataset = CustomDataset(X,y)

In [None]:
len(dataset)

109352

In [None]:
# Batch the dataset for training: 32 samples per batch, in shuffled order
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

## 7. Build LSTM model

In [None]:
class LSTMModel(nn.Module):
  """Next-word predictor: embedding -> single-layer LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct token ids; sets both the embedding table
      size and the number of output logits.
    embedding_dim: size of each token embedding (default 100).
    hidden_dim: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return next-word logits for a batch of token-id sequences.

    Args:
      x: integer tensor of token ids, batch dimension first
        (the LSTM is built with batch_first=True).

    Returns:
      Tensor of shape (batch, vocab_size) with unnormalized scores.
    """
    embedded = self.embedding(x)
    # Only the final hidden state is used; the per-step outputs and the
    # final cell state are discarded.
    _, (final_hidden_state, _) = self.lstm(embedded)
    # final_hidden_state is (num_layers, batch, hidden_dim); index the last
    # (here: only) layer to get (batch, hidden_dim).
    output = self.fc(final_hidden_state[-1])
    return output

### initialize model

In [None]:
# create model instance
model = LSTMModel(len(vocab))

In [None]:
# set device if GPU available otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# move model to device
model.to(device)

LSTMModel(
  (embedding): Embedding(9058, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=9058, bias=True)
)

## 8. Training setup

In [None]:
# Training hyperparameters
epochs = 60
learning_rate = 0.001

# Loss function: cross-entropy between the model's logits and the target word ids
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop

# Put the model in training mode explicitly: the evaluation code in this
# notebook switches it to eval mode, which would otherwise persist if this
# cell is run again afterwards.
model.train()

for epoch in range(epochs):
  total_loss = 0

  for batch_x, batch_y in dataloader:

    # move the batch to the same device as the model
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass: logits over the vocabulary for each sequence in the batch
    output = model(batch_x)

    # loss against the true next-word ids
    loss = criterion(output, batch_y)

    # backward pass
    loss.backward()

    # update weights
    optimizer.step()

    # accumulate the per-batch loss
    total_loss += loss.item()

  # The reported value is the sum of the batch losses over the epoch, not the mean
  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 16561.3758
Epoch: 2, Loss: 14949.4622
Epoch: 3, Loss: 13606.5530
Epoch: 4, Loss: 12431.0909
Epoch: 5, Loss: 11386.0895
Epoch: 6, Loss: 10445.5018
Epoch: 7, Loss: 9622.3985
Epoch: 8, Loss: 8884.0947
Epoch: 9, Loss: 8230.8399
Epoch: 10, Loss: 7644.6325
Epoch: 11, Loss: 7121.8116
Epoch: 12, Loss: 6654.6276
Epoch: 13, Loss: 6236.3300
Epoch: 14, Loss: 5859.8556
Epoch: 15, Loss: 5517.4938
Epoch: 16, Loss: 5212.4027
Epoch: 17, Loss: 4944.0445
Epoch: 18, Loss: 4694.1981
Epoch: 19, Loss: 4475.1006
Epoch: 20, Loss: 4276.3810
Epoch: 21, Loss: 4087.3041
Epoch: 22, Loss: 3929.7430
Epoch: 23, Loss: 3774.2124
Epoch: 24, Loss: 3646.7703
Epoch: 25, Loss: 3514.0105
Epoch: 26, Loss: 3406.1459
Epoch: 27, Loss: 3303.6108
Epoch: 28, Loss: 3200.4137
Epoch: 29, Loss: 3123.3815
Epoch: 30, Loss: 3036.5681
Epoch: 31, Loss: 2955.8705
Epoch: 32, Loss: 2897.9081
Epoch: 33, Loss: 2837.2712
Epoch: 34, Loss: 2772.1834
Epoch: 35, Loss: 2731.9357
Epoch: 36, Loss: 2672.3260
Epoch: 37, Loss: 2620.8913
Epoc

## 9. Text Generation

In [None]:

def prediction(model, vocab, text, max_seq_length):
    """Predict the next word for `text` and return `text` with that word appended.

    Args:
        model: model mapping a (1, seq_len) LongTensor of token ids to scores
            over the vocabulary.
        vocab: dict mapping token -> integer id; needs an '<unk>' entry for
            tokens that are not in the vocabulary.
        text: prompt string.
        max_seq_length: length the id sequence is left-padded (or truncated) to.

    Returns:
        `text + " " + predicted_word`.
    """
    # tokenize
    tokenized_text = word_tokenize(text.lower())

    # text -> numerical indices
    numerical_text = text_to_indices(tokenized_text, vocab)

    # Keep only the most recent max_seq_length ids so the model input never
    # exceeds the requested length (previously long prompts were passed through
    # unpadded and untruncated).
    if len(numerical_text) > max_seq_length:
        numerical_text = numerical_text[len(numerical_text) - max_seq_length:]

    # left-pad with zeros up to max_seq_length and add a batch dimension
    padded_text = torch.tensor(
        [0] * (max_seq_length - len(numerical_text)) + numerical_text,
        dtype=torch.long
    ).unsqueeze(0)

    # send padded_text to the same device as the model
    device = next(model.parameters()).device
    padded_text = padded_text.to(device)

    # inference only: no gradients are needed, so skip autograd bookkeeping
    with torch.no_grad():
        output = model(padded_text)

    # index of the highest-scoring vocabulary entry
    _, index = torch.max(output, dim=1)

    # convert index back to its word
    index_to_word = {idx: word for word, idx in vocab.items()}

    predicted_word = index_to_word[index.item()]
    return text + " " + predicted_word


In [None]:
# single prediction
prediction(model, vocab, "I had seen little of", max_len_list)

'I had seen little of holmes'

In [None]:
prediction(model, vocab, "remained in our lodgings", max_len_list)

'remained in our lodgings in'

In [None]:
# generate multiple words

import time

num_tokens = 10
# No trailing space in the prompt: prediction() already inserts a single space
# before each appended word, so a trailing space produces a double space.
input_text = "To Sherlock Holmes"

for i in range(num_tokens):
  # feed the text generated so far back in to get the next word
  output_text = prediction(model, vocab, input_text, max_len_list)
  print(output_text)
  input_text = output_text
  # short pause between printed steps
  time.sleep(0.5)


To Sherlock Holmes  she
To Sherlock Holmes  she is
To Sherlock Holmes  she is always
To Sherlock Holmes  she is always _the_
To Sherlock Holmes  she is always _the_ woman
To Sherlock Holmes  she is always _the_ woman .
To Sherlock Holmes  she is always _the_ woman . i
To Sherlock Holmes  she is always _the_ woman . i have
To Sherlock Holmes  she is always _the_ woman . i have seldom
To Sherlock Holmes  she is always _the_ woman . i have seldom heard


In [None]:
# Unshuffled loader over the same dataset (fixed batch order), for evaluation
dataloader1 = DataLoader(dataset=dataset, batch_size=32, shuffle=False)

## 10. Model Evaluation

In [None]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the percentage of samples whose top-scoring prediction equals the label.

    Args:
        model: module mapping a batch of inputs to per-class scores with the
            class dimension at index 1.
        dataloader: iterable yielding (batch_x, batch_y) tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float percentage; 0.0 if the dataloader yields no samples.
    """
    model.eval()  # set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        # iterate the loader supplied by the caller, not a module-level global
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # get model predictions
            outputs = model(batch_x)

            # get the predicted class indices
            _, predicted = torch.max(outputs, dim=1)

            # compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # avoid ZeroDivisionError when no samples were seen
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy over the unshuffled evaluation loader. It wraps the same
# dataset the model was trained on, so this is training-set accuracy.
accuracy = calculate_accuracy(model, dataloader1, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 86.95%
