In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary

import nltk
from nltk.tokenize import word_tokenize

# Seed PyTorch's global random number generator so that runs executed
# top to bottom from a fresh kernel produce deterministic output.
torch.manual_seed(123)

# Put the parent directory on the module search path.
# NOTE(review): no project-local import is visible in this notebook — confirm this is still needed.
sys.path.append(os.path.abspath(".."))

### Loading the Dataset

In [2]:
# Read the whole corpus into memory as one string.
document = ""
with open("../datasets/word_prediction_dataset.txt", mode="r", encoding="utf-8") as corpus_file:
    document = corpus_file.read()

### Tokenize the dataset

In [3]:
# Download the NLTK 'punkt' and 'punkt_tab' packages before tokenizing.
# NOTE(review): presumably these are the models word_tokenize depends on — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nova\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Nova\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Lower-case the full corpus before tokenizing so that "The" and "the"
# end up as the same token.
all_tokens = word_tokenize(document.lower())
print(f"Total Tokens: {len(all_tokens)}")
all_tokens[:20]

Total Tokens: 1018


['about',
 'the',
 'program',
 'what',
 'is',
 'the',
 'course',
 'fee',
 'for',
 'data',
 'science',
 'mentorship',
 'program',
 '(',
 'dsmp',
 '2023',
 ')',
 'the',
 'course',
 'follows']

### Build Vocabulary

In [5]:
# Index 0 is reserved for '<unk>' (out-of-vocabulary tokens). Every distinct
# corpus token then gets the next free index, in order of first appearance.
# Note that the padding step later in this notebook also uses the value 0.
vocab = {'<unk>': 0}

for token in dict.fromkeys(all_tokens):
  vocab.setdefault(token, len(vocab))

print(f"Vocab length: {len(vocab)}")
list(vocab.items())[:20]

Vocab length: 289


[('<unk>', 0),
 ('about', 1),
 ('the', 2),
 ('program', 3),
 ('what', 4),
 ('is', 5),
 ('course', 6),
 ('fee', 7),
 ('for', 8),
 ('data', 9),
 ('science', 10),
 ('mentorship', 11),
 ('(', 12),
 ('dsmp', 13),
 ('2023', 14),
 (')', 15),
 ('follows', 16),
 ('a', 17),
 ('monthly', 18),
 ('subscription', 19)]

### Convert Text to Numerical Sequence

In [6]:
def text_to_indices(sentence, vocab):
  """Translate a sequence of tokens into their vocabulary indices.

  Args:
    sentence: iterable of tokens.
    vocab: mapping from token to integer index; its '<unk>' entry is used
      for any token that is not a key of the mapping.

  Returns:
    A list with one index per token, in the same order as ``sentence``.
  """
  return [
    vocab[token] if token in vocab else vocab['<unk>']
    for token in sentence
  ]

In [7]:
# Treat every line of the corpus as one sentence, tokenize it the same way
# as the full document (lower-cased), and convert it to vocabulary indices.
input_sentences = document.split('\n')
numerical_sequences = [
  text_to_indices(word_tokenize(sentence.lower()), vocab)
  for sentence in input_sentences
]

print(f"Sentence count: {len(input_sentences)}")
print(f"Numerical sequence count: {len(numerical_sequences)}")

Sentence count: 78
Numerical sequence count: 78


### Generate Training Sequences

In [8]:
# Expand every sentence into all of its prefixes of length >= 2.
# The last element of each prefix is later used as the label, the rest as context.
# Sentences with fewer than two tokens contribute nothing.
training_sequences = [
  sequence[:prefix_end]
  for sequence in numerical_sequences
  for prefix_end in range(2, len(sequence) + 1)
]

print(f"Training sequence count: {len(training_sequences)}")
training_sequences[:20]

Training sequence count: 942


[[1, 2],
 [1, 2, 3],
 [4, 5],
 [4, 5, 2],
 [4, 5, 2, 6],
 [4, 5, 2, 6, 7],
 [4, 5, 2, 6, 7, 8],
 [4, 5, 2, 6, 7, 8, 9],
 [4, 5, 2, 6, 7, 8, 9, 10],
 [4, 5, 2, 6, 7, 8, 9, 10, 11],
 [4, 5, 2, 6, 7, 8, 9, 10, 11, 3],
 [4, 5, 2, 6, 7, 8, 9, 10, 11, 3, 12],
 [4, 5, 2, 6, 7, 8, 9, 10, 11, 3, 12, 13],
 [4, 5, 2, 6, 7, 8, 9, 10, 11, 3, 12, 13, 14],
 [4, 5, 2, 6, 7, 8, 9, 10, 11, 3, 12, 13, 14, 15],
 [2, 6],
 [2, 6, 16],
 [2, 6, 16, 17],
 [2, 6, 16, 17, 18],
 [2, 6, 16, 17, 18, 19]]

### Padding Training Sequences

In [9]:
seq_lengths = [len(sequence) for sequence in training_sequences]

max_seq_length = max(seq_lengths)
print(f"Max sequence length: {max_seq_length}")

# Left-pad with zeros so every sequence has length max_seq_length and the
# label (last element) stays in the final position.
padded_training_sequence = [
  [0] * (max_seq_length - len(sequence)) + sequence
  for sequence in training_sequences
]

print(f"Padded Training Sequence length: {len(padded_training_sequence[10])}")

Max sequence length: 62
Padded Training Sequence length: 62


### Split the Features and Labels

In [10]:
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)
print(f"Padded Training Sequence shape: {padded_training_sequence.shape}")

# All columns except the last form the (left-padded) context;
# the last column is the token to predict.
X, y = padded_training_sequence[:, :-1], padded_training_sequence[:, -1]
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Padded Training Sequence shape: torch.Size([942, 62])
X shape: torch.Size([942, 61])
y shape: torch.Size([942])


In [11]:
X

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   0,   0,   4],
        ...,
        [  0,   0,   0,  ...,   0, 285, 176],
        [  0,   0,   0,  ..., 285, 176, 286],
        [  0,   0,   0,  ..., 176, 286, 287]])

In [12]:
y[:10]

tensor([ 2,  3,  5,  2,  6,  7,  8,  9, 10, 11])

### Defining the DataLoader

In [13]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
class MyDataset(Dataset):
  """Dataset of (features, label) pairs backed by two tensors.

  Both tensors are moved to the module-level ``device`` once at construction,
  so every item returned by indexing already lives on that device.
  """

  def __init__(self, X, y):
    self.X, self.y = X.to(device), y.to(device)

  def __len__(self):
    # Number of samples is the size of the first dimension of X.
    return self.X.size(0)

  def __getitem__(self, idx):
    features = self.X[idx]
    label = self.y[idx]
    return features, label

In [15]:
# Wrap the feature/label tensors and serve them in shuffled mini-batches of 32.
train_dataset = MyDataset(X, y)
train_dataloader = DataLoader(
  dataset=train_dataset,
  batch_size=32,
  shuffle=True,
)

### Test the DataLoader

In [30]:
# Peek at the first two mini-batches only; zip stops once range(2) is
# exhausted, so no further batch is pulled from the loader.
for idx, (batch_x, batch_y) in zip(range(2), train_dataloader):
  print(idx, batch_x, batch_y)

0 tensor([[  0,   0,   0,  ...,   5,   2, 254],
        [  0,   0,   0,  ..., 185, 186,  78],
        [  0,   0,   0,  ...,  65,  44, 163],
        ...,
        [  0,   0,   0,  ...,  85,  86,  87],
        [  0,   0,   0,  ..., 225, 200, 223],
        [  0,   0,   0,  ...,  30,  36,   2]], device='cuda:0') tensor([ 24, 187,  22,   2, 189,  53,  76,  93,  30, 206,  22,  36,  89,  15,
          7,  52,  81,  33,  27,   8,  74, 252,  24,   2, 149,  65, 164, 117,
         17,  17, 190,  31], device='cuda:0')
1 tensor([[  0,   0,   0,  ...,   0,   0,  22],
        [  0,   0,   0,  ...,   0, 213, 214],
        [  0,   0,   0,  ..., 262, 252, 253],
        ...,
        [  0,   0,   0,  ...,  78,   4, 208],
        [  0,   0,   0,  ...,  23,  24, 226],
        [  0,   0,   0,  ..., 164, 165, 166]], device='cuda:0') tensor([ 23,  65,   5,  15, 128, 257,  27, 166,  17,  94, 268,  24, 135,   3,
        110,  22,  80, 176,  94,  90,  30,  33,  78, 207,  15,  30,  28,  86,
         12,  86,   2, 1

### Design the Model

In [16]:
class LSTM(nn.Module):
  """Next-token classifier: embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct token indices; used both as the number of
      embedding rows and as the number of output logits.
    embedding_dim: size of each token embedding (default 100).
    hidden_size: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_size=150):
    super().__init__()
    # Layers are created in the same order as before so that, for a given
    # seed, the default configuration is initialised identically.
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, x):
    """Return one row of vocabulary logits per input sequence.

    Args:
      x: integer tensor of token indices, batch dimension first.

    Returns:
      Tensor of logits whose last dimension has size ``vocab_size``.
    """
    embedded = self.embedding(x)
    _, (final_hidden_state, _) = self.lstm(embedded)
    # The leading dimension of the final hidden state indexes LSTM layers.
    # Taking the last entry selects the top layer's state and, for this
    # single-layer LSTM, is equivalent to dropping that dimension.
    logits = self.fc(final_hidden_state[-1])
    return logits

In [26]:
# Test the layers
emb_layer = nn.Embedding(289, embedding_dim=100)
lstm_layer = nn.LSTM(100, 150, batch_first=True)
fc_layer = nn.Linear(150, 289)

input_sequence = train_dataset[0][0].reshape(1,-1).to('cpu')
print("input_sequence shape\t:", input_sequence.shape)

emb_layer_output = emb_layer(input_sequence)
print("embedding_output shape\t:", emb_layer_output.shape)

hidden_states, (final_hidden_state, final_cell_state)= lstm_layer(emb_layer_output)
print("lstm_hidden_states shape\t:", hidden_states.shape)
print("lstm_final_hidden_state shape\t:", final_hidden_state.shape)
print("lstm_final_cell_state shape\t:", final_cell_state.shape)

fc_layer_output = fc_layer(final_hidden_state.squeeze(0))
print("fully_connected_output shape\t:", fc_layer_output.shape)

input_sequence shape	: torch.Size([1, 61])
embedding_output shape	: torch.Size([1, 61, 100])
lstm_hidden_states shape	: torch.Size([1, 61, 150])
lstm_final_hidden_state shape	: torch.Size([1, 1, 150])
lstm_final_cell_state shape	: torch.Size([1, 1, 150])
fully_connected_output shape	: torch.Size([1, 289])


In [18]:
# Hyper-parameters
epochs = 50
learning_rate = 1e-3

# Model, loss and optimiser; the model is sized from the vocabulary and
# placed on the selected device.
model = LSTM(vocab_size=len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
summary(model)

Layer (type:depth-idx)                   Param #
LSTM                                     --
├─Embedding: 1-1                         28,900
├─LSTM: 1-2                              151,200
├─Linear: 1-3                            43,639
Total params: 223,739
Trainable params: 223,739
Non-trainable params: 0

### Train the Model

In [19]:
for epoch in range(epochs):
  model.train()
  # Running sum of the per-batch loss values for this epoch.
  epoch_loss = 0
  for batch_x, batch_y in train_dataloader:
    # forward pass
    loss = criterion(model(batch_x), batch_y)

    # backward pass: clear stale gradients, back-propagate, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

  print(f'Epoch: {epoch+1:03d}/{epochs:03d} | epoch_loss: {epoch_loss:.2f}')

Epoch: 001/050 | epoch_loss: 166.40
Epoch: 002/050 | epoch_loss: 147.15
Epoch: 003/050 | epoch_loss: 134.64
Epoch: 004/050 | epoch_loss: 122.12
Epoch: 005/050 | epoch_loss: 110.37
Epoch: 006/050 | epoch_loss: 99.04
Epoch: 007/050 | epoch_loss: 88.17
Epoch: 008/050 | epoch_loss: 78.31
Epoch: 009/050 | epoch_loss: 69.52
Epoch: 010/050 | epoch_loss: 60.89
Epoch: 011/050 | epoch_loss: 53.83
Epoch: 012/050 | epoch_loss: 47.65
Epoch: 013/050 | epoch_loss: 41.15
Epoch: 014/050 | epoch_loss: 36.38
Epoch: 015/050 | epoch_loss: 31.97
Epoch: 016/050 | epoch_loss: 27.88
Epoch: 017/050 | epoch_loss: 24.95
Epoch: 018/050 | epoch_loss: 22.00
Epoch: 019/050 | epoch_loss: 19.74
Epoch: 020/050 | epoch_loss: 17.42
Epoch: 021/050 | epoch_loss: 16.04
Epoch: 022/050 | epoch_loss: 14.47
Epoch: 023/050 | epoch_loss: 13.04
Epoch: 024/050 | epoch_loss: 12.02
Epoch: 025/050 | epoch_loss: 11.07
Epoch: 026/050 | epoch_loss: 10.22
Epoch: 027/050 | epoch_loss: 9.50
Epoch: 028/050 | epoch_loss: 8.95
Epoch: 029/050 | 

### Make Prediction

In [20]:
def predict(model, vocab, input_text):
  """Predict the token most likely to follow ``input_text``.

  The text is lower-cased, tokenized, mapped to vocabulary indices and
  left-padded with zeros to ``max_seq_length - 1`` tokens before being
  passed to the model on the module-level ``device``.

  Args:
    model: module that maps a (1, sequence_length) index tensor to a
      (1, vocab_size) tensor of logits.
    vocab: mapping from token to integer index.
    input_text: the prompt to continue.

  Returns:
    Tuple ``(token, logit)``: the highest-scoring token and its raw,
    unnormalised score. The score is a logit, not a probability.
  """
  tokenized_text = word_tokenize(input_text.lower())
  numerical_text = text_to_indices(tokenized_text, vocab)
  # A prompt longer than the training context needs no padding. Clamp at zero
  # so that case is explicit rather than relying on `[0] * negative == []`.
  padding_length = max(0, max_seq_length - 1 - len(numerical_text))
  padded_text = torch.tensor([0]*padding_length + numerical_text, dtype=torch.long).unsqueeze(0)
  # Inference only: skip autograd bookkeeping.
  with torch.inference_mode():
    logits = model(padded_text.to(device))
    logit, index = torch.max(logits, dim=1)
  # Look the token up by its stored index instead of by position in the dict,
  # so the result does not depend on insertion order matching the indices.
  index_to_token = {token_index: token for token, token_index in vocab.items()}
  return index_to_token[index.item()], logit.item()

In [21]:
# predict() returns the winning token together with its raw logit.
# The score is unnormalised, so it is labelled as a logit rather than a confidence.
prediction, logit = predict(model, vocab, "The course follows a monthly")
print(f"Logit: {logit:.2f}")
print(f"Answer: {prediction}")

Confidence: 9.85
Answer: subscription


In [22]:
num_tokens = 12
input_prompt = "The course follows a monthly"

# Repeatedly append the highest-scoring next token to the prompt and feed the
# extended prompt back into the model.
for _step in range(num_tokens):
  next_word = predict(model, vocab, input_prompt)[0]
  input_prompt = f"{input_prompt} {next_word}"
  print(input_prompt)

The course follows a monthly subscription
The course follows a monthly subscription model
The course follows a monthly subscription model where
The course follows a monthly subscription model where you
The course follows a monthly subscription model where you have
The course follows a monthly subscription model where you have to
The course follows a monthly subscription model where you have to make
The course follows a monthly subscription model where you have to make monthly
The course follows a monthly subscription model where you have to make monthly payments
The course follows a monthly subscription model where you have to make monthly payments of
The course follows a monthly subscription model where you have to make monthly payments of rs
The course follows a monthly subscription model where you have to make monthly payments of rs 799/month


### Evaluate the Model

In [23]:
def calculate_accuracy(model, dataloader):
    """Return the fraction of samples whose highest-scoring class equals the label.

    The model is switched to evaluation mode and left in that mode; batches
    are moved to the module-level ``device`` before the forward pass.

    Args:
        model: module mapping a batch of inputs to per-class logits.
        dataloader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the dataloader yields no samples.
    """
    correct = 0
    total = 0
    model.eval()
    with torch.inference_mode():
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            # The model outputs raw logits; the arg-max over classes is the prediction.
            logits = model(batch_x)
            _, predicted = torch.max(logits, dim=1)
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total

# Despite its name, this loader iterates over the training set (unshuffled),
# so the number reported below is training accuracy.
test_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=False,
)
accuracy = calculate_accuracy(model, test_dataloader)
print(f"Train Accuracy: {accuracy:.2f}")

Train Accuracy: 0.96
