In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import nltk
from nltk.tokenize import word_tokenize

# Seed every RNG library the notebook imports so results are reproducible
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make the parent directory importable; the guard keeps sys.path free of
# duplicate entries when this cell is re-run
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
  sys.path.append(parent_dir)

### Loading the Dataset

In [2]:
# Read the whole corpus into memory as one string.
# There is deliberately no empty-string placeholder: if the file cannot be
# read, the cell fails here instead of leaving an empty `document` behind
# for later cells to tokenize.
DATASET_PATH = "../datasets/word_prediction_dataset.txt"
with open(DATASET_PATH, "r", encoding="utf-8") as file:
    document = file.read()

### Tokenize the dataset

In [3]:
# Fetch the NLTK 'punkt' and 'punkt_tab' data packages (presumably the
# resources `word_tokenize` relies on -- confirm against the NLTK docs).
# The recorded output shows NLTK reporting them as already up-to-date.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nova\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Nova\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Lower-case the corpus before tokenizing so tokens differing only in case are merged
all_tokens = word_tokenize(document.lower())
print(f"Token length: {len(all_tokens)}")
all_tokens

Token length: 1018


['about',
 'the',
 'program',
 'what',
 'is',
 'the',
 'course',
 'fee',
 'for',
 'data',
 'science',
 'mentorship',
 'program',
 '(',
 'dsmp',
 '2023',
 ')',
 'the',
 'course',
 'follows',
 'a',
 'monthly',
 'subscription',
 'model',
 'where',
 'you',
 'have',
 'to',
 'make',
 'monthly',
 'payments',
 'of',
 'rs',
 '799/month',
 '.',
 'what',
 'is',
 'the',
 'total',
 'duration',
 'of',
 'the',
 'course',
 '?',
 'the',
 'total',
 'duration',
 'of',
 'the',
 'course',
 'is',
 '7',
 'months',
 '.',
 'so',
 'the',
 'total',
 'course',
 'fee',
 'becomes',
 '799',
 '*',
 '7',
 '=',
 'rs',
 '5600',
 '(',
 'approx',
 '.',
 ')',
 'what',
 'is',
 'the',
 'syllabus',
 'of',
 'the',
 'mentorship',
 'program',
 '?',
 'we',
 'will',
 'be',
 'covering',
 'the',
 'following',
 'modules',
 ':',
 'python',
 'fundamentals',
 'python',
 'libraries',
 'for',
 'data',
 'science',
 'data',
 'analysis',
 'sql',
 'for',
 'data',
 'science',
 'maths',
 'for',
 'machine',
 'learning',
 'ml',
 'algorithms',
 'p

### Build Vocabulary

In [5]:
# Map every distinct token to an integer id in order of first appearance.
# Id 0 is taken by '<unk>' (the fallback for unknown tokens); note that 0 is
# also the value used to left-pad sequences further down.
vocab = {'<unk>': 0}

for token in all_tokens:
  # setdefault only inserts (with the next free id) when the token is new
  vocab.setdefault(token, len(vocab))

print(f"vocab length: {len(vocab)}")
vocab

vocab length: 289


{'<unk>': 0,
 'about': 1,
 'the': 2,
 'program': 3,
 'what': 4,
 'is': 5,
 'course': 6,
 'fee': 7,
 'for': 8,
 'data': 9,
 'science': 10,
 'mentorship': 11,
 '(': 12,
 'dsmp': 13,
 '2023': 14,
 ')': 15,
 'follows': 16,
 'a': 17,
 'monthly': 18,
 'subscription': 19,
 'model': 20,
 'where': 21,
 'you': 22,
 'have': 23,
 'to': 24,
 'make': 25,
 'payments': 26,
 'of': 27,
 'rs': 28,
 '799/month': 29,
 '.': 30,
 'total': 31,
 'duration': 32,
 '?': 33,
 '7': 34,
 'months': 35,
 'so': 36,
 'becomes': 37,
 '799': 38,
 '*': 39,
 '=': 40,
 '5600': 41,
 'approx': 42,
 'syllabus': 43,
 'we': 44,
 'will': 45,
 'be': 46,
 'covering': 47,
 'following': 48,
 'modules': 49,
 ':': 50,
 'python': 51,
 'fundamentals': 52,
 'libraries': 53,
 'analysis': 54,
 'sql': 55,
 'maths': 56,
 'machine': 57,
 'learning': 58,
 'ml': 59,
 'algorithms': 60,
 'practical': 61,
 'mlops': 62,
 'case': 63,
 'studies': 64,
 'can': 65,
 'check': 66,
 'detailed': 67,
 'here': 68,
 '-': 69,
 'https': 70,
 '//learnwith.campusx.i

In [6]:
def text_to_indices(sentence, vocab):
  """Translate a sequence of tokens into their vocabulary ids.

  Args:
    sentence: iterable of tokens.
    vocab: dict mapping token -> integer id. It must contain the key
      '<unk>' whenever `sentence` holds a token that is not in `vocab`.

  Returns:
    A list with one id per token, in order; tokens missing from `vocab`
    are replaced by vocab['<unk>'].
  """
  return [
    vocab[token] if token in vocab else vocab['<unk>']
    for token in sentence
  ]

In [7]:
# Treat every line of the corpus as one sentence and encode it as a list of vocabulary ids
input_sentences = document.split('\n')
numerical_sentences = [
  text_to_indices(word_tokenize(sentence.lower()), vocab)
  for sentence in input_sentences
]

print(f"numerical_sentences length: {len(numerical_sentences)}")

numerical_sentences length: 78


### Generate Training Sequences

In [8]:
# Expand each encoded sentence into all of its prefixes of length >= 2;
# the last id of a prefix later becomes the label, the rest the input.
training_sequence = [
  sentence[:end]
  for sentence in numerical_sentences
  for end in range(2, len(sentence) + 1)
]

print(f"training_sequence count: {len(training_sequence)}")
training_sequence[:5]

training_sequence count: 942


[[1, 2], [1, 2, 3], [4, 5], [4, 5, 2], [4, 5, 2, 6]]

### Padding Training Sequences

In [9]:
# Length of every training sequence; the longest one fixes the padded width
all_seq_lengths = [len(sequence) for sequence in training_sequence]

max_seq_length = max(all_seq_lengths)
print(f"max sequence length: {max_seq_length}")

max sequence length: 62


In [10]:
# Left-pad every sequence with 0 so that all rows have max_seq_length entries
padded_training_sequence = [
  [0] * (max_seq_length - len(sequence)) + sequence
  for sequence in training_sequence
]

# Spot-check one row (index 10) to confirm the padded width
print(f"padded_training_sequence length: {len(padded_training_sequence[10])}")

padded_training_sequence length: 62


### Split the Features and Labels

In [11]:
# torch.as_tensor converts the Python list on the first run and simply reuses
# the existing tensor if this cell is executed again (torch.tensor would emit
# a copy-construct warning when handed a tensor), so the cell is re-runnable.
padded_training_sequence = torch.as_tensor(padded_training_sequence, dtype=torch.long)

# Every column except the last is the model input; the last column is the
# id of the word to predict.
X = padded_training_sequence[:, :-1]
y = padded_training_sequence[:, -1]

print("padded_training_sequence shape:", padded_training_sequence.shape)
print("X shape:", X.shape)
print("y shape:", y.shape)

padded_training_sequence shape: torch.Size([942, 62])
X shape: torch.Size([942, 61])
y shape: torch.Size([942])


In [12]:
# Display the padded tensor: rows are left-padded with 0 and end with the label id
padded_training_sequence

tensor([[  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        [  0,   0,   0,  ...,   0,   4,   5],
        ...,
        [  0,   0,   0,  ..., 285, 176, 286],
        [  0,   0,   0,  ..., 176, 286, 287],
        [  0,   0,   0,  ..., 286, 287, 288]])

In [13]:
# Display the inputs: every column of the padded tensor except the last
X

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   0,   0,   4],
        ...,
        [  0,   0,   0,  ...,   0, 285, 176],
        [  0,   0,   0,  ..., 285, 176, 286],
        [  0,   0,   0,  ..., 176, 286, 287]])

In [14]:
# Display the labels: the last column of the padded tensor
y

tensor([  2,   3,   5,   2,   6,   7,   8,   9,  10,  11,   3,  12,  13,  14,
         15,   6,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  18,  26,
         27,  28,  29,  30,   5,   2,  31,  32,  27,   2,   6,  33,  31,  32,
         27,   2,   6,   5,  34,  35,  30,  36,   2,  31,   6,   7,  37,  38,
         39,  34,  40,  28,  41,  12,  42,  30,  15,   5,   2,  43,  27,   2,
         11,   3,  33,  45,  46,  47,   2,  48,  49,  50,  52,  53,   8,   9,
         10,  54,   8,   9,  10,   8,  57,  58,  60,  59,  64,  65,  66,   2,
         67,  43,  68,  69,  70,  50,  71,  72,  58,  73,  74,  46,  17,  75,
         27,  76,   3,  33,  78,  74,  73,  72,  58,  79,  80,  81,  17,  75,
         27,  76,   3,  82,  83,  84,  30,  85,  86,  87,  17,  88,  89,  33,
         45,  86,  90,  17,  91,  27,   2,  89,  33,  93,  94,  95,  80,  96,
         78,  36,  97,  85,  22,  87,  17,  89,  22,  65,  98,  99,  73, 100,
          2,  91,  30,  65,  86, 101,   2, 102, 103,  33,  76, 1

### Defining the DataLoader

In [15]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
class MyDataset(Dataset):
  """In-memory dataset of (input sequence, label) pairs.

  Both tensors are moved to the notebook-level `device` once, at
  construction time, so every item returned is already on that device.
  """

  def __init__(self, X, y):
    """Store the inputs `X` and labels `y` on `device`."""
    self.X, self.y = X.to(device), y.to(device)

  def __len__(self):
    """Return the number of samples (size of the first dimension of X)."""
    return self.X.size(0)

  def __getitem__(self, idx):
    """Return the (input, label) pair stored at position `idx`."""
    sample = (self.X[idx], self.y[idx])
    return sample

In [17]:
# Wrap the feature/label tensors and serve them in shuffled mini-batches of 32
dataset = MyDataset(X, y)
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### Design the Model

In [18]:
class LSTM(nn.Module):
  """Next-word predictor: embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of entries in the vocabulary; used both as the
      embedding table size and as the number of output logits.
    embedding_dim: size of each token embedding (default 100).
    hidden_dim: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    # Layers are created in the same order as before so that seeded weight
    # initialisation is unchanged for the default sizes.
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return next-word logits for a batch of id sequences.

    Args:
      x: integer tensor of shape (batch, seq_len) holding token ids.

    Returns:
      Tensor of shape (batch, vocab_size) with one logit per vocabulary entry.
    """
    embedded = self.embedding(x)
    _, (final_hidden_state, _) = self.lstm(embedded)
    # final_hidden_state is (num_layers, batch, hidden_dim); index the last
    # layer explicitly rather than squeezing, which silently does nothing
    # when the leading dimension is not 1.
    logits = self.fc(final_hidden_state[-1])
    return logits

In [19]:
# Sanity-check the layer stack on a single sample and print the shape after each stage.
# The vocabulary size is taken from `vocab` instead of a hard-coded 289 so the
# check keeps working (no out-of-range embedding index) if the corpus changes.
emb_layer = nn.Embedding(len(vocab), embedding_dim=100)
lstm_layer = nn.LSTM(100, 150, batch_first=True)
fc_layer = nn.Linear(150, len(vocab))

# These stand-alone layers live on the CPU, so bring the sample back from `device`
input_sequence = dataset[0][0].reshape(1,-1).to('cpu')
print("input_sequence:", input_sequence)
print("input_sequence shape\t:", input_sequence.shape)

emb_layer_output = emb_layer(input_sequence)
print("emb_output shape\t:", emb_layer_output.shape)

hidden_states, (final_hidden_state, final_cell_state)= lstm_layer(emb_layer_output)
print("lstm_hidden_states shape\t:", hidden_states.shape)
print("lstm_final_hidden_state shape\t:", final_hidden_state.shape)
print("lstm_final_cell_state shape\t:", final_cell_state.shape)

# Drop the leading layer dimension before the linear layer
fc_layer_output = fc_layer(final_hidden_state.squeeze(0))
print("fc_output shape\t\t:", fc_layer_output.shape)

input_sequence: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
input_sequence shape	: torch.Size([1, 61])
emb_output shape	: torch.Size([1, 61, 100])
lstm_hidden_states shape	: torch.Size([1, 61, 150])
lstm_final_hidden_state shape	: torch.Size([1, 1, 150])
lstm_final_cell_state shape	: torch.Size([1, 1, 150])
fc_output shape		: torch.Size([1, 289])


In [20]:
# Training hyper-parameters
epochs, learning_rate = 50, 0.001

# Model, loss and optimizer; the model is placed on `device`, where the dataset tensors already live
model = LSTM(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model

LSTM(
  (embedding): Embedding(289, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=289, bias=True)
)

### Train the Model

In [21]:
for epoch in range(epochs):
  model = model.train()  # (re)enter training mode at the start of every epoch
  epoch_loss = 0  # running sum of the per-batch losses for this epoch

  for batch_no, (batch_x, batch_y) in enumerate(train_dataloader, start=1):
    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass
    loss = criterion(model(batch_x), batch_y)

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

    batch_loss = loss.item()
    epoch_loss += batch_loss
    # report the 1st, 9th, 17th, ... batch
    if (batch_no - 1) % 8 == 0:
      print(f' -> batch {batch_no:03d} | loss: {batch_loss:.2f}')

  print(f'Epoch: {epoch+1:03d}/{epochs:03d} | epoch_loss: {epoch_loss:.2f}')

 -> batch 001 | loss: 5.70
 -> batch 009 | loss: 5.65
 -> batch 017 | loss: 5.63
 -> batch 025 | loss: 5.52
Epoch: 001/050 | epoch_loss: 166.40
 -> batch 001 | loss: 5.10
 -> batch 009 | loss: 4.93
 -> batch 017 | loss: 4.77
 -> batch 025 | loss: 5.01
Epoch: 002/050 | epoch_loss: 147.15
 -> batch 001 | loss: 4.79
 -> batch 009 | loss: 4.62
 -> batch 017 | loss: 4.21
 -> batch 025 | loss: 4.47
Epoch: 003/050 | epoch_loss: 134.64
 -> batch 001 | loss: 4.00
 -> batch 009 | loss: 4.06
 -> batch 017 | loss: 3.69
 -> batch 025 | loss: 4.10
Epoch: 004/050 | epoch_loss: 122.12
 -> batch 001 | loss: 3.64
 -> batch 009 | loss: 3.94
 -> batch 017 | loss: 3.95
 -> batch 025 | loss: 3.57
Epoch: 005/050 | epoch_loss: 110.37
 -> batch 001 | loss: 3.62
 -> batch 009 | loss: 3.20
 -> batch 017 | loss: 3.77
 -> batch 025 | loss: 2.89
Epoch: 006/050 | epoch_loss: 99.04
 -> batch 001 | loss: 3.15
 -> batch 009 | loss: 2.74
 -> batch 017 | loss: 2.99
 -> batch 025 | loss: 2.80
Epoch: 007/050 | epoch_loss: 

### Make Prediction

In [22]:
def predict(model, vocab, input_text):
  """Predict the most likely next word for a text prompt.

  The prompt is lower-cased, tokenized, mapped to vocabulary ids and
  left-padded with 0 to `max_seq_length - 1` entries, i.e. the same width
  as the training inputs.

  Args:
    model: network that maps a (1, seq_len) id tensor to (1, vocab_size) logits.
    vocab: dict mapping token -> id, in the insertion order used for training.
    input_text: prompt string.

  Returns:
    Tuple (word, logit): the vocabulary entry with the highest logit and
    that raw logit as a float. The logit is an unnormalised score, not a
    probability.
  """
  tokenized_text = word_tokenize(input_text.lower())
  numerical_text = text_to_indices(tokenized_text, vocab)

  # Keep only the most recent tokens when the prompt is longer than the
  # window the model was trained on.
  context_length = max_seq_length - 1
  numerical_text = numerical_text[-context_length:]
  padding_length = context_length - len(numerical_text)
  padded_text = torch.tensor([0]*padding_length + numerical_text, dtype=torch.long).unsqueeze(0)

  # Inference only: switch to eval mode and skip autograd bookkeeping,
  # matching what calculate_accuracy does.
  model.eval()
  with torch.inference_mode():
    logits = model(padded_text.to(device))
  logit, index = torch.max(logits, dim=1)
  # .item() turns the one-element index tensor into a plain int for list indexing
  return list(vocab.keys())[index.item()], logit.item()

In [23]:
# Predict the next word for a sample prompt.
# NOTE: the second value returned by predict() is the raw maximum logit, not a
# probability, so the "Confidence" printed here is an unnormalised score.
prediction, confidence = predict(model, vocab, "The course follows a monthly")
print(f"Confidence: {confidence:.2f}")
print(f"Answer: {prediction}")

Confidence: 9.85
Answer: subscription


In [24]:
num_tokens = 12
input_prompt = "The course follows a monthly"

# Greedy generation: append the top prediction to the prompt and feed the
# extended prompt back in, printing the text after every step.
for _ in range(num_tokens):
  prediction, logit = predict(model, vocab, input_prompt)
  input_prompt = f"{input_prompt} {prediction}"
  print(input_prompt)

The course follows a monthly subscription
The course follows a monthly subscription model
The course follows a monthly subscription model where
The course follows a monthly subscription model where you
The course follows a monthly subscription model where you have
The course follows a monthly subscription model where you have to
The course follows a monthly subscription model where you have to make
The course follows a monthly subscription model where you have to make monthly
The course follows a monthly subscription model where you have to make monthly payments
The course follows a monthly subscription model where you have to make monthly payments of
The course follows a monthly subscription model where you have to make monthly payments of rs
The course follows a monthly subscription model where you have to make monthly payments of rs 799/month


### Evaluate the Model

In [25]:
def calculate_accuracy(model, dataloader):
    """Return the fraction of samples whose top-scoring class equals the label.

    Args:
        model: module mapping a batch of inputs to (batch, num_classes) logits.
            It is switched to eval mode and left in that mode.
        dataloader: iterable yielding (batch_x, batch_y) tensor pairs.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the dataloader yields no samples.
    """
    # Follow the model's own device rather than a notebook-level global, so
    # the function also works when called outside this notebook's state.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    correct = 0
    total = 0
    model.eval()
    with torch.inference_mode():
        for batch_x, batch_y in dataloader:
            if target_device is not None:
                batch_x, batch_y = batch_x.to(target_device), batch_y.to(target_device)
            logits = model(batch_x)
            _, predicted = torch.max(logits, dim=1)
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Avoid ZeroDivisionError for an empty dataloader
    if total == 0:
        return 0.0
    return correct / total

# NOTE: despite its name, `test_dataloader` wraps the same `dataset` used for
# training (unshuffled), so the number printed below is training accuracy.
test_dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
accuracy = calculate_accuracy(model, test_dataloader)
print(f"Train Accuracy: {accuracy:.2f}")

Train Accuracy: 0.96
