In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Seed every RNG library this notebook imports so that Restart & Run All
# reproduces the recorded outputs (shuffle order, weight init, losses).
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make the parent directory importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
PARENT_DIR = os.path.abspath("..")
if PARENT_DIR not in sys.path:
  sys.path.append(PARENT_DIR)

### Loading the Dataset

In [2]:
# Load the question/answer pairs. The preview below shows three columns:
# 'Unnamed: 0' (presumably a saved index - confirm), 'question' and 'answer'.
df = pd.read_csv('../datasets/question_answer_dataset.csv')
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


### Tokenize the dataset

In [3]:
def tokenize(text):
  """Lower-case `text`, strip '?' and apostrophes, and split on whitespace.

  Returns the list of resulting tokens; an empty string yields an empty list.
  """
  cleaned = text.lower()
  for unwanted in ('?', "'"):
    cleaned = cleaned.replace(unwanted, '')
  return cleaned.split()

# Quick check on a sample question: words are lower-cased and the '?' is dropped.
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

### Build Vocabulary

In [4]:
# Token -> integer id mapping; id 0 is reserved for unknown tokens.
vocab = {'<UNK>': 0}

def build_vocab(row):
  """Register every unseen token of a row's question and answer in `vocab`.

  `row` must provide 'question' and 'answer' strings. Each new token gets the
  next free id (the size of `vocab` at insertion time). The module-level
  `vocab` is updated in place and nothing is returned.
  """
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]

  for token in row_tokens:
    # setdefault leaves existing ids untouched and only inserts new tokens
    vocab.setdefault(token, len(vocab))

In [5]:
# Visit every row so build_vocab can register its question and answer tokens.
for row_label, qa_row in df.iterrows():
  build_vocab(qa_row)
print("vocab size:", len(vocab))

vocab size: 324


In [6]:
def text_to_indices(text, vocab):
  """Convert `text` into the list of vocabulary ids of its tokens.

  Tokens that are missing from `vocab` map to the id stored under '<UNK>'.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenize(text)
  ]

# 'campusx' is not in the vocabulary, so it comes back as the <UNK> id (0).
text_to_indices("What is campusx", vocab)

[1, 2, 0]

### Defining the DataLoader

In [7]:
class MyDataset(Dataset):
  """Dataset yielding (question_ids, answer_ids) tensor pairs from a DataFrame."""

  def __init__(self, df, vocab):
    # Only references are stored; rows are encoded lazily in __getitem__.
    self.vocab = vocab
    self.df = df

  def __len__(self):
    # One sample per DataFrame row.
    return len(self.df)

  def __getitem__(self, index):
    # Fetch the row once, then encode both text fields with the shared vocab.
    row = self.df.iloc[index]
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [8]:
# batch_size=1 presumably avoids having to pad questions of different lengths
# into one tensor - confirm. shuffle=True reorders the samples on every pass.
dataset = MyDataset(df, vocab)
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

### Visualize the Dataset

In [9]:
# Peek at a single (question, answer) batch and stop; as the output shows,
# both tensors carry a leading batch dimension of size 1.
for question, answer in train_dataloader:
  print(question, answer)
  break

tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])


### Design the Model

In [10]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of token ids; used as the embedding table size and as
      the number of output classes.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    # Creation order (embedding, rnn, fc) determines how the seeded RNG is
    # consumed during weight initialisation, so keep it stable.
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return (batch, vocab_size) logits for a (batch, seq_len) tensor of token ids."""
    embedded_question = self.embedding(question)   # (batch, seq_len, embedding_dim)
    _, final = self.rnn(embedded_question)         # final: (1, batch, hidden_size)
    # Drop the leading num_layers axis so the classifier sees (batch, hidden_size).
    logits = self.fc(final.squeeze(0))
    return logits

In [11]:
# Test the layers: push one question through stand-alone copies of the three
# layers and print the tensor shape after each stage.
# The vocabulary size is read from `vocab` instead of being hard-coded, so the
# demo stays valid if the dataset (and therefore the vocabulary) changes.
emb_layer = nn.Embedding(len(vocab), embedding_dim=50)
rnn_layer = nn.RNN(50, 64, batch_first=True)
fc_layer = nn.Linear(64, len(vocab))

# Add a batch axis: (seq_len,) -> (1, seq_len)
input_question = dataset[0][0].reshape(1,-1)
print("input_question\t\t:", input_question)
print("input_question shape\t:", input_question.shape)

emb_layer_output = emb_layer(input_question)
print("emb_output shape\t:", emb_layer_output.shape)

# rnn_hidden holds the hidden state at every time step, rnn_final only the last one
rnn_hidden, rnn_final = rnn_layer(emb_layer_output)
print("rnn_hidden shape\t:", rnn_hidden.shape)
print("rnn_final shape\t\t:", rnn_final.shape)

# squeeze(0) removes the leading num_layers axis before the classifier
fc_layer_output = fc_layer(rnn_final.squeeze(0))
print("fc_output shape\t\t:", fc_layer_output.shape)

input_question		: tensor([[1, 2, 3, 4, 5, 6]])
input_question shape	: torch.Size([1, 6])
emb_output shape	: torch.Size([1, 6, 50])
rnn_hidden shape	: torch.Size([1, 6, 64])
rnn_final shape		: torch.Size([1, 1, 64])
fc_output shape		: torch.Size([1, 324])


In [12]:
# Training hyper-parameters.
epochs = 20
learning_rate = 0.001

# One output class per vocabulary entry: the model predicts an answer token id.
model = SimpleRNN(len(vocab))
# CrossEntropyLoss expects raw logits and integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Bare expression so the notebook displays the module summary.
model

SimpleRNN(
  (embedding): Embedding(324, 50)
  (rnn): RNN(50, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=324, bias=True)
)

### Train the Model

In [13]:
# Train with one optimizer step per question (the DataLoader uses batch_size=1).
for epoch in range(epochs):
  epoch_loss = 0
  model = model.train()  # train() returns the module itself; this only switches it to training mode
  for question, answer in train_dataloader:
    # forward pass
    logits = model(question)

    # logits has shape (1, vocab_size); answer[0] drops the batch axis so the
    # target is a 1-D tensor of class ids. The shapes only line up when the
    # answer consists of a single token.
    loss = criterion(logits, answer[0])

    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

  # epoch_loss is the sum of the per-sample losses over the epoch, not their mean
  print(f'Epoch: {epoch+1:03d}/{epochs:03d} | epoch_loss: {epoch_loss:.2f}')

Epoch: 001/020 | epoch_loss: 523.89
Epoch: 002/020 | epoch_loss: 455.28
Epoch: 003/020 | epoch_loss: 374.98
Epoch: 004/020 | epoch_loss: 311.73
Epoch: 005/020 | epoch_loss: 258.96
Epoch: 006/020 | epoch_loss: 209.89
Epoch: 007/020 | epoch_loss: 166.38
Epoch: 008/020 | epoch_loss: 129.03
Epoch: 009/020 | epoch_loss: 98.41
Epoch: 010/020 | epoch_loss: 75.58
Epoch: 011/020 | epoch_loss: 57.99
Epoch: 012/020 | epoch_loss: 45.22
Epoch: 013/020 | epoch_loss: 36.15
Epoch: 014/020 | epoch_loss: 28.90
Epoch: 015/020 | epoch_loss: 23.83
Epoch: 016/020 | epoch_loss: 19.88
Epoch: 017/020 | epoch_loss: 16.81
Epoch: 018/020 | epoch_loss: 14.35
Epoch: 019/020 | epoch_loss: 12.40
Epoch: 020/020 | epoch_loss: 10.73


### Make Prediction

In [14]:
def predict(model, question, threshold=0.5):
  """Answer `question` with the most probable vocabulary token.

  Args:
    model: network mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) logits.
    question: raw question text; it is tokenized with the global `vocab`.
    threshold: minimum softmax probability required to commit to an answer.

  Returns:
    A `(answer, probability)` tuple. `answer` is the predicted token, or
    "I don't know" when the top probability is below `threshold` or the
    question contains no tokens (probability 0.0 in that case).
  """
  numerical_question = text_to_indices(question, vocab)
  if not numerical_question:
    # torch.tensor([]) is a float tensor, which nn.Embedding rejects.
    return "I don't know", 0.0
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # Inference only: do not build an autograd graph for the forward pass.
  with torch.no_grad():
    logits = model(question_tensor)
    # convert logits to probs
    probs = nn.functional.softmax(logits, dim=1)

  top_prob, top_index = torch.max(probs, dim=1)
  # Convert the one-element tensors to plain Python numbers once.
  prob, index = top_prob.item(), top_index.item()
  if prob < threshold:
    return "I don't know", prob

  # Ids are assigned in insertion order, so the id doubles as the position
  # of the token among the dict keys.
  return list(vocab)[index], prob

In [15]:
# Sanity check on a question that appears in the dataset preview (expected answer: Jupiter).
prediction, confidence = predict(model, "What is the largest planet in our solar system?")
print(f"Confidence: {confidence:.2f}")
print(f"Answer: {prediction}")

Confidence: 0.90
Answer: jupiter


### Evaluate the Model

In [16]:
def calculate_accuracy(model):
    """Return the fraction of rows in the global `df` that `model` answers correctly.

    A prediction counts as correct when it equals the stored answer after
    case-folding both strings. The model is switched to eval mode and all
    predictions run under torch.inference_mode().
    """
    n_rows = 0
    n_hits = 0
    model.eval()
    with torch.inference_mode():
        for _label, sample in df.iterrows():
            guess, _prob = predict(model, sample['question'])
            if guess.casefold() == sample['answer'].casefold():
                n_hits += 1
            n_rows += 1
    return n_hits / n_rows

# Accuracy is measured on the same rows the model was trained on, so it says
# nothing about how the model handles unseen questions.
train_acc = calculate_accuracy(model)
print(f"Train Accuracy: {train_acc:.2f}")

Train Accuracy: 1.00
