In [21]:
import pandas as pd

In [22]:
df = pd.read_csv('./sample_data/100_Unique_QA_Dataset.csv')

df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [23]:
# tokenize
def tokenize(text):
  text = text.lower()
  text = text.replace('?','')
  text = text.replace("'","")
  return text.split()


In [24]:
# vocabilary
vocab = {'<UNK>':0}

In [25]:
def build_vocab(row):
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])

  merged_tokens = tokenized_question + tokenized_answer

  for token in merged_tokens:

    if token not in vocab:
      vocab[token] = len(vocab)

In [26]:
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [27]:
len(vocab)

324

In [28]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [29]:
# convert words to numerical indicis

def text_to_indices(text, vocab):
  indexed_text = []

  for token in tokenize(text):

    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text


In [30]:
text_to_indices("What is the largest planet in our solar systems?",vocab)

[1, 2, 3, 17, 18, 19, 20, 21, 0]

In [31]:
import torch
from torch.utils.data import Dataset,DataLoader


In [32]:
class QADataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):

    numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
    numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

    return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [33]:
dataset = QADataset(df, vocab)

In [34]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [35]:
for question, answer in dataloader:
  print(question, answer[0])

tensor([[  1,   2,   3,   4,   5, 286]]) tensor([287])
tensor([[42, 43, 44, 45, 46, 47, 48]]) tensor([49])
tensor([[  1,   2,   3, 122, 123,  19,   3,  45]]) tensor([124])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([244])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([85])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([61])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([74])
tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([162])
tensor([[ 42,   2,   3, 274, 211, 275]]) tensor([276])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([233])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([249])
tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([136])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([53])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([149])
tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([121])
tensor([[  1,   2,   3, 

In [36]:
# build RNN Architecture
import torch.nn as nn

In [58]:
class SimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, x):
    embedded_question = self.embedding(question)
    hidden , final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))
    return output



In [59]:

x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [60]:
x = nn.Embedding(324, embedding_dim=50)

In [61]:
x(dataset[0][0]).shape

torch.Size([6, 50])

In [70]:
learning_rate = 0.001
epochs = 200

In [71]:
model = SimpleRNN(len(vocab))

In [72]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [73]:
# training loop

for epoch in range(epochs):

  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # loss -> output shape (1,324) - (1)
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

Epoch: 1, Loss: 523.967532
Epoch: 2, Loss: 460.864661
Epoch: 3, Loss: 385.730393
Epoch: 4, Loss: 317.538207
Epoch: 5, Loss: 263.383903
Epoch: 6, Loss: 214.664527
Epoch: 7, Loss: 169.494988
Epoch: 8, Loss: 131.456548
Epoch: 9, Loss: 99.707689
Epoch: 10, Loss: 75.731307
Epoch: 11, Loss: 58.158560
Epoch: 12, Loss: 45.162881
Epoch: 13, Loss: 35.865254
Epoch: 14, Loss: 29.062851
Epoch: 15, Loss: 23.847816
Epoch: 16, Loss: 19.908663
Epoch: 17, Loss: 16.867902
Epoch: 18, Loss: 14.388764
Epoch: 19, Loss: 12.326574
Epoch: 20, Loss: 10.767637
Epoch: 21, Loss: 9.390664
Epoch: 22, Loss: 8.300285
Epoch: 23, Loss: 7.300028
Epoch: 24, Loss: 6.560858
Epoch: 25, Loss: 5.901396
Epoch: 26, Loss: 5.318832
Epoch: 27, Loss: 4.823183
Epoch: 28, Loss: 4.381621
Epoch: 29, Loss: 3.987852
Epoch: 30, Loss: 3.656475
Epoch: 31, Loss: 3.363274
Epoch: 32, Loss: 3.097305
Epoch: 33, Loss: 2.861599
Epoch: 34, Loss: 2.648633
Epoch: 35, Loss: 2.459371
Epoch: 36, Loss: 2.282010
Epoch: 37, Loss: 2.125270
Epoch: 38, Loss: 1.

In [74]:
def predict(model, question, threshold=0.5):

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model
  output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])

In [77]:
predict(model, "What is the capital of France")

avocado
