In [1]:
import pandas as pd
df=pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [2]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  90 non-null     object
 1   answer    90 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [4]:
df.describe()

Unnamed: 0,question,answer
count,90,90
unique,90,85
top,What is the capital of France?,8
freq,1,3


In [5]:
df.isnull().sum()

Unnamed: 0,0
question,0
answer,0


In [6]:
df.shape

(90, 2)

In [8]:
#tokenize
def tokenize(text):
  text=text.lower()
  text=text.replace('.','')
  text=text.replace(',','')
  text=text.replace('?','')
  return text.split()

In [9]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [10]:
#vocab
vocab={'<UK>':0}

In [58]:
def build_vocab(row):
  tokenized_question=tokenize(row['question'])
  tokenized_answer=tokenize(row['answer'])

  merged_tokens = tokenized_question + tokenized_answer
  for token in merged_tokens:
    if token not in vocab:
      vocab[token]=len(vocab)

In [59]:
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [60]:
vocab

{'<UK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 "'to": 12,
 'kill': 13,
 'a': 14,
 "mockingbird'": 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 "'1984'": 67,
 'george-orwell': 68,
 'currency': 69,
 'u

In [61]:
len(vocab)

326

In [62]:
#convert words to numerical indices
def text_to_indices(text,vocab):
  indexed_text = []
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UK>'])
  return indexed_text

In [63]:
text_to_indices("What is campusx",vocab)

[1, 2, 0]

In [64]:
import torch
from torch.utils.data import Dataset,DataLoader

In [65]:
class QADataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self,index):
    numerical_question=text_to_indices(self.df.iloc[index]['question'],self.vocab)
    numerical_answer=text_to_indices(self.df.iloc[index]['answer'],self.vocab)

    return torch.tensor(numerical_question),torch.tensor(numerical_answer)

In [66]:
dataset=QADataset(df,vocab)

In [67]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [68]:
for question,answer in dataloader:
  print(question,answer[0])

tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([23])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([58])
tensor([[10, 96,  3, 97]]) tensor([98])
tensor([[ 10, 140,   3, 141, 272,  93, 273,   5,   3, 274]]) tensor([275])
tensor([[ 78,  79, 196,  81,  19,   3, 197, 198, 199]]) tensor([200])
tensor([[  1,   2,   3, 147, 148,  19, 149]]) tensor([150])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([106])
tensor([[ 42, 292, 293, 118, 294, 159, 295, 296]]) tensor([297])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([186])
tensor([[  1,   2,   3, 213,   5,  14, 214, 215]]) tensor([216])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([7])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([91])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([262])
tensor([[  1,   2,   3, 222,   5, 223, 224, 225]]) tensor([226])
tensor([[  1,   2,   3, 147,  86,  19, 193, 194]]) tensor([195])
tensor([[  1,   2,   3,  37,  38,  39, 162]]) tensor([163])
tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[ 

In [69]:
import torch.nn as nn

In [70]:
class SimpleRNN(nn.Module):
  def __init__(self,vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=50)
    self.rnn = nn.RNN(50,64,batch_first=True)
    self.fc=nn.Linear(64,vocab_size)

  def forward(self,question):
    embedded_question = self.embedding(question)
    hidden,final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))
    return output


In [71]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [72]:
learning_rate =0.001
epochs = 20

In [73]:
model = SimpleRNN(len(vocab))

In [74]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [76]:
#training loop
for epoch in range(epochs):
  total_loss= 0
  for question, answer in dataloader:
    optimizer.zero_grad()

    #forward pass
    output = model(question)

    #loss. -> output shape(1,324)
    loss = criterion(output, answer[0])

    #gradients
    loss.backward()

    #update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch:{epoch+1},Loss:{total_loss:4f}")

Epoch:1,Loss:523.482217
Epoch:2,Loss:456.044917
Epoch:3,Loss:376.313285
Epoch:4,Loss:313.617137
Epoch:5,Loss:261.006563
Epoch:6,Loss:212.203717
Epoch:7,Loss:168.291559
Epoch:8,Loss:129.670244
Epoch:9,Loss:97.946338
Epoch:10,Loss:73.932871
Epoch:11,Loss:56.691350
Epoch:12,Loss:43.710788
Epoch:13,Loss:34.519835
Epoch:14,Loss:27.642864
Epoch:15,Loss:22.461542
Epoch:16,Loss:18.674014
Epoch:17,Loss:15.565060
Epoch:18,Loss:13.389527
Epoch:19,Loss:11.510710
Epoch:20,Loss:10.000220


In [77]:
def predict(model,question,threshold=0.5):
  #convert question to numbers
  numerical_question = text_to_indices(question,vocab)

  #tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  #send to model
  output = model(question_tensor)

  #convert logits to probs
  probs = torch.nn.functional.softmax(output,dim=1)

  #find index of max prob
  value, index = torch.max(probs,dim=1)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])

In [78]:
predict(model,"What is the largest planet in our solar system?")

jupiter


In [79]:
list(vocab.keys())[7]

'paris'