In [7]:
import pandas as pd

In [8]:
# Load the question/answer pairs from the CSV into a DataFrame and preview
# the first rows (columns: 'question', 'answer').
df = pd.read_csv('100_Unique_QA_Dataset.csv')
print(df.head())

                                          question      answer
0                   What is the capital of France?       Paris
1                  What is the capital of Germany?      Berlin
2               Who wrote 'To Kill a Mockingbird'?  Harper-Lee
3  What is the largest planet in our solar system?     Jupiter
4   What is the boiling point of water in Celsius?         100


In [9]:
# Tokenize the sentence
def tokenize(text):
    """Split ``text`` into lower-case word tokens.

    The text is lower-cased, every '?' and "'" character is deleted, and the
    remainder is split on whitespace.

    Args:
        text: The sentence to tokenize.

    Returns:
        A list of token strings (empty for an empty/whitespace-only input).
    """
    # A single translation table deletes both punctuation characters at once.
    strip_table = str.maketrans('', '', "?'")
    return text.lower().translate(strip_table).split()



In [10]:
# Vocabulary: maps each token to an integer index. Index 0 is reserved for
# the '<UNK>' placeholder used for tokens that are not in the vocabulary.
vocab = {'<UNK>':0}

In [11]:
def build_vocab(row):
    """Add every unseen token of one question/answer row to the global ``vocab``.

    Each new token receives the next free index (the current size of
    ``vocab``); tokens that are already present keep their index.

    Args:
        row: A mapping-like record with 'question' and 'answer' text fields.

    Returns:
        None. The module-level ``vocab`` dict is updated in place.
    """
    question_tokens = tokenize(row['question'])
    answer_tokens = tokenize(row['answer'])

    for word in question_tokens + answer_tokens:
        # setdefault only inserts when the word is missing; len(vocab) is
        # evaluated before the insertion, so it is the next unused index.
        vocab.setdefault(word, len(vocab))

In [12]:
# Run build_vocab once per row (axis=1) to populate the global vocab.
# build_vocab returns nothing, so the resulting Series of None can be ignored.
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [13]:
# convert word into numerical indices
def text_to_indices(text,vocab):
    """Convert ``text`` into a list of vocabulary indices.

    Args:
        text: The sentence to encode; it is split with ``tokenize``.
        vocab: Mapping from token to integer index. It must contain the
            '<UNK>' key, which is used for any token missing from the mapping.

    Returns:
        A list with one integer index per token, in token order.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenize(text)
    ]


# text_to_indices("What is capital of France?",vocab)

In [14]:

import torch
from torch.utils.data import DataLoader,Dataset


In [15]:
class QADataset(Dataset):
    """Dataset that serves question/answer rows as tensors of vocabulary indices.

    Args:
        df: DataFrame with 'question' and 'answer' text columns.
        vocab: Token-to-index mapping passed to ``text_to_indices``.
    """

    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows (samples) in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(question_tensor, answer_tensor)`` for the row at position ``index``."""
        row = self.df.iloc[index]
        question_ids = text_to_indices(row['question'], self.vocab)
        answer_ids = text_to_indices(row['answer'], self.vocab)
        return torch.tensor(question_ids), torch.tensor(answer_ids)
    


In [16]:
# Wrap the DataFrame and vocabulary so each sample is served as index tensors.
dataset = QADataset(df,vocab)


In [17]:
# batch_size=1: every batch holds a single question/answer pair;
# shuffle=True draws the samples in a new random order on each pass.
# NOTE(review): questions appear to differ in length, so a larger batch size
# would presumably need padding or a custom collate function - confirm before changing.
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

In [18]:
# Sanity check: iterate once over the loader and print every batch of
# question/answer index tensors.
for question,answer in dataloader:
    print(question,answer)

tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([[95]])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([[268]])
tensor([[ 42, 216, 118, 217, 218,  19,  14, 219,  43]]) tensor([[220]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([[246]])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 198]]) tensor([[199]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([[149]])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([[54]])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([[188]])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([[74]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[260]])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([[58]])
tensor([[ 10,   2,  62,  63,   3, 283,   5, 284]]) tensor([[285]])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([[91]])
tensor([[ 42, 299, 300, 118,  14, 301, 302, 158, 303, 304, 305, 306]])

In [19]:
# Architecture of RNN
import torch.nn as nn


In [44]:
class SimpleRNN(nn.Module):
    """Single-layer RNN that maps a tokenised question to logits over the vocabulary.

    Pipeline: token indices -> embedding -> nn.RNN -> final hidden state -> linear.

    Args:
        vocab_size: Number of vocabulary entries; used as the embedding table
            size and as the number of output logits.
        embedding_dim: Size of each token embedding. Defaults to 50.
        hidden_size: Size of the RNN hidden state. Defaults to 64.
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return vocabulary logits for a batch of questions.

        Args:
            question: Integer tensor of token indices, batch dimension first
                (the RNN is built with ``batch_first=True``).

        Returns:
            Tensor of logits with one row per question and ``vocab_size`` columns.
        """
        embedded_question = self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final hidden state is used to predict the answer token.
        _, final = self.rnn(embedded_question)
        # The final state carries a leading num_layers axis of size 1;
        # squeeze it away before the linear layer.
        output = self.fc(final.squeeze(0))
        return output
        


In [45]:
# x= nn.Embedding(324,embedding_dim=0)
# y = nn.RNN(50,60,batch_first=True)

In [46]:
# Training hyperparameters
learning_rate = 0.001  # step size handed to the optimizer
epochs =100  # number of full passes over the dataloader

In [47]:
# The vocabulary size fixes both the embedding table size and the number of output logits.
model = SimpleRNN(len(vocab))

In [48]:
# Cross-entropy loss over the vocabulary logits, optimised with Adam
# using the learning rate configured above.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr =learning_rate)

In [49]:
# training Loop
for epoch in range(epochs):
    # Sum of the per-sample losses over this epoch (not an average).
    total_loss = 0.0

    for question,answer in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # forward pass: logits over the vocabulary for this question
        output = model(question)

        # loss calculate: answer[0] drops the batch axis so the target is a
        # 1-D tensor of class indices (relies on batch_size=1).
        loss = criterion(output,answer[0])

        # gradients
        loss.backward()

        # update
        optimizer.step()

        total_loss += loss.item()

    # ".4f" = four decimal places; the previous "4f" only set a minimum width
    # of 4 and therefore printed the default six decimals.
    print(f"Epoch {epoch+1} , Loss: {total_loss:.4f}")



Epoch 1 , Loss: 520.730445
Epoch 2 , Loss: 457.606430
Epoch 3 , Loss: 381.496815
Epoch 4 , Loss: 316.488273
Epoch 5 , Loss: 263.325233
Epoch 6 , Loss: 215.176512
Epoch 7 , Loss: 170.197243
Epoch 8 , Loss: 131.330620
Epoch 9 , Loss: 100.422415
Epoch 10 , Loss: 75.895565
Epoch 11 , Loss: 58.484082
Epoch 12 , Loss: 45.619724
Epoch 13 , Loss: 36.533978
Epoch 14 , Loss: 29.959860
Epoch 15 , Loss: 24.747012
Epoch 16 , Loss: 20.938839
Epoch 17 , Loss: 17.832325
Epoch 18 , Loss: 15.367883
Epoch 19 , Loss: 13.191579
Epoch 20 , Loss: 11.750370
Epoch 21 , Loss: 10.374855
Epoch 22 , Loss: 9.064135
Epoch 23 , Loss: 8.207646
Epoch 24 , Loss: 7.160743
Epoch 25 , Loss: 6.379970
Epoch 26 , Loss: 5.782285
Epoch 27 , Loss: 5.187804
Epoch 28 , Loss: 4.608363
Epoch 29 , Loss: 4.134892
Epoch 30 , Loss: 3.761418
Epoch 31 , Loss: 3.417905
Epoch 32 , Loss: 3.135222
Epoch 33 , Loss: 2.876524
Epoch 34 , Loss: 2.651708
Epoch 35 , Loss: 2.446326
Epoch 36 , Loss: 2.269458
Epoch 37 , Loss: 2.106120
Epoch 38 , Loss: 

In [58]:
# Prediction
def predict(model,question,threshold=0.5):
    """Predict the answer token for ``question``.

    Args:
        model: Callable that maps a (1, sequence_length) tensor of token
            indices to a (1, vocab_size) tensor of logits.
        question: The question text; encoded with ``text_to_indices`` and the
            module-level ``vocab``.
        threshold: Minimum softmax probability the best token must reach.

    Returns:
        The predicted vocabulary token, or the string "I don't know" when the
        question contains no tokens or the best probability is below
        ``threshold``.
    """
    # convert question to number
    numerical_question = text_to_indices(question,vocab)

    # An empty question would yield an empty float tensor that the model
    # cannot embed, so answer "unknown" right away.
    if not numerical_question:
        return "I don't know"

    # convert to tensor and add the batch dimension
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        # send to model
        output = model(question_tensor)

        # converts logits to probability
        probs = torch.nn.functional.softmax(output,dim=1)

    # find max probability and the index of the token that achieves it
    value,index = torch.max(probs,dim=1)

    # Low confidence: return the fallback instead of falling through and
    # returning the (unreliable) predicted token anyway.
    if value.item() < threshold:
        return "I don't know"

    # Map the index back to its token. This relies on vocab indices having
    # been assigned in insertion order, which is how build_vocab fills it.
    return list(vocab.keys())[index.item()]




In [60]:
question = input("Enter your question")
answer = predict(model,question).capitalize()
print(answer)

Paris
