In [8]:
import pandas as pd 

# Load the question/answer pairs from the CSV next to the notebook and preview
# the first rows (the preview shows `question` and `answer` text columns).
df= pd.read_csv("100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [9]:
# tokenize 
def tokenize(text):
    """Lower-case ``text``, delete ``?`` and ``'`` characters, split on whitespace.

    Returns a list of token strings. Hyphens and all other punctuation are
    left in place, so ``"Harper-Lee"`` stays a single token.
    """
    cleaned = text.lower()
    for unwanted in ('?', "'"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split()

In [10]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [11]:
# Vocabulary: maps each token string to an integer index.
# It starts with only the '<UNK>' entry at index 0.
vocab = {'<UNK>':0}

In [14]:
def build_vocab(row):
    """Register every token of one row's question and answer in the global ``vocab``.

    ``row`` must support ``row['question']`` and ``row['answer']``. Each token
    not yet present is assigned the next free index (the current vocabulary
    size). The function mutates ``vocab`` in place and returns ``None``.
    """
    question_tokens = tokenize(row['question'])
    answer_tokens = tokenize(row['answer'])

    for token in question_tokens + answer_tokens:
        if token in vocab:
            # Already has an index; keep the first one assigned.
            continue
        vocab[token] = len(vocab)


In [15]:
# Run build_vocab on every row purely for its side effect of filling `vocab`;
# build_vocab returns None, which is why the displayed result is a Series of None.
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [16]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [17]:
#convert words to numerical indices

def text_to_indices(text, vocab):
    """Convert ``text`` into a list of vocabulary indices.

    The text is split with ``tokenize``. A token present in ``vocab`` maps to
    its index; any other token maps to ``vocab['<UNK>']``.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in tokenize(text)
    ]

In [19]:
text_to_indices("what is campusx",vocab)

[1, 2, 0]

In [20]:
import torch 
from torch.utils.data import Dataset, DataLoader

In [21]:
class QADataset(Dataset):
    """Dataset view over a question/answer DataFrame.

    Each item is a ``(question, answer)`` pair of tensors built from the
    vocabulary indices that ``text_to_indices`` produces for the row's
    ``question`` and ``answer`` columns.
    """

    def __init__(self, df, vocab):
        # Keep references only; rows are encoded lazily in __getitem__.
        self.df, self.vocab = df, vocab

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.df)

    def __getitem__(self, index):
        # Positional lookup, so `index` is a 0-based row number.
        row = self.df.iloc[index]
        question_ids = text_to_indices(row['question'], self.vocab)
        answer_ids = text_to_indices(row['answer'], self.vocab)
        return torch.tensor(question_ids), torch.tensor(answer_ids)
        

In [22]:
dataset = QADataset(df, vocab)

In [24]:
dataset[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

In [25]:
# One sample per batch, visited in random order each epoch. Questions differ in
# length and nothing in this notebook pads them, so larger batches would need a
# padding/collate step first.
dataloader = DataLoader(dataset,batch_size=1, shuffle=True)

In [26]:
# Sanity check: walk the loader once and print every (question, answer) batch
# of token indices. Each tensor carries a leading batch dimension of size 1.
for question, answer in dataloader:
    print(question,answer)

tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([[184]])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([[91]])
tensor([[ 78,  79, 288,  81,  19,  14, 289]]) tensor([[85]])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([[298]])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([[136]])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([[194]])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([[149]])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]]) tensor([[173]])
tensor([[ 42, 101,   2,   3,  17]]) tensor([[102]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([[233]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[ 10,  96,   3, 104, 239]]) tensor([[240]])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[  1,   2,   3,   4,   5, 279]]) tensor([[280]])
tensor([

In [27]:
import torch.nn as nn

In [59]:
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: Number of vocabulary entries; used both as the number of
            embedding rows and as the number of output logits.
        embedding_dim: Size of each token embedding (default 50).
        hidden_size: Size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        # The three layers must agree on these sizes, so they are taken from
        # the shared parameters rather than repeated as literals.
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return vocabulary logits for a batch of token-index sequences.

        Args:
            question: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding unnormalised logits.
        """
        embedded_question = self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the final
        # state, shaped (1, batch, hidden_size) for this single layer, is used.
        _, final = self.rnn(embedded_question)
        # Drop the leading layer dimension before the classifier.
        output = self.fc(final.squeeze(0))

        return output

In [60]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [32]:
# Scratch cell for inspecting layer shapes outside the model.
# NOTE(review): 324 is presumably len(vocab) at the time this ran — confirm, and
# prefer len(vocab) over the literal so the cell survives a dataset change.
x = nn.Embedding(324,embedding_dim=50)

In [38]:
# Embed the first question's token indices (no batch dimension is added here).
a = x(dataset[0][0])

In [39]:
# Stand-alone RNN with input size 50 and hidden size 64. Calling it returns a
# tuple: (outputs for every time step, final hidden state).
y = nn.RNN(50,64)
y(a)

(tensor([[ 0.4494, -0.5461,  0.3056,  0.7437,  0.2983,  0.0124, -0.1079, -0.0143,
           0.5274,  0.9433, -0.1670, -0.6685, -0.7528,  0.2242, -0.3562,  0.6974,
           0.2336,  0.0743, -0.4444,  0.8138,  0.1772,  0.6304,  0.3502,  0.9640,
          -0.4301,  0.3443, -0.5759, -0.1170,  0.3420, -0.2141,  0.7222, -0.7544,
          -0.8017,  0.7667,  0.7431,  0.2453, -0.4274, -0.2814, -0.2188, -0.1183,
           0.0137,  0.0196, -0.6149, -0.0992,  0.7953, -0.1367, -0.2512, -0.2879,
           0.6020, -0.5435, -0.0045, -0.2979,  0.0404,  0.0566,  0.6073,  0.5601,
           0.2184, -0.8484,  0.5441,  0.3901, -0.8172, -0.3708,  0.0498, -0.8035],
         [-0.6675, -0.2216, -0.3376, -0.2703,  0.0563, -0.0786, -0.1432,  0.3747,
           0.6874, -0.0502,  0.3647,  0.5681, -0.7652,  0.0733, -0.8828,  0.1529,
          -0.0318, -0.0063, -0.1874,  0.7524, -0.5622, -0.0648,  0.2688,  0.0568,
           0.0790,  0.5612, -0.8122,  0.6170, -0.0224, -0.3228,  0.2619,  0.1264,
          -0.75

In [40]:
# Final hidden state: element 1 of the tuple returned by the RNN.
# Note this runs the RNN forward pass on `a` a second time.
b = y(a)[1]

In [41]:
b

tensor([[-0.4037, -0.5205, -0.7580,  0.6154, -0.2830,  0.2876, -0.8933, -0.8731,
         -0.5993,  0.2696, -0.2361, -0.3880, -0.3012, -0.7472,  0.3618,  0.7298,
          0.1749,  0.5442,  0.5438,  0.1388, -0.3925, -0.6926, -0.2791,  0.0262,
          0.2145, -0.1742,  0.6540, -0.5517,  0.8703,  0.8134, -0.3008,  0.4091,
         -0.0658, -0.1243, -0.7577, -0.2381,  0.0919,  0.4656, -0.2604,  0.7076,
          0.4023, -0.4114, -0.0129, -0.2930,  0.3663,  0.6723,  0.1628,  0.3228,
         -0.3637, -0.0977,  0.8396, -0.6188, -0.0297,  0.0271, -0.4886, -0.5428,
         -0.4049,  0.3265,  0.4422,  0.2106,  0.8318, -0.4042,  0.0893,  0.5339]],
       grad_fn=<SqueezeBackward1>)

In [65]:
# Training hyperparameters: step size handed to the Adam optimizer and the
# number of full passes the training loop makes over the dataloader.
learning_rate = 0.001
epochs= 50

In [61]:
# Size the model from the vocabulary: SimpleRNN uses this number for both its
# embedding table and its output layer.
model = SimpleRNN(len(vocab))

In [62]:
# CrossEntropyLoss consumes the model's raw logits together with integer class
# targets; Adam updates every model parameter using `learning_rate`.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [66]:
# training loop 

# Each epoch visits every batch once and reports the SUM (not the mean) of the
# per-batch losses.
for epoch in range(epochs):
    total_loss = 0
    for question, answer in dataloader:
        # Discard gradients left over from the previous step.
        optimizer.zero_grad()
        output = model(question)
        # answer[0] drops the size-1 batch dimension added by the DataLoader,
        # leaving the 1-D tensor of class indices that the loss expects.
        loss = criterion(output,answer[0])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # NOTE(review): `:4f` means "minimum width 4, default 6 decimals";
    # `.4f` (four decimals) may have been intended.
    print(f"Epochs: {epoch+1}, Loss: {total_loss:4f}")
    

Epochs: 1, Loss: 9.582391
Epochs: 2, Loss: 8.400165
Epochs: 3, Loss: 7.430454
Epochs: 4, Loss: 6.661601
Epochs: 5, Loss: 5.924522
Epochs: 6, Loss: 5.359097
Epochs: 7, Loss: 4.832064
Epochs: 8, Loss: 4.391516
Epochs: 9, Loss: 4.012351
Epochs: 10, Loss: 3.672276
Epochs: 11, Loss: 3.365670
Epochs: 12, Loss: 3.099010
Epochs: 13, Loss: 2.858782
Epochs: 14, Loss: 2.642677
Epochs: 15, Loss: 2.457269
Epochs: 16, Loss: 2.276565
Epochs: 17, Loss: 2.117771
Epochs: 18, Loss: 1.973163
Epochs: 19, Loss: 1.845431
Epochs: 20, Loss: 1.723939
Epochs: 21, Loss: 1.611664
Epochs: 22, Loss: 1.508893
Epochs: 23, Loss: 1.415923
Epochs: 24, Loss: 1.328813
Epochs: 25, Loss: 1.248929
Epochs: 26, Loss: 1.174930
Epochs: 27, Loss: 1.106384
Epochs: 28, Loss: 1.042491
Epochs: 29, Loss: 0.982233
Epochs: 30, Loss: 0.926542
Epochs: 31, Loss: 0.874649
Epochs: 32, Loss: 0.827121
Epochs: 33, Loss: 0.781498
Epochs: 34, Loss: 0.739309
Epochs: 35, Loss: 0.699548
Epochs: 36, Loss: 0.662560
Epochs: 37, Loss: 0.627010
Epochs: 38

In [67]:
def predict(model, question, threshold=0.5):
    """Print the model's answer to ``question``, or "I don't know".

    Args:
        model: Module mapping a ``(1, seq_len)`` tensor of token indices to
            ``(1, vocab_size)`` logits.
        question: Raw question text; it is encoded with ``text_to_indices``
            against the global ``vocab``.
        threshold: Minimum softmax probability the top token must reach for
            it to be printed as the answer.

    Returns:
        None. The result is printed.
    """
    # Convert the question to vocabulary indices.
    numerical_question = text_to_indices(question, vocab)

    # A question with no tokens would become an empty float tensor, which the
    # embedding layer rejects; treat it as unanswerable instead.
    if not numerical_question:
        print("I don't know")
        return

    # Add the batch dimension the model expects.
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        output = model(question_tensor)

    # Convert logits to probabilities.
    probs = torch.nn.functional.softmax(output, dim=1)

    # Highest probability and its vocabulary index.
    value, index = torch.max(probs, dim=1)

    # Stop here when the model is not confident; previously execution fell
    # through and printed the low-confidence token as well.
    if value.item() < threshold:
        print("I don't know")
        return

    # vocab assigns indices in insertion order, so the position in the key
    # list equals the token's index.
    print(list(vocab.keys())[index.item()])


In [68]:
predict(model, "what is the largest plannet in our solar system")

jupiter
