In [104]:
import pandas as pd
import numpy as np
import torch 
import torch.optim as optim
from collections import Counter
import torch.nn as nn
from pathlib import Path

In [105]:
torch.manual_seed(43)

<torch._C.Generator at 0x121df6870>

In [106]:
if torch.backends.mps.is_available:
    device=torch.device('mps')
else:
    device=torch.device('cpu')

In [107]:
device

device(type='mps')

In [108]:
path=Path('/Users/divyyadav/Desktop/pytorch/src/ann,cnn,rnn,lstm/100_Unique_QA_Dataset.csv')

In [109]:
df=pd.read_csv(path)

In [110]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [111]:
df.info,df.describe

(<bound method DataFrame.info of                                              question        answer
 0                      What is the capital of France?         Paris
 1                     What is the capital of Germany?        Berlin
 2                  Who wrote 'To Kill a Mockingbird'?    Harper-Lee
 3     What is the largest planet in our solar system?       Jupiter
 4      What is the boiling point of water in Celsius?           100
 ..                                                ...           ...
 85                  Who directed the movie 'Titanic'?  JamesCameron
 86  Which superhero is also known as the Dark Knight?        Batman
 87                     What is the capital of Brazil?      Brasilia
 88        Which fruit is known as the king of fruits?         Mango
 89       Which country is known for the Eiffel Tower?        France
 
 [90 rows x 2 columns]>,
 <bound method NDFrame.describe of                                              question        answer
 0        

In [112]:
#performing tokenization
def tokenization(text):
    lower_text=text.lower()
    text=lower_text.replace('','')
    text=lower_text.replace('?','')
    return text.split()

In [113]:
vocab={'<UNK>':0}

In [114]:
#making vocabulary
def  build_vocab(row):

    tokenized_row=tokenization(row['question'])
    tokenized_question=tokenization(row['answer'])

    merged_tokens=tokenized_row+tokenized_question
    
    for token in merged_tokens:

        if token not in vocab:
            
            vocab[token]=len(vocab)


In [115]:
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [116]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 "'to": 12,
 'kill': 13,
 'a': 14,
 "mockingbird'": 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 "'1984'": 67,
 'george-orwell': 68,
 'currency': 69,
 '

In [117]:
len(vocab)

326

In [118]:
# a functions that returns the indice value from the text after iterating the tokens 
def index(text,vocab):
    
    indexed_text=[]

    for tokens in tokenization(text):

        if tokens in vocab:
            indexed_text.append(vocab[tokens])

        else:
            indexed_text.append(vocab['<UNK>'])

    return indexed_text
    

In [119]:
#now we need to convert the question and answer from the data to indexs

from torch.utils.data import Dataset,DataLoader

In [120]:
class CustomDatase(Dataset):

    def __init__(self,df,vocab):
        super().__init__()
        self.text=df
        self.vocab=vocab

    def __len__(self):
        return self.text.shape[0]
    
    def __getitem__(self,indx):

        question=self.text.iloc[indx]['question']
      
        answer=self.text.iloc[indx]['answer']

        question_list=index(question,self.vocab)

        answer_list=index(answer,self.vocab)
        
        #conver to tensor 
        return torch.tensor(question_list),torch.tensor(answer_list)


In [121]:
data=CustomDatase(df,vocab)

In [122]:
from torch.nn.utils.rnn import pad_sequence

In [123]:
#to do padding over the questions and answers embedding 
def collate_fn(batch):
            """
            Custom collate function to pad variable length sequences.
            
            Args:
                batch: A list of tuples, where each tuple is (question_tensor, answer_tensor)
            """
           
            # Step A: Separate the data and the labels
                # We unzip the list of tuples into two separate lists.
                # batch_data will be [tensor([1,2]), tensor([3,4,5,6]), ...]
                # batch_labels will be [0, 1, 0]
 
            question,answer=zip(*batch)
           
            # Step B: Pad the data
            # Since the tensors are different lengths, we cannot stack them yet.
            # pad_sequence will add zeros to the shorter ones so they match the longest one.
            # batch_first=True means the output shape is (Batch_Size, Max_Length)
            question_paded=pad_sequence(question,batch_first=True)
             
            if answer[0].dim()>0:
                answer_padded=pad_sequence(answer,batch_first=True)

            else:
               # Step C: Stack the labels
                # Labels are usually just single numbers (integers), so we can simply 
                # turn the list into a Tensor.
               answer_padded=torch.stack(answer)

            return question_paded,answer_padded 

In [124]:
#DataLoader
training_data_loader=DataLoader(dataset=data,batch_size=32,shuffle=True,pin_memory=True,collate_fn=collate_fn)

In [125]:
for questio,answer in training_data_loader:
    print(questio,answer)

tensor([[ 10,  29, 130, 131,   0,   0,   0,   0,   0,   0],
        [ 42, 320,   2,  62,  63,   3, 321,   5, 322,   0],
        [ 42,  18,   2,   3, 283, 143,   3, 284,   0,   0],
        [ 42,  43,  44,  45,  46,  47,  48,   0,   0,   0],
        [ 10,  96,   3, 104, 241,   0,   0,   0,   0,   0],
        [ 42, 137, 118,   3, 249,   5, 250,   0,   0,   0],
        [ 42, 137,   2,  62,  39,   3, 324, 325,   0,   0],
        [  1,   2,   3,  33,  34,   5, 247,   0,   0,   0],
        [  1,   2,   3,   4,   5,  73,   0,   0,   0,   0],
        [ 78,  79, 196,  81,  19,   3, 197, 198, 199,   0],
        [ 42, 252, 253, 118, 254, 255,   0,   0,   0,   0],
        [ 10,   2,   3,  66,   5,  67,   0,   0,   0,   0],
        [ 10,  11,  12,  13,  14,  15,   0,   0,   0,   0],
        [ 10,  75,  76,   0,   0,   0,   0,   0,   0,   0],
        [ 78,  79, 263, 152,  14, 264, 154,   0,   0,   0],
        [ 42, 137,   2, 138,  39, 139,   0,   0,   0,   0],
        [  1,   2,   3,  37,  38,  39, 1



In [250]:
class RNN(nn.Module):

    def __init__(self,vocab_size):

        super().__init__()

        self.embeddings=nn.Embedding(vocab_size,52)

        self.rnn=nn.RNN(52,30,batch_first=True)

        self.output=nn.Linear(30,vocab_size)


    def forward(self,text):
          
        embeddings=self.embeddings(text)

        hidden_output,final_output=self.rnn(embeddings)

        output=self.output(final_output.squeeze(0))

        return output



In [251]:
#testing the model
t=nn.Embedding(324,52)
z=nn.RNN(52,30)
y=nn.Linear(30,324)

In [252]:
input=data[0][0].reshape(1,6)

print(input.shape)

m=t(input)

print(m.shape)

x,c=z(m)
print(c.shape)

n=y(c)

torch.Size([1, 6])
torch.Size([1, 6, 52])
torch.Size([1, 6, 30])


In [253]:
model=RNN(len(vocab)).to(device)

In [254]:
lr=0.001
optimizer=optim.Adam(model.parameters(),lr=lr)
los=nn.CrossEntropyLoss()
epochs=80

In [255]:

for epocs in range(epochs):

    total_loss=0

    for features,labels in training_data_loader:

        features,labels=features.to(device),labels.to(device)

        #forward propagaton
        output=model(features)

        #loss calculation
        loss=los(output,labels.squeeze())

        #backward propagation
        optimizer.zero_grad()

        loss.backward()

        #optimization
        optimizer.step()

        loss+=loss.item()
        
    print(f'loss for {epocs+1} is {loss:.2f}')




loss for 1 is 11.58
loss for 2 is 11.40
loss for 3 is 11.24
loss for 4 is 11.25
loss for 5 is 11.11
loss for 6 is 10.99
loss for 7 is 10.74
loss for 8 is 10.63
loss for 9 is 10.57
loss for 10 is 10.61
loss for 11 is 10.42
loss for 12 is 10.19
loss for 13 is 10.02
loss for 14 is 9.79
loss for 15 is 9.94
loss for 16 is 9.84
loss for 17 is 9.84
loss for 18 is 9.44
loss for 19 is 9.60
loss for 20 is 9.27
loss for 21 is 9.31
loss for 22 is 9.33
loss for 23 is 9.35
loss for 24 is 9.20
loss for 25 is 9.15
loss for 26 is 9.25
loss for 27 is 9.20
loss for 28 is 9.14
loss for 29 is 9.12
loss for 30 is 9.17
loss for 31 is 8.99
loss for 32 is 8.90
loss for 33 is 8.90
loss for 34 is 8.85
loss for 35 is 8.90
loss for 36 is 8.99
loss for 37 is 8.70
loss for 38 is 8.82
loss for 39 is 8.84
loss for 40 is 8.81
loss for 41 is 8.83
loss for 42 is 8.96
loss for 43 is 8.69
loss for 44 is 8.77
loss for 45 is 8.51
loss for 46 is 9.12
loss for 47 is 8.84
loss for 48 is 9.06
loss for 49 is 8.83
loss for 50 is 8

In [269]:

def prediction(model, question, threshold=0.5):
    # Index the question using the vocabulary
    indexed_question = index(question, vocab=vocab)

    # Convert the indexed question to a tensor and add a batch dimension
    indexed_question = torch.tensor(indexed_question).unsqueeze(0)

    # Ensure the tensor is on the same device as the model
    indexed_question = indexed_question.to(device=device)

    # Make a prediction using the model in inference mode
    predicted = model(indexed_question)

    # Apply softmax to the predicted output
    output = nn.functional.softmax(predicted, dim=1)

    # Get the maximum probability and its corresponding index
    values, indices = torch.max(output, dim=1)
     
    print(values)
    if values<threshold: 
        print('i dont know ')
    else:
       word=list(vocab.keys())[indices]
       print(word)

In [270]:
list(vocab.keys())[36]

'8'

In [271]:
prediction(model,question='Who painted the Mona Lisa?,')

tensor([0.0368], device='mps:0', grad_fn=<MaxBackward0>)
i dont know 


In [260]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from pathlib import Path
import torch.optim as optim

In [35]:
path=Path('/Users/divyyadav/Desktop/pytorch/src/ann,cnn,rnn,lstm/100_Unique_QA_Dataset.csv')

In [36]:
df=pd.read_csv(path)

In [45]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [46]:
def tokenize(text):
    text=text.lower()
    text=text.replace("","")
    text=text.replace("?","")

    return text.split()

In [50]:
vocab={'UNK':0}

In [57]:
def build_vocab(row):

    tokenized_row=tokenization(row['question'])
    tokenized_answer=tokenize(row['answer'])

    merged_tokens=tokenized_row+tokenized_answer

    for token in merged_tokens:
        if token not in vocab:
            vocab[token]=len(vocab)

        else:
            vocab['UNK']=0

    return merged_tokens

In [58]:
df.apply(build_vocab,axis=1)

0           [what, is, the, capital, of, france, paris]
1         [what, is, the, capital, of, germany, berlin]
2     [who, wrote, 'to, kill, a, mockingbird', harpe...
3     [what, is, the, largest, planet, in, our, sola...
4     [what, is, the, boiling, point, of, water, in,...
                            ...                        
85    [who, directed, the, movie, 'titanic', jamesca...
86    [which, superhero, is, also, known, as, the, d...
87       [what, is, the, capital, of, brazil, brasilia]
88    [which, fruit, is, known, as, the, king, of, f...
89    [which, country, is, known, for, the, eiffel, ...
Length: 90, dtype: object

In [60]:
len(vocab)

326

In [62]:
def indexed(text,vocab):

    indexed_list=[]

    for token in tokenization(text):
        if token in vocab:
            indexed_list.append(vocab[token])

        else:
            indexed_list.append(vocab['UNK'])

    return indexed_list

In [64]:
class CustomDatase(Dataset):
    def __init__(self,df,vocab):
        self.df=df
        self.vocab=vocab

    def __len__(self):
        return self.df.shape[0]


    def __getitem__(self,index):

        question=self.df.iloc[index]['question']

        answer=self.df.iloc[index]['answer']

        questions_indexed=indexed(question,self.vocab)

        answer_indexed=indexed(answer,self.vocab)

        return questions_indexed,answer_indexed
        

In [65]:
data=CustomDatase(df=df,vocab=vocab)

In [85]:
len(data)

90

In [67]:
data[0]

([1, 2, 3, 4, 5, 6], [7])

In [70]:
def find_the_maximum_length(data):
    
    all_lengths=[]

    for question,answer in data:

        all_lengths.append(len(question))
    
    return max(all_lengths),print(all_lengths)


In [71]:
find_the_maximum_length(data=data)

[6, 6, 6, 9, 9, 5, 7, 7, 7, 8, 6, 6, 9, 8, 6, 8, 6, 3, 7, 7, 6, 4, 6, 5, 8, 6, 3, 6, 7, 7, 8, 8, 8, 4, 7, 6, 6, 10, 7, 7, 6, 5, 7, 8, 7, 9, 10, 7, 8, 6, 5, 8, 9, 8, 6, 3, 8, 8, 9, 8, 8, 6, 6, 7, 5, 8, 7, 7, 6, 7, 6, 7, 8, 7, 10, 6, 9, 6, 8, 8, 6, 7, 8, 6, 12, 5, 9, 6, 9, 8]


(12, None)

In [None]:
#padding on the left side of the questions
def pad_data(data,max_length):

    for question,answer in data:

        [0]*(max_length-len(question))+question

    