In [5]:
import pandas as pd
import numpy as np
import torch 
import torch.optim as optim
from collections import Counter
import torch.nn as nn
from pathlib import Path

In [157]:
torch.manual_seed(43)

<torch._C.Generator at 0x10ed0a8d0>

In [158]:
if torch.backends.mps.is_available:
    device=torch.device('mps')
else:
    device=torch.device('cpu')

In [159]:
device

device(type='mps')

In [11]:
path=Path('/Users/divyyadav/Desktop/pytorch/src/ann,cnn,rnn,lstm/100_Unique_QA_Dataset.csv')

In [12]:
df=pd.read_csv(path)

In [13]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [19]:
df.info,df.describe

(<bound method DataFrame.info of                                              question        answer
 0                      What is the capital of France?         Paris
 1                     What is the capital of Germany?        Berlin
 2                  Who wrote 'To Kill a Mockingbird'?    Harper-Lee
 3     What is the largest planet in our solar system?       Jupiter
 4      What is the boiling point of water in Celsius?           100
 ..                                                ...           ...
 85                  Who directed the movie 'Titanic'?  JamesCameron
 86  Which superhero is also known as the Dark Knight?        Batman
 87                     What is the capital of Brazil?      Brasilia
 88        Which fruit is known as the king of fruits?         Mango
 89       Which country is known for the Eiffel Tower?        France
 
 [90 rows x 2 columns]>,
 <bound method NDFrame.describe of                                              question        answer
 0        

In [None]:
#performing tokenization
def tokenization(text):
    lower_text=text.lower()
    text=lower_text.replace('','')
    text=lower_text.replace('?','')
    return text.split()

In [87]:
vocab={'<UNK>':0}

In [88]:
#making vocabulary
def  build_vocab(row):

    tokenized_row=tokenization(row['question'])
    tokenized_question=tokenization(row['answer'])

    merged_tokens=tokenized_row+tokenized_question
    
    for token in merged_tokens:

        if token not in vocab:
            
            vocab[token]=len(vocab)


In [89]:
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [92]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 "'to": 12,
 'kill': 13,
 'a': 14,
 "mockingbird'": 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 "'1984'": 67,
 'george-orwell': 68,
 'currency': 69,
 '

In [91]:
len(vocab)

326

In [138]:
# a functions that returns the indice value from the text after iterating the tokens 
def index(text,vocab):
    
    indexed_text=[]

    for tokens in tokenization(text):

        if tokens in vocab:
            indexed_text.append(vocab[tokens])

        else:
            indexed_text.append(vocab['<UNK>'])

    return indexed_text
    

In [139]:
#now we need to convert the question and answer from the data to indexs

from torch.utils.data import Dataset,DataLoader

In [153]:
class CustomDatase(Dataset):

    def __init__(self,df,vocab):
        super().__init__()
        self.text=df
        self.vocab=vocab

    def __len__(self):
        return self.text.shape[0]
    
    def __getitem__(self,indx):

        question=self.text.iloc[indx]['question']
      
        answer=self.text.iloc[indx]['answer']

        question_list=index(question,self.vocab)

        answer_list=index(answer,self.vocab)
        
        #conver to tensor 
        return torch.tensor(question_list),torch.tensor(answer_list)


In [154]:
data=CustomDatase(df,vocab)

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
#to do padding over the questions and answers embedding 
def collate_fn(batch):
            """
            Custom collate function to pad variable length sequences.
            
            Args:
                batch: A list of tuples, where each tuple is (question_tensor, answer_tensor)
            """
           
            # Step A: Separate the data and the labels
                # We unzip the list of tuples into two separate lists.
                # batch_data will be [tensor([1,2]), tensor([3,4,5,6]), ...]
                # batch_labels will be [0, 1, 0]
 
            question,answer=zip(*batch)
           
            # Step B: Pad the data
            # Since the tensors are different lengths, we cannot stack them yet.
            # pad_sequence will add zeros to the shorter ones so they match the longest one.
            # batch_first=True means the output shape is (Batch_Size, Max_Length)
            question_paded=pad_sequence(question,batch_first=True)
             
            if answer[0].dim()>0:
                answer_padded=pad_sequence(answer,batch_first=True)

            else:
               # Step C: Stack the labels
                # Labels are usually just single numbers (integers), so we can simply 
                # turn the list into a Tensor.
               answer_padded=torch.stack(answer)

            return question_paded,answer_padded 

In [243]:
#DataLoader
training_data_loader=DataLoader(dataset=data,batch_size=32,shuffle=True,pin_memory=True,collate_fn=collate_fn)

In [244]:
for questio,answer in training_data_loader:
    print(questio,answer)

tensor([[  1,   2,   3,   4,   5, 113,   0,   0,   0],
        [  1,   2,   3,  33,  34,   5,  35,   0,   0],
        [  1,   2,   3, 141, 117,  83,   3, 279, 280],
        [  1,   2,   3,  69,   5,   3,  70,  71,   0],
        [ 42, 137,   2, 138,  39, 176, 271,   0,   0],
        [  1,   2,   3,  69,   5, 156,   0,   0,   0],
        [  1,  87, 230, 231, 232, 233,   0,   0,   0],
        [ 42, 265, 266,  14, 267, 268, 159, 269,   0],
        [  1,   2,   3, 235,   5, 236,   0,   0,   0],
        [  1,   2,   3,   4,   5, 281,   0,   0,   0],
        [ 10,  75, 111,   0,   0,   0,   0,   0,   0],
        [ 10,  29,   3,  30,  31,   0,   0,   0,   0],
        [ 78,  79, 151, 152,  14, 153, 154,   0,   0],
        [ 10,  11,  12,  13,  14,  15,   0,   0,   0],
        [ 42, 137,   2, 138,  39, 139,   0,   0,   0],
        [  1,   2,   3,  37,  38,  39, 162,   0,   0],
        [  1,   2,   3,   4,   5, 288,   0,   0,   0],
        [  1,   2,   3, 222,   5, 223, 224, 225,   0],
        [ 

In [255]:
class RNN(nn.Module):

    def __init__(self,vocab_size):

        super().__init__()

        self.embeddings=nn.Embedding(vocab_size,52)

        self.rnn=nn.RNN(52,30,batch_first=True)

        self.output=nn.Linear(30,vocab_size)


    def forward(self,text):
          
        embeddings=self.embeddings(text)

        hidden_output,final_output=self.rnn(embeddings)

        output=self.output(final_output.squeeze(0))

        return output



In [256]:
#testing the model
t=nn.Embedding(324,52)
z=nn.RNN(52,30)
y=nn.Linear(30,324)

In [257]:
input=data[0][0].reshape(1,6)

print(input.shape)

m=t(input)

print(m.shape)

x,c=z(m)
print(c.shape)

n=y(c)

torch.Size([1, 6])
torch.Size([1, 6, 52])
torch.Size([1, 6, 30])


In [258]:
model=RNN(len(vocab)).to(device)

In [259]:
lr=0.001
optimizer=optim.Adam(model.parameters(),lr=lr)
los=nn.CrossEntropyLoss()
epochs=2

In [260]:

for epocs in range(epochs):

    total_loss=0

    for features,labels in training_data_loader:

        features,labels=features.to(device),labels.to(device)

        #forward propagaton
        output=model(features)

        #print(output.shape,labels.shape)
        features

        #loss calculation
        loss=los(output,labels)

        #backward propagation
        optimizer.zero_grad()

        loss.backward()

        #optimization
        optimizer.step()

        loss+=loss.item()
        
    print(f'loss for {epocs+1} is {loss:.2f}')




RuntimeError: 0D or 1D target tensor expected, multi-target not supported