##### Recurrent Neural Networks  
RNNs are a type of neural networks designed for processing sequential data.  
They maintain a memory of previous inputs.  
It excels in usecases where context and order matter.  

Flow: 
- Read textual data  
- Convert text data into embeddings(words into numbers)  
- create a model  
- train the model  

In [9]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [92]:
df = pd.read_csv("data/100_Unique_QA_Dataset (1).csv")
df.head()
new_data = {
    "Who is the most beautiful woman in the world?": "Aayushi",
    "Whom does Geet love the most?": "Aayushi",
    "Who makes Geet's heart skip a beat?": "Aayushi",
    "Who is the queen of Geet's world?": "Aayushi",
    "Who has the most gorgeous smile ever?": "Aayushi",
    "Who lights up Geet's life like a thousand suns?": "Aayushi",
    "Who is the one Geet can't live without?": "Aayushi",
    "Who is Geet's forever crush?": "Aayushi",
    "Who rules Geet's heart?": "Aayushi",
    "Who is Geet's definition of perfection?": "Aayushi",
    "Who deserves all the love poems in the world?": "Aayushi",
    "Who is the sunshine on a rainy day for Geet?": "Aayushi",
    "Who is the love of Geet’s life?": "Aayushi",
    "Who is the most special person to Geet?": "Aayushi",
    "Who does Geet adore the most?": "Aayushi",
    "Who is the only name in Geet’s playlist of love?": "Aayushi",
    "If Geet had one wish, who would he wish for?": "Aayushi",
    "Who is the most magical woman ever?": "Aayushi",
    "Who makes Geet smile just by existing?": "Aayushi",
    "Who is the star of every dream Geet has?": "Aayushi",
}

new_df = pd.DataFrame(list(new_data.items()), columns=["question", "answer"])

df = pd.concat([df, new_df], ignore_index=True)


In [93]:
def tokenize(text):
    text= text.lower()
    text = text.replace("'", "")
    text = text.replace('"', '')
    text = text.replace("?", "")
    text = text.split(" ")
    return text

def create_vocab(df):
    create_vocab = set()
    for i in range(len(df)):
        question = tokenize(df['question'][i])
        answer = tokenize(df['answer'][i])
        create_vocab.update(question)
        create_vocab.update(answer)
    return list(create_vocab)

In [94]:
vocab = create_vocab(df)
def vocab_to_dict(vocab):
    vocab_dict = {'<unk>': 0}
    for idx, word in enumerate(vocab):
        vocab_dict[word] = idx + 1
    
    return vocab_dict

vocab_dict = vocab_to_dict(vocab)
vocab_dict

{'<unk>': 0,
 'from': 1,
 'albert-einstein': 2,
 'whale': 3,
 'main': 4,
 'discovered': 5,
 'breathe': 6,
 'color': 7,
 'net,': 8,
 'kill': 9,
 'tower': 10,
 'author': 11,
 'romeo': 12,
 'did': 13,
 'directed': 14,
 '64': 15,
 'room': 16,
 'tokyo': 17,
 'united': 18,
 '28': 19,
 'humans': 20,
 'pinocchio': 21,
 'body': 22,
 'in': 23,
 'lying': 24,
 'spain': 25,
 'germany': 26,
 'cant': 27,
 'electricity': 28,
 'common': 29,
 'step': 30,
 'rainy': 31,
 'pound': 32,
 'how': 33,
 'geet’s': 34,
 'would': 35,
 'great': 36,
 'margaretthatcher': 37,
 'brasilia': 38,
 'gas': 39,
 '1': 40,
 'disney': 41,
 'mango': 42,
 'end': 43,
 'developed': 44,
 'gold': 45,
 'most': 46,
 'iron': 47,
 'freezing': 48,
 'adult': 49,
 'gorgeous': 50,
 'year': 51,
 'forever': 52,
 'every': 53,
 'substance': 54,
 'root': 55,
 'arms': 56,
 'long': 57,
 'show': 58,
 'sunshine': 59,
 'india': 60,
 'collect': 61,
 'formula': 62,
 'many': 63,
 'first': 64,
 'study': 65,
 'to': 66,
 '1945': 67,
 'bees': 68,
 'creature':

In [95]:
def text_to_index(text, vocab_dict):
    text = tokenize(text)
    text_index = []
    for word in text:
        if word in vocab_dict.keys():
            text_index.append(vocab_dict[word])
        else:
            text_index.append(vocab_dict['<unk>'])
    if len(text_index) == 1:
        return text_index[0]
    else:
        return text_index

text_to_index("WHICH is the third Planet", vocab_dict)

[124, 128, 186, 0, 317]

In [96]:
df['question'] = df['question'].apply(lambda x: text_to_index(x, vocab_dict))
df['answer'] = df['answer'].apply( lambda x: text_to_index(x, vocab_dict))
df.head()

Unnamed: 0,question,answer
0,"[232, 128, 186, 283, 135, 179]",88
1,"[232, 128, 186, 283, 135, 26]",145
2,"[142, 161, 66, 9, 75, 215]",169
3,"[232, 128, 186, 289, 317, 23, 225, 156, 210]",89
4,"[232, 128, 186, 202, 99, 135, 281, 23, 101]",141


In [97]:
class qadatset(Dataset):
    def __init__(self, df):
        self.df = df
        self.features = df.question
        self.labels = df.answer
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return torch.tensor(self.features[idx]), torch.tensor(self.labels[idx])

train_dataset = qadatset(df)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)


In [98]:
for idx, (data, label) in enumerate(train_dataloader):
    print(f"Batch {idx+1}")
    print("Data:", data)
    print("Label:", label.shape)
    break

Batch 1
Data: tensor([[124, 326, 128, 352,  66, 186,  36, 212]])
Label: torch.Size([1])


In [99]:
class myrnnmodel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.embedding =   nn.Embedding(
                num_embeddings=vocab_size, 
                embedding_dim=64
            )
        self.rnn = nn.RNN(input_size=64,hidden_size=64, batch_first=True)
        self.fc = nn.Linear(64, vocab_size)

    def forward(self, x):
        out = self.embedding(x)
        _, out = self.rnn(out)
        out = self.fc(out.squeeze(0))
        return out

vocab_size = len(vocab_dict)
model = myrnnmodel(vocab_size)
model

myrnnmodel(
  (embedding): Embedding(377, 64)
  (rnn): RNN(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=377, bias=True)
)

In [100]:
x = train_dataset[45][0]
# print(x)
embedding = nn.Embedding(
                num_embeddings=vocab_size, 
                embedding_dim=64
            )
out = embedding(x)
print(out.shape)
rnn = nn.RNN(64,64)
_, out = rnn(out)
final = nn.Linear(64, vocab_size)
out = final(out)
out.shape

torch.Size([9, 64])


torch.Size([1, 377])

In [101]:
epochs = 100
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    for batch_features, batch_labels in train_dataloader:
        y_pred = model(batch_features)
        loss = loss_fn(y_pred, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch%10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 6.008917331695557
Epoch 11, Loss: 0.426745742559433
Epoch 21, Loss: 0.05934172496199608
Epoch 31, Loss: 0.007238590624183416
Epoch 41, Loss: 0.011146065779030323
Epoch 51, Loss: 0.0008330450509674847
Epoch 61, Loss: 0.004755496513098478
Epoch 71, Loss: 0.0031767638865858316
Epoch 81, Loss: 0.0012318650260567665
Epoch 91, Loss: 0.0004919749335385859


In [118]:
question = "Who is the Geet's crush?"
question_index = text_to_index(question, vocab_dict)
question_index = torch.tensor(question_index).unsqueeze(0)
question_index.shape
y_pred = model(question_index)
y_pred = y_pred.argmax(dim=1)
answer = [k for k, v in vocab_dict.items() if v == y_pred.item()]
print(f"Question: {question}")
print(f"Predicted Answer: {answer[0] if answer else 'No answer found'}")

Question: Who is the Geet's crush?
Predicted Answer: aayushi
