## Dataset

This section describes the process to load the dataset used to train and test the model. The dataset I am using on this project is just as stupid as the network. The idea is just to learn more about recurrent neural networks.

In [2]:
import pandas as pd

In [9]:
dataset_path = "../../dataset/data.csv"
data = pd.read_csv(dataset_path, header=0)
data.head()

Unnamed: 0,question,answer
0,how are you?,good
1,how are you?,sad
2,how are you?,upset
3,how old are you?,23 years old
4,how old are you?,9 years old


This dataset is not large enough to justify using the PyTorch `Dataset` utility class. However, I will use it.

In [128]:
import torch
from torch.utils.data.dataset import Dataset
import pandas as pd
import numpy as np


class StupidBotDataset(Dataset):
    def __init__(self, csv_path):
        self.data = pd.read_csv(csv_path, header=0)
        self.questions = self.data["question"]
        self.answers = self.data["answer"]
        self.data_len = len(self.data.index)
        
        # Unique characters in the database.
        self.unique_characters = set("".join(self.questions + self.answers))
        self.unique_characters_length = len(self.unique_characters)
        # Map int to character.
        self.int2char = dict(enumerate(self.unique_characters))
        # Map character to int.
        self.char2int = {char: i for i, char in self.int2char.items()}
        
        # Longer question.
        longer_question_length = len(max(self.questions, key=len))
        # Longer answer.
        longer_answer_length = len(max(self.answers, key=len))
        
        # Pad strings.
        self.questions = self.questions.str.pad(longer_question_length, side="right")
        self.answers = self.answers.str.pad(longer_answer_length, side="right")

    def __getitem__(self, index):
        x = self.questions[index]
        # Map text to int.
        x = self.text2int(x)
        # One-hot encode x.
        x = self.one_hot_encode(x)
        x = torch.tensor(x).cuda()
        
        y = self.answers[index]
        # Map text to int.
        y = self.text2int(y)
        # One-hot encode y.
        y = self.one_hot_encode(y)
        y = torch.tensor(y).cuda()
        return x, y

    def __len__(self):
        return self.data_len
    
    def text2int(self, text):
        return [self.char2int[c] for c in text]
    
    def one_hot_encode(self, sequence):
        encoded = np.zeros([self.unique_characters_length, len(sequence)], dtype=int)
        for i, character in enumerate(sequence):
            encoded[character][i] = 1
        return encoded
    
    def one_hot_decode(self, sequence):
        """
            sequence: expected to be a PyTorch tensor.
        """
        return [np.argmax(x) for x in sequence.numpy().T]

In [129]:
dataset = StupidBotDataset(dataset_path)
dataset[1]

(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0