In [1]:
from typing import Union
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import pandas as pd
import numpy as np

In [2]:
# Record the library versions this notebook was executed with.
(torch.__version__, torchtext.__version__)

('1.10.0+cu111', '0.11.0')

In [3]:
%%shell
# Download and unpack the CMU Question-Answer Dataset v1.2 into ./data.
# Abort on the first failing command (or unset variable) so that a broken
# download is not silently followed by a failed extract and a "successful" rm.
set -eu

# Skip the download when the archive has already been extracted, so that
# re-running the notebook does not hit the network again.
if [ ! -d data/Question_Answer_Dataset_v1.2 ]; then
    wget http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz -O QA.tar.gz
    mkdir -p data
    tar -xf QA.tar.gz -C data
    rm QA.tar.gz
fi

--2021-11-27 15:21:33--  http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8254496 (7.9M) [application/x-gzip]
Saving to: ‘QA.tar.gz’


2021-11-27 15:21:43 (895 KB/s) - ‘QA.tar.gz’ saved [8254496/8254496]





# The Data

In [4]:
# Load the S08 question/answer table (tab-separated, ISO-8859-1 encoded)
# and preview the first rows.
df = pd.read_csv(
    "data/Question_Answer_Dataset_v1.2/S08/question_answer_pairs.txt",
    sep="\t",
    encoding="iso-8859-1",
)
df.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,data/set3/a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,data/set3/a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,data/set3/a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,data/set3/a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,data/set3/a4


Looks like the dataset has 6 columns (ArticleTitle, Question, Answer, DifficultyFromQuestioner, DifficultyFromAnswerer and ArticleFile); the ones we are interested in right now are the Question and Answer columns.

In [5]:
# Keep an independent copy of just the two columns we need, then peek at the end.
qa_df = df.loc[:, ["Question", "Answer"]].copy()
qa_df.tail()

Unnamed: 0,Question,Answer
1710,Was Wilson president of the American Political...,Yes
1711,Did he not cast his ballot for John M. Palmer ...,Yes
1712,Did Wilson not spend 1914 through the beginnin...,Yes
1713,"Was Wilson , a staunch opponent of antisemitis...",Yes
1714,What happened in 1917?,"raised billions through Liberty loans, imposed..."


In [6]:
# Number of question/answer rows before cleaning.
qa_df.shape[0]

1715

In [7]:
# Flag every missing cell, reduce to one flag per row, then collect the rows
# that contain at least one missing value.
is_NaN = qa_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
rows_with_NaN = qa_df.loc[row_has_NaN]

In [8]:
# Display the rows that contain at least one missing value.
rows_with_NaN

Unnamed: 0,Question,Answer
36,Do scholars rank lincoln among the top three p...,
38,Did lincoln have 18 months of schooling?,
40,Was Lincoln chosen as a presidential candidate...,
42,How old was Lincoln in 1816?,
44,When was the first photgraph of lincoln taken?,
...,...,...
1457,Who did Sir Thomas Stamford Raffles work for?,
1459,When was Lee Kuan Yew prime minister of Singap...,
1461,What is the punishment for first-degree murder?,
1509,Was Roosevelt's family rich?,


240 of the 1715 rows have NA values. We'll simply drop those rows for now.

In [9]:
# Discard every row that has a missing Question or Answer.
qa_df = qa_df.dropna()

In [10]:
# Number of question/answer rows left after dropping missing values.
len(qa_df.index)

1475

We do the same for S09 and S10: keep only the Question and Answer columns, and drop the rows that have NA values.

# The Dataset

In [11]:
class WikiQA(Dataset):
    """Question/answer pairs from the CMU Question-Answer Dataset v1.2.

    Reads ``question_answer_pairs.txt`` for the S08, S09 and S10 folders under
    ``root/Question_Answer_Dataset_v1.2``, keeps only the ``Question`` and
    ``Answer`` columns, drops rows with missing values, and builds a single
    vocabulary shared by questions and answers.

    Indexing returns ``(question_ids, answer_ids)``: two Python lists of
    vocabulary indices *without* ``<sos>``/``<eos>``. Those markers and the
    padding are added per batch by the callable returned from ``collate_fn``.

    Args:
        root: Directory that contains ``Question_Answer_Dataset_v1.2``.
    """

    # Sub-folders of the dataset that are merged into one table.
    SPLITS = ("S08", "S09", "S10")
    # Special tokens, in the order they are handed to the vocabulary builder.
    SPECIALS = ("<unk>", "<sos>", "<eos>", "<pad>")

    def __init__(self, root: Union[str, Path]):
        super().__init__()
        path = Path(root)

        # Select + dropna() returns a fresh frame for each split; the original
        # in-place dropna on a column slice could raise SettingWithCopyWarning.
        frames = [
            pd.read_csv(
                path / "Question_Answer_Dataset_v1.2" / split / "question_answer_pairs.txt",
                sep="\t",
                encoding="iso-8859-1",
            )[["Question", "Answer"]].dropna()
            for split in self.SPLITS
        ]
        self.df = pd.concat(frames, ignore_index=True)

        self.tokenizer = get_tokenizer("basic_english")

        # One token list per row: question tokens followed by answer tokens.
        token_stream = (
            self.tokenizer(question) + self.tokenizer(answer)
            for question, answer in zip(self.df["Question"], self.df["Answer"])
        )
        self.vocab = build_vocab_from_iterator(token_stream, specials=list(self.SPECIALS))
        self.unk_idx = self.vocab["<unk>"]
        self.eos_idx = self.vocab["<eos>"]
        self.sos_idx = self.vocab["<sos>"]
        self.pad_idx = self.vocab["<pad>"]
        # Tokens that are not in the vocabulary map to <unk>.
        self.vocab.set_default_index(self.unk_idx)

    # The two pipelines are regular methods (not lambdas stored on the
    # instance) so that the dataset object stays picklable, which DataLoader
    # worker processes need when they are started with "spawn".
    def text_pipeline(self, x: str) -> list:
        """Tokenize a question string and map its tokens to vocabulary indices."""
        return self.vocab(self.tokenizer(x))

    def label_pipeline(self, x: str) -> list:
        """Tokenize an answer string and map its tokens to vocabulary indices."""
        return self.vocab(self.tokenizer(x))

    def __getitem__(self, index):
        """Return ``(question_ids, answer_ids)`` for the row at position ``index``."""
        row = self.df.iloc[index]
        return self.text_pipeline(row["Question"]), self.label_pipeline(row["Answer"])

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.df)

    def collate_fn(self):
        """Return the callable to pass as ``DataLoader(collate_fn=...)``.

        The returned callable takes a list of ``(question_ids, answer_ids)``
        items and returns ``(texts, labels, lengths)``:

        * ``texts``  - LongTensor ``(max_question_len, batch)``; every column is
          ``<sos> question <eos>`` padded with ``<pad>``.
        * ``labels`` - LongTensor ``(max_answer_len, batch)``; same layout for
          the answers.
        * ``lengths`` - LongTensor ``(batch,)``; number of non-pad positions in
          each column of ``texts`` (i.e. including ``<sos>`` and ``<eos>``).
        """
        # A bound method rather than a nested closure, so it can be pickled.
        return self._collate_batch

    def _wrap_and_pad(self, sequences):
        """Wrap each index list in <sos>/<eos>, pad to a common length; also return the wrapped lengths."""
        # dtype is explicit: torch.tensor([]) would otherwise be float32 and
        # turn an empty sequence (and potentially the whole batch) into floats.
        wrapped = [
            torch.tensor([self.sos_idx, *seq, self.eos_idx], dtype=torch.long)
            for seq in sequences
        ]
        lengths = torch.tensor([len(seq) for seq in wrapped], dtype=torch.long)
        padded = torch.nn.utils.rnn.pad_sequence(
            wrapped, padding_value=self.pad_idx, batch_first=False
        )
        return padded, lengths

    def _collate_batch(self, batch):
        """Collate a list of ``(question_ids, answer_ids)`` items into padded tensors."""
        texts, labels = zip(*batch)
        # Lengths are measured after adding <sos>/<eos> so that they agree
        # with the sequences actually stored in `texts`.
        texts, lengths = self._wrap_and_pad(texts)
        labels, _ = self._wrap_and_pad(labels)
        return texts, labels, lengths


Create the dataset and the dataloader.

In [12]:
# Build the dataset from ./data and batch it with the dataset's own collate callable.
dataset = WikiQA(root="data")
loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=dataset.collate_fn(),
)

In [13]:
# Pull the first batch from the loader and report the shape of each returned tensor.
texts, labels, lengths = next(iter(loader))
tuple(tensor.shape for tensor in (texts, labels, lengths))

(torch.Size([13, 16]), torch.Size([16, 16]), torch.Size([16]))

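Let's look at the first batch. The tensors are laid out as (sequence length, batch size), so each printed line below is one token position across all 16 samples; read down a column to follow a single sample.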

In [14]:
# `texts` is (seq_len, batch), so each iteration yields one token position
# across the whole batch; every printed line therefore has one token per sample.
# The index -> token list is fetched once instead of once per printed token.
itos = dataset.vocab.get_itos()
for step in texts:
    print(" ".join(itos[idx] for idx in step.tolist()))

<sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos>
was was did did did did how how when when what what who who when when
abraham abraham lincoln lincoln his his many many did did did did suggested suggested did did
lincoln lincoln sign sign mother mother long long lincoln lincoln the the lincoln lincoln the the
the the the the die die was was begin begin legal legal grow grow gettysburg gettysburg
sixteenth sixteenth national national of of lincoln lincoln his his tender tender a a address address
president president banking banking pneumonia pneumonia ' ' political political act act beard beard argue argue
of of act act ? ? s s career career of of ? ? that that
the the of of <eos> <eos> formal formal ? ? 1862 1862 <eos> <eos> america america
united united 1863 1863 <pad> <pad> education education <eos> <eos> establish establish <pad> <pad> was was
states states ? ? <pad> <pad> ? ? <pad> <pad> ? ? <pad> <pad> born born
? ? <eos> <eos> <pad> 

In [15]:
# `labels` is (seq_len, batch), so each iteration yields one token position
# across the whole batch; every printed line therefore has one token per sample.
# The index -> token list is fetched once instead of once per printed token.
itos = dataset.vocab.get_itos()
for step in labels:
    print(" ".join(itos[idx] for idx in step.tolist()))

<sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos> <sos>
yes yes yes yes no no 18 18 1832 1832 the the 11-year-old grace 1776 1776
<eos> . <eos> . <eos> . months months <eos> . united united grace bedell <eos> .
<pad> <eos> <pad> <eos> <pad> <eos> <eos> . <pad> <eos> states states bedell . <pad> <eos>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <eos> <pad> <pad> note note <eos> <eos> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> , , <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> the the <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> first first <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> paper paper <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> currency currency <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> in in <pad> <pad> <pad> <pad>
