In [13]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

## Load Data & EDA

In [2]:
# Load the combined NewsQA CSV (path is relative to the notebook's working directory).
data = pd.read_csv('combined-newsqa-data-v1.csv')
print(data.columns)
# The flag column is read as strings (e.g. '0.0'), not floats, which is why the
# disabled filter below compares against the string '0.0'.
print(type(data['is_question_bad'][0]))

# remove Q/A pairs that are invalid or missing
# NOTE(review): the filter is commented out, so rows flagged as bad questions or
# absent answers are currently kept in `data`.
#data = data[(data.is_question_bad=='0.0') & (data.is_answer_absent=='0.0')]
data.head()

Index(['story_id', 'question', 'answer_char_ranges', 'is_answer_absent',
       'is_question_bad', 'validated_answers', 'story_text'],
      dtype='object')
<class 'str'>


Unnamed: 0,story_id,question,answer_char_ranges,is_answer_absent,is_question_bad,validated_answers,story_text
0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,What was the amount of children murdered?,294:297|None|None,0.0,0.0,"{""none"": 1, ""294:297"": 2}","NEW DELHI, India (CNN) -- A high court in nort..."
1,./cnn/stories/c48228a52f26aca65c31fad273e66164...,Where was one employee killed?,34:60|1610:1618|34:60,0.0,0.0,,(CNN) -- Fighting in the volatile Sudanese reg...
2,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,who did say South Africa did not issue a visa ...,103:127|114:127|839:853,0.0,0.0,"{""839:853"": 1, ""103:127"": 2}",Johannesburg (CNN) -- Miffed by a visa delay t...
3,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,How many years old was the businessman?,538:550|538:550,0.0,0.0,,(CNN) -- England international footballer Ste...
4,./cnn/stories/13012604e3203c18df09289dfedd14cd...,What frightened the families?,690:742|688:791|630:646,0.0,0.0,"{""688:791"": 2, ""690:742"": 1}","BAGHDAD, Iraq (CNN) -- At least 6,000 Christi..."


In [3]:
print(data['is_question_bad'][0])

0.0


In [4]:
# 119,633 Q/A's , 12088 articles
print(len(data))
print(len(data['story_text'].unique()))


119633
12088


In [5]:
first_doc = data['story_text'][0]
first_doc

'NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."\n\n\n\nMoninder Singh Pandher was sentenced to death by a lower court in February.\n\n\n\nThe teen was one of 19 victims -- children and young women -- in one of the most gruesome serial killings in India in recent years.\n\n\n\nThe Allahabad high court has acquitted Moninder Singh Pandher, his lawyer Sikandar B. Kochar told CNN.\n\n\n\nPandher and his domestic employee Surinder Koli were sentenced to death in February by a lower court for the rape and murder of the 14-year-old.\n\n\n\nThe high court upheld Koli\'s death sentence, Kochar said.\n\n\n\nThe two were arrested two years ago after body parts packed in plastic bags were found near their home in Noida, a New Delhi suburb. Their home was later dubbed a "house of horrors" by the Indian media.\n\n\n\nPandher was not named a main suspect by 

In [6]:
first_doc[294:297]

'19 '

In [7]:
second_doc = data['story_text'][2]
second_doc

'Johannesburg (CNN) -- Miffed by a visa delay that led the Dalai Lama to cancel a trip to South Africa, Archbishop Desmond Tutu lashed out at his government Tuesday, saying it had acted worse than apartheid regimes and had forgotten all that the nation stood for.\n\n\n\n\n\n"When we used to apply for passports under the apartheid government, we never knew until the last moment what their decision was," Tutu said at a news conference. "Our government is worse than the apartheid government because at least you were expecting it from the apartheid government.\n\n\n\n\n\n"I have to say that I can\'t believe this. I really can\'t believe this," Tutu said. "You have to wake me up and tell me this is actually happening here."\n\n\n\n\n\nThe Dalai Lama scrapped his planned trip to South Africa this week after the nation failed to issue him a visa in time, his spokesman said.\n\n\n\n\n\nVisa applications for him and his entourage were submitted to the South African High Commission in New Delhi,

In [8]:
print(second_doc[103:127])
print(second_doc[114:127])
print(second_doc[839:853])

Archbishop Desmond Tutu 
Desmond Tutu 
his spokesman 


Answers given by different human reviewers are stored in the `answer_char_ranges` and `validated_answers` columns. The values are character index ranges (`start:end`) into the story text that mark the answer span. For each row, we need to determine which character range is the best answer and extract it. There are a couple of ways we could approach this problem. The first is to have a model that takes the question (str) and the story (str) and outputs an answer (str).

Alternatively, we could have the model output the character indexes of the answer span themselves.

In [9]:
# Keep only the columns needed for modelling
needed_columns = ['question', 'answer_char_ranges', 'validated_answers', 'story_text']
# First 100 rows only, to keep development iterations fast
df = data[needed_columns].iloc[:100]
df.head()

Unnamed: 0,question,answer_char_ranges,validated_answers,story_text
0,What was the amount of children murdered?,294:297|None|None,"{""none"": 1, ""294:297"": 2}","NEW DELHI, India (CNN) -- A high court in nort..."
1,Where was one employee killed?,34:60|1610:1618|34:60,,(CNN) -- Fighting in the volatile Sudanese reg...
2,who did say South Africa did not issue a visa ...,103:127|114:127|839:853,"{""839:853"": 1, ""103:127"": 2}",Johannesburg (CNN) -- Miffed by a visa delay t...
3,How many years old was the businessman?,538:550|538:550,,(CNN) -- England international footballer Ste...
4,What frightened the families?,690:742|688:791|630:646,"{""688:791"": 2, ""690:742"": 1}","BAGHDAD, Iraq (CNN) -- At least 6,000 Christi..."


In [10]:
# Preview: print the question and the first candidate answer for the first row only
# (the loop exits after one iteration).
for i, row in df.iterrows():
    # answer_char_ranges looks like "294:297|None|None"; take the first "start:end" entry
    first_range = row['answer_char_ranges'].split('|')[0].split(':')
    print(first_range)
    start, end = int(first_range[0]), int(first_range[1])
    print(row['question'])
    print(row['story_text'][start:end])
    break

['294', '297']
What was the amount of children murdered?
19 


Maximum length (in characters) of the story text and of the questions

In [11]:
max(len(x) for x in df['story_text'])

7208

In [12]:
max(len(x) for x in df['question'])

331

In [53]:
type(df['validated_answers'][0])

str

## Pre-trained glove embeddings

In [15]:
def loadGloveModel(gloveFile="glove.6B.300d.txt"):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    gloveFile : str
        Path to the vector file. It is read as UTF-8.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D float numpy array of its components.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word_vecs = {}
    # The context manager closes the file; the explicit encoding avoids
    # depending on the platform's default codec.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Skip blank lines instead of failing on splitLine[0].
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [16]:
word_vecs = loadGloveModel()

## Create vocabulary

In [17]:
# Matches HTML line-break tags such as <br>, <br/>, <BR /> or < br >.
re_br = re.compile(r'<\s*br\s*/?>', flags=re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    without_tags = re_br.sub("\n", x)
    return without_tags

# NOTE(review): the 'en' shortcut name is believed to have been removed in spaCy v3
# (full package names such as 'en_core_web_sm' are required there) — confirm against
# the installed spaCy version.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize `x` with the spaCy tokenizer and return the token strings.

    HTML line-break tags are first converted to newlines via `sub_br`.
    """
    cleaned = sub_br(x)
    return [token.text for token in my_tok.tokenizer(cleaned)]

In [19]:
spacy_tok(df['question'][0])

['What', 'was', 'the', 'amount', 'of', 'children', 'murdered', '?']

In [23]:
word_count = Counter()

In [24]:
# Accumulate token frequencies over every story and question (story first, then
# question, for each row). Re-running this cell without resetting `word_count`
# adds the counts a second time.
for i, row in df.iterrows():
    for column in ('story_text', 'question'):
        word_count.update(spacy_tok(row[column]))

In [25]:
len(word_count.keys())

9658

In [26]:
# Drop tokens that are both rare (fewer than 5 occurrences) and missing from the
# pre-trained embeddings; everything else stays in the vocabulary.
rare_unknown = [
    token for token, count in word_count.items()
    if count < 5 and token not in word_vecs
]
for token in rare_unknown:
    del word_count[token]
len(word_count)

7066

In [28]:
# Index 0 is the padding token "" and index 1 is the unknown-word token "UNK";
# the counted tokens follow in the iteration order of `word_count`.
words = ["", "UNK"] + list(word_count)
vocab2index = {token: position for position, token in enumerate(words)}

In [29]:
len(words)

7068

## Create embeddings

In [31]:
def random_word_vector(D=300):
    """Draw a random vector for a word that has no pre-trained embedding.

    Each of the `D` components is sampled uniformly from [-0.25, 0.25) using
    numpy's global random state. The 0.25 bound was chosen by the original
    author so that these vectors have approximately the same variance as the
    pre-trained ones.
    """
    half_width = 0.25
    return np.random.uniform(low=-half_width, high=half_width, size=D)

In [32]:
def create_embedding_matrix(word_vecs, vocab2index, words, D=300):
    """Build the embedding matrix for the vocabulary.

    Parameters
    ----------
    word_vecs : dict
        Maps a token to its pre-trained vector (length `D`).
    vocab2index : dict
        Not used by this function; kept so existing callers keep working.
    words : list of str
        Vocabulary in index order. Position 0 is treated as the padding token.
    D : int
        Embedding dimension.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(words), D). Row 0 is all zeros; every other
        row holds the pre-trained vector when one exists, otherwise a random
        vector from `random_word_vector`.
    """
    V = len(words)
    # np.zeros already leaves row 0 (padding) as the zero vector.
    W = np.zeros((V, D), dtype="float32")
    for i in range(1, V):
        if words[i] in word_vecs:
            W[i] = word_vecs[words[i]]
        else:
            # Forward D so the random vector matches the matrix width for any D,
            # not only the default of 300.
            W[i] = random_word_vector(D)
    return W

In [33]:
embedding_matrix = create_embedding_matrix(word_vecs, vocab2index, words)
embedding_matrix.shape

(7068, 300)

In [67]:
# Note: spacy_tok is slow, so encode each text once and reuse the result.
def encode_text(text, vocab2index, N=400, padding_start=True):
    """Turn `text` into a fixed-length int32 array of vocabulary ids.

    The text is tokenized with `spacy_tok`; tokens missing from `vocab2index`
    map to the id of "UNK". Only the first `N` tokens are kept, and unused
    positions are filled with 0.

    Despite its name, `padding_start=True` writes the tokens at the START of
    the array (zeros fill the end), while `padding_start=False` writes them at
    the END (zeros fill the start).
    """
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in spacy_tok(text)])
    n_kept = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=np.int32)
    # Choose which end of the output array receives the kept tokens.
    window = slice(0, n_kept) if padding_start else slice(N - n_kept, N)
    encoded[window] = token_ids[:n_kept]
    return encoded

In [68]:
text = df['story_text'][0]
encode_text(text, vocab2index, N=400)

array([  1,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         3,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  20,
        24,  25,  16,  26,  11,  16,  27,  28,  29,  20,  30,  25,  31,
        32,  29,  33,   1,   1,  34,  35,  36,  37,  21,  38,  16,  39,
        10,  11,  40,  32,  33,  41,  26,  35,  42,  25,  43,  44,   7,
        45,  46,  47,  48,   7,  11,  42,  25,  20,  49,  50,  51,  52,
        11,   3,  11,  53,  54,  32,  33,  41,   1,   9,  10,  55,  15,
         1,   1,  34,   2,  56,  57,   1,   1,   1,  58,   5,  32,  33,
        34,  46,  56,  59,  60,   1,   1,  61,  36,  37,  21,  11,  40,
        38,  16,  39,  10,  23,  20,  62,  46,  63,  25,  20,  64,  65,
        66,  32,  33,  41,   9,  10,  67,   1,  68,  21,  22,   2,   1,
        69,  32,  33,  41,  70,  61,  71,  70,  54,  72,  73,  74,  75,
        76,  11,  77,  78,  61,  79,  80,  81,  82,  11,   1,   2,  16,
        83,   1,  84,  32,   1,  82,  35,  85,  28,  16,  29,  3

## Train/val/test split

In [42]:
from sklearn.model_selection import train_test_split

In [54]:
# Hold out 10% of the rows as the test set; the fixed seed keeps the split reproducible.
features = df[['question', 'story_text']]
targets = df['answer_char_ranges']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.1, random_state=42
)

In [55]:
# Carve the validation set out of the remaining training rows (20% of them).
# This overwrites X_train / y_train, so re-running this cell on its own shrinks
# the training set again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

## Dataset

Notes:

- I have only included the `answer_char_ranges` column as the y variable. We can change it if needed!
- I am returning the encoded question, story, and y variable separately, but we can put the question and story text together if needed depending on our model architecture.

In [69]:
class NewsQADataset(Dataset):
    """Dataset of encoded (question, story) pairs with their answer ranges.

    All rows are encoded up front with `encode_text`, using the module-level
    `vocab2index`. Each item is a tuple
    `(question_ids, story_ids, answer_ranges)`, where the two id arrays have
    length `N` and `answer_ranges` is the row's `answer_char_ranges` string
    split on '|'.

    NOTE(review): `answer_ranges` is a list of strings whose length differs
    between rows; batching it with the DataLoader's default collation may need
    a custom `collate_fn` — confirm before training.
    """

    def __init__(self, X, y, N=400, padding_start=False):
        # One list of "start:end" strings (or "None") per example.
        self.y = [ranges.split('|') for ranges in y]
        self.X = []
        for _, row in X.iterrows():
            question_ids = encode_text(row['question'], vocab2index, N, padding_start)
            story_ids = encode_text(row['story_text'], vocab2index, N, padding_start)
            self.X.append([question_ids, story_ids])

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        question_ids, story_ids = self.X[idx]
        return question_ids, story_ids, self.y[idx]

In [70]:
train_ds = NewsQADataset(X_train, y_train)
valid_ds = NewsQADataset(X_valid, y_valid)

In [71]:
batch_size = 1000
# Only the training loader shuffles; the validation loader keeps dataset order.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=batch_size)

In [72]:
train_ds[1]

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0