In [1]:
import pickle
import numpy as np

In [2]:
with open('../06-Deep-Learning/train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [3]:
with open('../06-Deep-Learning/test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [4]:
len(train_data)

10000

In [5]:
len(test_data)

1000

In [8]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [9]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [12]:
train_data[0][2]

'no'

In [13]:
all_data = test_data + train_data

In [14]:
len(all_data)

11000

In [18]:
vocab = set(['no', 'yes'])

for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

len(vocab)

37

In [20]:
# for padding later on
vocab_len = len(vocab) + 1

In [24]:
# determining the longest story and question
all_story_lens = [len(data[0]) for data in all_data]
all_question_lens = [len(data[1]) for data in all_data]

In [26]:
max_story_len = max(all_story_lens)
max_question_len = max(all_question_lens)
max_story_len, max_question_len

(156, 6)

## Data Vectorization

In [29]:
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer

In [30]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [31]:
tokenizer.word_index

{'no': 1,
 'there': 2,
 'daniel': 3,
 '?': 4,
 'garden': 5,
 'went': 6,
 'the': 7,
 'apple': 8,
 'back': 9,
 'kitchen': 10,
 '.': 11,
 'hallway': 12,
 'journeyed': 13,
 'office': 14,
 'up': 15,
 'dropped': 16,
 'got': 17,
 'milk': 18,
 'to': 19,
 'discarded': 20,
 'left': 21,
 'yes': 22,
 'in': 23,
 'john': 24,
 'sandra': 25,
 'is': 26,
 'mary': 27,
 'travelled': 28,
 'picked': 29,
 'put': 30,
 'moved': 31,
 'down': 32,
 'grabbed': 33,
 'football': 34,
 'bedroom': 35,
 'bathroom': 36,
 'took': 37}

In [34]:
train_story_text = []
train_question_text = []
train_answers = []

for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [35]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [36]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    # STORIES
    X = []
    # QUESTIONS
    Xq = []
    # ANSWERS
    Y = []

    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)

    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [37]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)