In [1]:
from keras.utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import tarfile
from nltk import FreqDist
from functools import reduce
import os
import re
import matplotlib.pyplot as plt

In [3]:
# Relative directory that holds the dataset files.
data_path = "../data/"

# Train / test split files, located directly under data_path.
TRAIN_FILE = f"{data_path}qa1_single-supporting-fact_train.txt"
TEST_FILE = f"{data_path}qa1_single-supporting-fact_test.txt"

In [5]:
def read_data(dir):
    """Parse a QA data file into parallel lists of stories, questions and answers.

    Every non-blank line is expected to look like ``"<idx> <text>"``. An index
    of 1 starts a new story. A line whose text contains tab characters is read
    as ``question<TAB>answer<TAB>extra`` (the third field is ignored) and
    yields one sample made of the statements collected so far in the current
    story; any other line is appended to the current story as a statement.

    Args:
        dir: Path of the file to read (a file path, despite the name).

    Returns:
        A ``(stories, questions, answers)`` tuple of equally long lists, where
        ``stories[i]`` is the list of statement strings seen before question
        ``i`` within its story.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line has no integer index followed by a
            space, or a question line does not have exactly three
            tab-separated fields.
    """
    story_temp = []
    questions, answers, stories = [], [], []

    # The context manager closes the file even if a malformed line raises.
    with open(dir, 'rb') as lines:
        for line in lines:
            # Decode the raw bytes and drop surrounding whitespace / the newline.
            line = line.decode("utf-8").strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline at EOF

            idx, text = line.split(' ', 1)

            if int(idx) == 1:  # a new story starts
                story_temp = []

            if '\t' in text:  # question line: 'Q\tA\t<ignored>'
                question, answer, _ = text.split('\t')
                # Store a copy so later statements of the same story do not
                # leak into this sample.
                stories.append([x for x in story_temp if x])
                questions.append(question)
                answers.append(answer)
            else:  # plain story statement
                story_temp.append(text)

    return stories, questions, answers

In [9]:
# Parse each file exactly once; read_data returns a
# (stories, questions, answers) tuple.
train_data = read_data(TRAIN_FILE)
test_data = read_data(TEST_FILE)

# Unpack the tuples that were just read instead of parsing the files again.
train_stories, train_questions, train_answers = train_data
test_stories, test_questions, test_answers = test_data

print(len(train_stories), len(train_questions), len(train_answers))
print(len(test_stories), len(test_questions), len(test_answers))

# Show one arbitrary training sample as a sanity check.
print(train_stories[3456], train_questions[3456], train_answers[3456])

10000 10000 10000
1000 1000 1000
['John travelled to the garden.', 'John journeyed to the office.', 'Daniel travelled to the kitchen.', 'Daniel moved to the bathroom.'] Where is Daniel?  bathroom


# Tokenizing

In [8]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    The string is split on runs of non-word characters; those runs are kept
    as tokens too (the pattern uses a capturing group). Every piece is
    stripped of surrounding whitespace and empty pieces are dropped.

    Args:
        sent: The string to tokenize.

    Returns:
        A list of non-empty token strings in their original order, e.g.
        ``'Where is Daniel? '`` -> ``['Where', 'is', 'Daniel', '?']``.
    """
    # Raw string: '\W' inside a normal literal is an invalid escape sequence,
    # which Python reports with a warning and plans to turn into an error.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]

def preprocess(train, test):
    """Build the vocabulary and the maximum sample lengths over both splits.

    Args:
        train: ``(stories, questions, answers)`` tuple of the training split.
            Each story is a list of sentence strings; each question and each
            answer is a string.
        test: Same structure for the test split.

    Returns:
        A tuple ``(word2idx, idx2word, story_max_len, question_max_len)``:
        ``word2idx`` maps every lower-cased token to an index starting at 1,
        ordered by ``counter.most_common()``; ``idx2word`` is the inverse
        mapping; ``story_max_len`` and ``question_max_len`` are the largest
        token counts of any story and any question.
    """
    counter = FreqDist()  # token -> frequency
    story_len, question_len = [], []

    def count_tokens(text):
        """Add the lower-cased tokens of ``text`` to ``counter``; return how many there were."""
        tokens = tokenize(text)
        for word in tokens:
            counter[word.lower()] += 1
        return len(tokens)

    for stories, questions, answers in [train, test]:
        for story in stories:
            # "".join copes with an empty story (reduce without an initial
            # value raises TypeError) and avoids quadratic concatenation.
            story_len.append(count_tokens("".join(story)))
        for question in questions:
            question_len.append(count_tokens(question))
        for answer in answers:
            count_tokens(answer)

    # Build the vocabulary; indices start at 1, so 0 is never assigned to a word.
    word2idx = {word : (idx + 1) for idx, (word, _) in enumerate(counter.most_common())}
    idx2word = {idx : word for word, idx in word2idx.items()}

    # Length of the longest sample of each kind.
    story_max_len = np.max(story_len)
    question_max_len = np.max(question_len)

    return word2idx, idx2word, story_max_len, question_max_len

In [15]:
# The vocabulary and the maximum lengths are computed over train and test together.
(word2idx,
 idx2word,
 story_max_len,
 question_max_len) = preprocess(train_data, test_data)

# One more than the number of words.
vocab_size = 1 + len(word2idx)

print(word2idx)
print(story_max_len, question_max_len)

{'to': 1, 'the': 2, '.': 3, 'went': 4, 'sandra': 5, 'john': 6, 'daniel': 7, 'mary': 8, 'travelled': 9, 'journeyed': 10, 'back': 11, 'bathroom': 12, 'garden': 13, 'hallway': 14, 'moved': 15, 'office': 16, 'kitchen': 17, 'bedroom': 18, 'where': 19, 'is': 20, '?': 21}
68 4


In [16]:
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    """Turn one data split into padded index sequences and one-hot answers.

    Args:
        data: ``(stories, questions, answers)`` tuple. Each story is a list of
            sentence strings; each question and each answer is a string.
        word2idx: Mapping from lower-cased token to integer index.
        story_maxlen: ``maxlen`` passed to ``pad_sequences`` for the stories.
        question_maxlen: ``maxlen`` passed to ``pad_sequences`` for the questions.

    Returns:
        A tuple ``(Xs, Xq, Y)``. ``Xs`` and ``Xq`` are the ``pad_sequences``
        results for the story and question index lists. ``Y`` is an array with
        one row per sample and ``len(word2idx) + 1`` columns, holding a 1 at
        the index of the sample's answer and 0 elsewhere.

    Raises:
        KeyError: If a token or an answer is missing from ``word2idx``.
    """
    Xs, Xq, Y = [], [], []
    # One extra slot on top of the vocabulary size.
    vocab_size = len(word2idx) + 1

    stories, questions, answers = data

    for story, question, answer in zip(stories, questions, answers):
        # "".join copes with an empty story, where reduce without an initial
        # value would raise TypeError.
        xs = [word2idx[w.lower()] for w in tokenize("".join(story))]
        xq = [word2idx[w.lower()] for w in tokenize(question)]

        y = np.zeros(vocab_size)
        # Lower-case the answer like the story and question tokens above, so a
        # capitalised answer still matches a lower-cased vocabulary key.
        y[word2idx[answer.lower()]] = 1

        Xs.append(xs)
        Xq.append(xq)
        Y.append(y)

    return pad_sequences(Xs, maxlen=story_maxlen), pad_sequences(Xq, maxlen=question_maxlen), np.array(Y)

In [17]:
# Vectorize both splits with the shared vocabulary and maximum lengths.
Xstrain, Xqtrain, Ytrain = vectorize(
    train_data, word2idx, story_maxlen=story_max_len, question_maxlen=question_max_len
)
Xstest, Xqtest, Ytest = vectorize(
    test_data, word2idx, story_maxlen=story_max_len, question_maxlen=question_max_len
)

# Shapes of (stories, questions, answers) for each split.
print(*(arr.shape for arr in (Xstrain, Xqtrain, Ytrain)))
print(*(arr.shape for arr in (Xstest, Xqtest, Ytest)))

(10000, 68) (10000, 4) (10000, 22)
(1000, 68) (1000, 4) (1000, 22)
