In [1]:
from ckonlpy.tag import Twitter
from keras.utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
from nltk import FreqDist
from functools import reduce
import os
import re
import matplotlib.pyplot as plt

In [2]:
# Directory holding the Korean bAbI-style QA files, relative to the notebook.
data_path = "../data/"
# Pass the directory and the file name as separate components so that
# os.path.join inserts the separator itself; concatenating first only worked
# because data_path happened to end with a slash.
TRAIN_FILE = os.path.join(data_path, "qa1_single-supporting-fact_train_kor.txt")
TEST_FILE = os.path.join(data_path, "qa1_single-supporting-fact_test_kor.txt")

In [3]:
def read_data(dir):
    """Parse a numbered story/question file into parallel lists.

    Every non-blank line has the form "<idx> <text>". A line whose index is 1
    starts a new story. A line whose text contains a tab is a question line
    ("question<TAB>answer[<TAB>extra]"); any other line is a story sentence.

    Args:
        dir: Path of the UTF-8 encoded data file. (The name shadows the
            builtin `dir`, but is kept so keyword callers keep working.)

    Returns:
        A tuple ``(stories, questions, answers)`` of equal-length lists.
        ``stories[i]`` is the list of story sentences seen so far in the
        current story when question ``i`` was asked.

    Raises:
        ValueError: If a non-blank line has no space-separated index, or the
            index is not an integer.
    """
    story_temp = []
    questions, answers, stories = [], [], []

    # The context manager closes the file even when a malformed line raises.
    with open(dir, 'rb') as lines:
        for line in lines:
            line = line.decode("utf-8")  # bytes -> str
            line = line.strip()  # drop the trailing newline / surrounding whitespace

            if not line:  # tolerate blank lines (e.g. a trailing one at EOF)
                continue

            idx, text = line.split(' ', 1)

            if int(idx) == 1:  # a new story begins
                story_temp = []

            if '\t' in text:  # question line: question, answer, optional extras
                # Only the first two fields are used; slicing instead of a
                # strict 3-way unpack also accepts lines without a third field.
                question, answer = text.split('\t')[:2]
                # Snapshot the story so later sentences do not leak into it.
                stories.append([x for x in story_temp if x])
                questions.append(question)
                answers.append(answer)

            else:  # story sentence
                story_temp.append(text)

    return stories, questions, answers

# Parse each file exactly once; the (stories, questions, answers) tuples are
# reused below instead of reading the same files a second time.
train_data = read_data(TRAIN_FILE)
test_data = read_data(TEST_FILE)

train_stories, train_questions, train_answers = train_data
test_stories, test_questions, test_answers = test_data

# Sanity check: the three lists of each split must have the same length.
print('훈련용 스토리의 개수 :', len(train_stories))
print('훈련용 질문의 개수 :',len(train_questions))
print('훈련용 답변의 개수 :',len(train_answers))
print('테스트용 스토리의 개수 :',len(test_stories))
print('테스트용 질문의 개수 :',len(test_questions))
print('테스트용 답변의 개수 :',len(test_answers))

훈련용 스토리의 개수 : 10000
훈련용 질문의 개수 : 10000
훈련용 답변의 개수 : 10000
테스트용 스토리의 개수 : 1000
테스트용 질문의 개수 : 1000
테스트용 답변의 개수 : 1000


# Tokenizing

In [4]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    The sentence is split on runs of non-word characters; because the pattern
    is a capturing group, the separators themselves are kept. Whitespace-only
    and empty pieces are dropped, and each remaining piece is stripped.

    Args:
        sent: The sentence to tokenize.

    Returns:
        A list of non-empty token strings in their original order.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on newer Python versions.
    pieces = re.split(r'(\W+)', sent)
    return [x.strip() for x in pieces if x.strip()]

def preprocess(train, test):
    """Build the vocabulary and the maximum sequence lengths of both splits.

    Every story (its sentences concatenated into one string), question and
    answer is tokenized with the module-level `tokenize`, and the lower-cased
    tokens are counted.

    Args:
        train: ``(stories, questions, answers)`` for the training split.
        test: ``(stories, questions, answers)`` for the test split.

    Returns:
        A tuple ``(word2idx, idx2word, story_max_len, question_max_len)``:
        ``word2idx`` maps each token to an index starting at 1, ordered by
        descending frequency; ``idx2word`` is the inverse mapping; the two
        lengths are the largest token counts of any story / question
        (0 when there are none).
    """
    counter = FreqDist()  # frequency distribution over lower-cased tokens
    story_len, question_len = [], []

    def count_tokens(tokens):
        # Shared by stories, questions and answers so casing is handled in one place.
        counter.update(word.lower() for word in tokens)

    for stories, questions, answers in [train, test]:
        for story in stories:
            # str.join concatenates in linear time and, unlike a reduce
            # without an initial value, also accepts an empty story.
            stories_tokenized = tokenize(''.join(story))
            story_len.append(len(stories_tokenized))
            count_tokens(stories_tokenized)
        for question in questions:
            question_tokenized = tokenize(question)
            question_len.append(len(question_tokenized))
            count_tokens(question_tokenized)
        for answer in answers:
            count_tokens(tokenize(answer))

    # Build the vocabulary; indices start at 1 so that index 0 stays unused.
    word2idx = {word : (idx + 1) for idx, (word, _) in enumerate(counter.most_common())}
    idx2word = {idx : word for word, idx in word2idx.items()}

    # Length of the longest sample; np.max raises on an empty list, so guard it.
    story_max_len = np.max(story_len) if story_len else 0
    question_max_len = np.max(question_len) if question_len else 0

    return word2idx, idx2word, story_max_len, question_max_len


In [5]:
# Build the vocabulary and the longest story / question lengths from both splits.
(word2idx,
 idx2word,
 story_max_len,
 question_max_len) = preprocess(train_data, test_data)

# word2idx indices start at 1, so one extra slot is needed for index 0.
vocab_size = 1 + len(word2idx)


print(word2idx)
print(story_max_len, question_max_len)

{'.': 1, '경임이는': 2, '은경이는': 3, '수종이는': 4, '필웅이는': 5, '이동했습니다': 6, '가버렸습니다': 7, '뛰어갔습니다': 8, '복귀했습니다': 9, '갔습니다': 10, '화장실로': 11, '정원으로': 12, '복도로': 13, '어디야': 14, '?': 15, '부엌으로': 16, '사무실로': 17, '침실로': 18, '화장실': 19, '정원': 20, '사무실': 21, '침실': 22, '복도': 23, '부엌': 24}
40 3


In [6]:
twitter = Twitter()

# Representative story sentences used to inspect the morpheme segmentation.
sample_sentences = [
    '은경이는 화장실로 이동했습니다.',
    '경임이는 정원으로 가버렸습니다.',
    '수종이는 복도로 뛰어갔습니다.',
    '필웅이는 부엌으로 복귀했습니다.',
    '수종이는 사무실로 갔습니다.',
    '은경이는 침실로 갔습니다.',
]

# Baseline segmentation, before any custom dictionary entries are registered.
for sentence in sample_sentences:
    print(twitter.morphs(sentence))

# add korean name entity recognition
# Each person name is registered under both tags, in the same order as before.
for person in ('은경이', '경임이', '수종이', '필웅이'):
    for tag in ('Name', 'Noun'):
        twitter.add_dictionary(person, tag)

# test (not working currently)
# The recorded output for this second pass is identical to the baseline pass.
for sentence in sample_sentences:
    print(twitter.morphs(sentence))

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


['은경', '이', '는', '화장실', '로', '이동', '했습니다', '.']
['경', '임', '이', '는', '정원', '으로', '가버렸습니다', '.']
['수', '종이', '는', '복도', '로', '뛰어갔습니다', '.']
['필웅이', '는', '부엌', '으로', '복귀', '했습니다', '.']
['수', '종이', '는', '사무실', '로', '갔습니다', '.']
['은경', '이', '는', '침실', '로', '갔습니다', '.']
['은경', '이', '는', '화장실', '로', '이동', '했습니다', '.']
['경', '임', '이', '는', '정원', '으로', '가버렸습니다', '.']
['수', '종이', '는', '복도', '로', '뛰어갔습니다', '.']
['필웅이', '는', '부엌', '으로', '복귀', '했습니다', '.']
['수', '종이', '는', '사무실', '로', '갔습니다', '.']
['은경', '이', '는', '침실', '로', '갔습니다', '.']


In [7]:
def tokenize(sent):
    """Split `sent` into morphemes using the module-level `twitter` tagger.

    This redefinition replaces the regex-based `tokenize` defined earlier in
    the notebook; every later call (including those inside `preprocess` and
    `vectorize`) uses this version.
    """
    morphemes = twitter.morphs(sent)
    return morphemes

# Rebuild the vocabulary and maximum lengths with the morpheme tokenizer.
word2idx, idx2word, story_max_len, question_max_len = preprocess(train_data, test_data)
# The vocabulary changed with the tokenizer, so refresh the size derived from
# it; otherwise vocab_size would still describe the regex-based vocabulary.
vocab_size = len(word2idx) + 1

In [8]:
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    """Convert stories, questions and answers into padded index arrays.

    Args:
        data: ``(stories, questions, answers)`` as returned by `read_data`.
        word2idx: Mapping from lower-cased token to integer index.
        story_maxlen: Length every story sequence is padded/truncated to.
        question_maxlen: Length every question sequence is padded/truncated to.

    Returns:
        A tuple ``(Xs, Xq, Y)``: the padded story index sequences, the padded
        question index sequences, and one one-hot row per answer of width
        ``len(word2idx) + 1``.

    Raises:
        KeyError: If a token or an answer is missing from `word2idx`.
    """
    Xs, Xq, Y = [], [], []

    stories, questions, answers = data

    # One-hot width; constant for the whole call, so compute it once.
    num_classes = len(word2idx) + 1

    for story, question, answer in zip(stories, questions, answers):
        # ''.join concatenates the story sentences and also accepts an empty story.
        xs = [word2idx[w.lower()] for w in tokenize(''.join(story))]
        xq = [word2idx[w.lower()] for w in tokenize(question)]

        y = np.zeros(num_classes)
        # Lower-case the answer like every other token: the vocabulary keys
        # are lower-cased, so a cased answer would otherwise raise KeyError.
        y[word2idx[answer.lower()]] = 1

        Xs.append(xs)
        Xq.append(xq)
        Y.append(y)

    return pad_sequences(Xs, maxlen=story_maxlen), pad_sequences(Xq, maxlen=question_maxlen), np.array(Y)

In [9]:
# Vectorize both splits with the shared vocabulary and maximum lengths.
Xstrain, Xqtrain, Ytrain = vectorize(
    data=train_data,
    word2idx=word2idx,
    story_maxlen=story_max_len,
    question_maxlen=question_max_len,
)
Xstest, Xqtest, Ytest = vectorize(
    data=test_data,
    word2idx=word2idx,
    story_maxlen=story_max_len,
    question_maxlen=question_max_len,
)

# Show the (stories, questions, answers) array shapes, one line per split.
print(*(arr.shape for arr in (Xstrain, Xqtrain, Ytrain)))
print(*(arr.shape for arr in (Xstest, Xqtest, Ytest)))

(10000, 83) (10000, 7) (10000, 27)
(1000, 83) (1000, 7) (1000, 27)
