In [1]:
import re

def data_sequencing(path):
    """Parse a bAbI task file into ``[index, text, answer, supporting]`` rows.

    Every non-blank line has the form ``"<index> <text>"``. Lines containing
    tabs carry two extra tab-separated fields after the text (the answer and
    the supporting-fact ids); lines without tabs get empty strings in those
    two slots.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one ``[index, text, answer, supporting]`` list of strings
        per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has no space after its index, or a
            tab-containing line does not split into exactly three
            tab-separated fields.
    """
    data = []
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) hold no record;
                # unpacking them below would raise ValueError.
                continue
            index, context = line.split(' ', 1)
            if '\t' in line:
                query, answer, supporting = context.split('\t')
                data.append([index, query, answer, supporting])
            else:
                data.append([index, context, '', ''])
    return data

In [6]:
# Parse the QA12 (conjunction) train/test files into lists of
# [index, text, answer, supporting] rows via data_sequencing.
train_data=data_sequencing('../bAbI/tasks_1-20_v1-2/en/qa12_conjunction_train.txt')
test_data=data_sequencing('../bAbI/tasks_1-20_v1-2/en/qa12_conjunction_test.txt')

In [7]:
import pandas as pd

# Wrap the parsed rows in DataFrames (train first, then test) with one shared
# column layout, then preview the first ten test rows.
df_train, df_test = (
    pd.DataFrame(rows, columns=['Index','Query','Answer','Supporting'])
    for rows in (train_data, test_data)
)

df_test.head(10)

Unnamed: 0,Index,Query,Answer,Supporting
0,1,John and Mary travelled to the hallway.,,
1,2,Sandra and Mary journeyed to the bedroom.,,
2,3,Where is Mary?,bedroom,2.0
3,4,Mary and Daniel travelled to the bathroom.,,
4,5,Daniel and Sandra journeyed to the office.,,
5,6,Where is Mary?,bathroom,4.0
6,7,Daniel and Mary went to the bedroom.,,
7,8,Daniel and Sandra travelled to the hallway.,,
8,9,Where is Sandra?,hallway,8.0
9,10,Mary and Sandra journeyed to the garden.,,


In [8]:
from keras.preprocessing.text import Tokenizer

# Fit a Keras Tokenizer on the training sentences; its word_index is inspected
# in the next cell. Note that '.' is not part of `filters`, so sentence-final
# tokens keep their trailing period (e.g. 'hallway.').
tokenizer=Tokenizer(filters='!?"#$%&()*+,-/:;<=>@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(df_train['Query'])

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
# Display the word -> integer index mapping learned from the training sentences.
tokenizer.word_index

{'and': 1,
 'to': 2,
 'the': 3,
 'daniel': 4,
 'john': 5,
 'sandra': 6,
 'mary': 7,
 'where': 8,
 'is': 9,
 'went': 10,
 'journeyed': 11,
 'moved': 12,
 'back': 13,
 'travelled': 14,
 'hallway.': 15,
 'kitchen.': 16,
 'garden.': 17,
 'bathroom.': 18,
 'bedroom.': 19,
 'office.': 20}

데이터의 특징 / QA12
1. 한 query에 사람이 두 명씩 나옴.  
    : 문장 번역 시 `name`을 길이가 2인 list로 저장
2. `back`은 `went back`의 형태로만 나타남.  
    : `went`를 만나면 바로 뒤 단어를 보고 동사 번역을 결정

In [18]:
# English -> Korean lookup tables, grouped by the role a word plays in a
# sentence: person names, motion verbs (already conjugated, with the final
# period) and places.
voca = dict(
    name=dict(
        daniel='동수',
        john='준석',
        sandra='수아',
        mary='민경',
    ),
    verb=dict(
        journeyed='여행했다.',
        moved='이동했다.',
        went='갔다.',
        travelled='여행했다.',
    ),
    place=dict(
        hallway='복도',
        kitchen='부엌',
        garden='정원',
        bathroom='욕실',
        bedroom='침실',
        office='사무실',
    ),
)

In [20]:
from keras.preprocessing.text import text_to_word_sequence
# Sanity check: tokenize the first test sentence with text_to_word_sequence's
# default settings (the result is lowercased and has no trailing period).
text_to_word_sequence(df_test['Query'][0])

['john', 'and', 'mary', 'travelled', 'to', 'the', 'hallway']

In [100]:
# Korean particle (josa) selection.
# Adapted from https://github.com/myevan/pyjosa/blob/master/pyjosa.py
def josa(text, input):
    """Append the Korean particle that agrees with the last syllable of ``text``.

    The particle form depends on whether the last syllable ends in a final
    consonant (jongseong). The check assumes the last character is a
    precomposed Hangul syllable (U+AC00 and up); other characters are not
    validated.

    Args:
        text: Non-empty word the particle is attached to.
        input: Particle pair to choose from: ``'은는'`` (topic), ``'와과'``
            ("and") or ``'으로'`` (direction).

    Returns:
        ``text`` followed by the chosen particle and a single trailing space.

    Raises:
        ValueError: If ``input`` is not one of the supported particle pairs,
            or ``text`` is empty.
    """
    # particle pair -> (form after a final consonant, form after an open syllable)
    particle_forms = {
        '은는': ('은 ', '는 '),
        '와과': ('과 ', '와 '),
        '으로': ('으로 ', '로 '),
    }
    if input not in particle_forms:
        # Previously this fell through every branch and failed at `return`
        # with an UnboundLocalError on `output`.
        raise ValueError('unsupported particle type: %r' % (input,))
    if not text:
        raise ValueError('text must not be empty')

    # Hangul syllables are laid out from U+AC00 in runs of 28 final-consonant
    # slots; slot 0 means "no final consonant".
    jongseong = (ord(text[-1]) - 0xac00) % 28
    after_consonant, after_open = particle_forms[input]
    if input == '으로':
        # '로' is also used after a final ㄹ (slot 8).
        use_open_form = jongseong in (0, 8)
    else:
        use_open_form = jongseong == 0
    return text + (after_open if use_open_form else after_consonant)

In [101]:
def _translate_query(query):
    """Translate a single English bAbI sentence or question into Korean.

    Relies on the module-level ``voca`` tables, ``josa`` and Keras'
    ``text_to_word_sequence``.
    """
    tokenized = text_to_word_sequence(query)
    name, verb, place = [], '', ''
    for position, word in enumerate(tokenized):
        if word in voca['name']:
            name.append(voca['name'][word])
        elif word in voca['verb']:
            # Look at the token right after *this* occurrence. The slice is
            # empty (instead of raising IndexError) when the verb is the last
            # token, and it does not jump back to an earlier duplicate the way
            # tokenized.index(word) would.
            following = tokenized[position + 1:position + 2]
            if word == 'went' and following == ['back']:
                verb = '돌아왔다.'
            else:
                verb = voca['verb'][word]
        elif word in voca['place']:
            place = voca['place'][word]

    # Questions start with "where"; when no known place is mentioned the
    # interrogative '어디' is used in its place.
    if tokenized[0] == 'where':
        if place == '':
            place = '어디'
        return josa(name[0], '은는') + place + '에 있습니까?'

    # Statements: chain every name but the last with 와/과 and mark the last
    # one as the topic with 은/는. For exactly two names this is the same
    # output as before; a single name now works instead of raising IndexError.
    subject = ''.join(josa(person, '와과') for person in name[:-1]) + josa(name[-1], '은는')
    return subject + josa(place, '으로') + verb


def data_translation(data):
    """Translate the 'Query' and 'Answer' columns of a bAbI table into Korean.

    Args:
        data: Object indexable by ``'Query'`` and ``'Answer'``, each yielding
            an iterable of strings (a DataFrame in this notebook). Empty
            answers mark statement rows.

    Returns:
        A ``(query_tr, answer_tr)`` tuple of lists aligned with the input
        rows. Empty answers are passed through unchanged.

    Raises:
        KeyError: If a non-empty answer is not a key of ``voca['place']``.
        IndexError: If a sentence is empty or mentions no known name.
    """
    query_tr = [_translate_query(query) for query in data['Query']]
    answer_tr = [
        voca['place'][answer] if answer else answer
        for answer in data['Answer']
    ]
    return query_tr, answer_tr

In [102]:
# Translate both splits; each result is a (translated queries, translated answers) pair.
train_tr=data_translation(df_train)
test_tr=data_translation(df_test)

In [103]:
def data_reconstruction(original_data,translated_data):
    """Merge translated text back into the original row layout.

    For each row of ``original_data`` the index (slot 0) and supporting facts
    (slot 3) are kept, while the query and answer are taken from the same
    position of ``translated_data[0]`` and ``translated_data[1]``.

    Returns:
        A list of ``[index, query, answer, supporting]`` lists, one per row of
        ``original_data``.
    """
    rebuilt = []
    for position, source_row in enumerate(original_data):
        row_index, supporting = source_row[0], source_row[3]
        rebuilt.append([
            row_index,
            translated_data[0][position],
            translated_data[1][position],
            supporting,
        ])
    return rebuilt

In [104]:
# Rebuild the train and test tables (in that order) with the translated
# query/answer text, using the same column layout as the source tables.
df_train_tr, df_test_tr = (
    pd.DataFrame(
        data_reconstruction(rows, translated),
        columns=['Index','Query','Answer','Supporting'],
    )
    for rows, translated in ((train_data, train_tr), (test_data, test_tr))
)

In [105]:
# Write the translated splits next to the notebook, without the DataFrame index.
# 'utf-8-sig' prefixes the file with a UTF-8 BOM - presumably so spreadsheet
# tools detect the encoding of the Korean text (TODO confirm).
df_train_tr.to_csv('./qa12_conjunction_train_kr.csv',index=False, encoding = 'utf-8-sig')
df_test_tr.to_csv('./qa12_conjunction_test_kr.csv',index=False, encoding = 'utf-8-sig')