In [1]:
import re

def data_sequencing(path):
    """Parse a numbered task file into ``[index, query, answer, supporting]`` rows.

    Every non-blank line is expected to look like one of::

        <index> <sentence>
        <index> <question>\t<answer>\t<supporting>

    Parameters
    ----------
    path : str
        Location of the text file to read (decoded as UTF-8).

    Returns
    -------
    list of list of str
        One four-element list per non-blank line.  Statement lines (no tab)
        get empty strings for ``answer`` and ``supporting``.

    Raises
    ------
    ValueError
        If a non-blank line has no space after the index, or a tab-separated
        line does not contain exactly three tab-separated fields.
    """
    data = []
    # Explicit encoding keeps the result independent of the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record;
                # without this guard the unpacking below would raise.
                continue
            index, context = line.split(' ', 1)
            if '\t' in line:
                query, answer, supporting = context.split('\t')
                data.append([index, query, answer, supporting])
            else:
                data.append([index, context, '', ''])
    return data

In [2]:
# Parse the train split first, then the test split, into lists of
# [index, query, answer, supporting] rows.
train_data, test_data = (
    data_sequencing(task_path)
    for task_path in (
        './tasks/en-10k/qa1_single-supporting-fact_train.txt',
        './tasks/en-10k/qa1_single-supporting-fact_test.txt',
    )
)

In [3]:
import pandas as pd

# Wrap both parsed splits in DataFrames that share the same four columns.
df_train, df_test = (
    pd.DataFrame(rows, columns=['Index','Query','Answer','Supporting'])
    for rows in (train_data, test_data)
)

# Preview the first ten rows of the test split.
df_test.head(10)

Unnamed: 0,Index,Query,Answer,Supporting
0,1,John travelled to the hallway.,,
1,2,Mary journeyed to the bathroom.,,
2,3,Where is John?,hallway,1.0
3,4,Daniel went back to the bathroom.,,
4,5,John moved to the bedroom.,,
5,6,Where is Mary?,bathroom,2.0
6,7,John went to the hallway.,,
7,8,Sandra journeyed to the kitchen.,,
8,9,Where is Sandra?,kitchen,8.0
9,10,Sandra travelled to the hallway.,,


In [4]:
from keras.preprocessing.text import Tokenizer

# Build a vocabulary over the training queries.
# NOTE: '.' is not among the filtered characters, so sentence-final words keep
# their trailing period in the vocabulary (e.g. 'bathroom.').
tokenizer=Tokenizer(filters='!?"#$%&()*+,-/:;<=>@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(df_train['Query'])

Using TensorFlow backend.


In [5]:
# Show the word -> integer index mapping produced by fitting the tokenizer.
tokenizer.word_index

{'to': 1,
 'the': 2,
 'where': 3,
 'is': 4,
 'went': 5,
 'mary': 6,
 'john': 7,
 'sandra': 8,
 'daniel': 9,
 'journeyed': 10,
 'travelled': 11,
 'back': 12,
 'moved': 13,
 'bathroom.': 14,
 'hallway.': 15,
 'office.': 16,
 'kitchen.': 17,
 'garden.': 18,
 'bedroom.': 19}

In [6]:
# English person name (lower-case) -> Korean replacement name.
dic_name = {
    'john': '철수',
    'mary': '영희',
    'daniel': '민수',
    'sandra': '주희',
}

# English place word (lower-case) -> Korean place word.
dic_place = {
    'bathroom': '화장실',
    'hallway': '현관',
    'office': '사무실',
    'kitchen': '부엌',
    'garden': '정원',
    'bedroom': '침실',
}

In [7]:
from keras.preprocessing.text import text_to_word_sequence

def data_translation(data):
    """Translate the English ``Query`` and ``Answer`` columns into Korean.

    Each query is tokenised with ``text_to_word_sequence`` and scanned for a
    person name (``dic_name``), a place (``dic_place``) and a verb keyword.
    A query containing ``where`` becomes ``"<name>는 어디에 있습니까?"``; any
    other query becomes ``"<name>는 <place><particle><verb>"``.

    Parameters
    ----------
    data : mapping with ``'Query'`` and ``'Answer'`` entries
        Anything indexable by those two keys that yields an iterable of
        strings, e.g. a pandas DataFrame.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(query_tr, answer_tr)`` in the same order as the input rows.
        Answers found in ``dic_place`` are translated; all others are passed
        through unchanged.

    Raises
    ------
    ValueError
        If a query lacks a recognised name or verb, or a statement lacks a
        recognised place.  Previously such a row either raised
        ``UnboundLocalError`` or silently reused values from the preceding row.
    """

    def post_position(verb,place):
        """Return the particle (with trailing space) that joins ``place`` to ``verb``."""
        if verb=='갔습니다.':
            return '에 '
        if place in ['화장실','사무실','침실']:
            return '로 '
        return '으로 '

    # English verb keyword -> Korean sentence ending.
    verb_by_word={
        'went':'갔습니다.',
        'moved':'이동했습니다.',
        'journeyed':'떠났습니다.',
        'travelled':'떠났습니다.',
        'back':'돌아갔습니다.',
    }

    query_tr=[]
    for query in data['Query']:
        # Reset per-query state so nothing leaks in from the previous row.
        name=place=verb=None
        is_question=False
        for word in text_to_word_sequence(query):
            if word in dic_name:
                name=dic_name[word]
            elif word in dic_place:
                place=dic_place[word]
            elif word=='where':
                verb='어디에 있습니까?'
                is_question=True
            elif word in verb_by_word:
                # A later keyword overrides an earlier one ("went back" -> 'back').
                verb=verb_by_word[word]

        if name is None or verb is None or (not is_question and place is None):
            raise ValueError('cannot translate query: {!r}'.format(query))

        if is_question:
            query_tr.append(name+'는 '+verb)
        else:
            # The particle is chosen once the final verb is known, so it does
            # not depend on the verb appearing before the place in the sentence.
            query_tr.append(name+'는 '+place+post_position(verb,place)+verb)

    # Translate place-name answers; leave anything else (e.g. '') untouched.
    answer_tr=[dic_place.get(answer,answer) for answer in data['Answer']]

    return query_tr,answer_tr

In [8]:
# Translate the train split, then the test split; each result is a
# (translated_queries, translated_answers) pair.
train_tr, test_tr = map(data_translation, (df_train, df_test))

In [9]:
def data_reconstruction(original_data,translated_data):
    """Merge translated text back into the original row layout.

    For every row position the ``index`` (element 0) and ``supporting``
    (element 3) fields are taken from ``original_data``, while the query and
    answer come from ``translated_data[0]`` and ``translated_data[1]``.

    Returns a new list of ``[index, query, answer, supporting]`` lists.
    """
    return [
        [
            original_data[pos][0],
            translated_data[0][pos],
            translated_data[1][pos],
            original_data[pos][3],
        ]
        for pos in range(len(original_data))
    ]

In [10]:
# Rebuild both splits with the translated text, keeping the original column layout.
df_train_tr, df_test_tr = (
    pd.DataFrame(data_reconstruction(rows, translated),
                 columns=['Index','Query','Answer','Supporting'])
    for rows, translated in ((train_data, train_tr), (test_data, test_tr))
)

In [11]:
# Preview the first ten translated test rows.
df_test_tr.head(10)

Unnamed: 0,Index,Query,Answer,Supporting
0,1,철수는 현관으로 떠났습니다.,,
1,2,영희는 화장실로 떠났습니다.,,
2,3,철수는 어디에 있습니까?,현관,1.0
3,4,민수는 화장실로 돌아갔습니다.,,
4,5,철수는 침실로 이동했습니다.,,
5,6,영희는 어디에 있습니까?,화장실,2.0
6,7,철수는 현관에 갔습니다.,,
7,8,주희는 부엌으로 떠났습니다.,,
8,9,주희는 어디에 있습니까?,부엌,8.0
9,10,주희는 현관으로 떠났습니다.,,


In [12]:
# Write the translated splits to CSV in the working directory.
# index=False leaves the DataFrame row index out of the files.
df_train_tr.to_csv('./qa1_single-supporting-fact_train_ko.csv',index=False)
df_test_tr.to_csv('./qa1_single-supporting-fact_test_ko.csv',index=False)