In [1]:
import re

def data_sequencing(path):
    """Parse a bAbI-style task file into rows of four string fields.

    Every non-blank line is split at its first space into a line index and
    the remaining text. When that text contains tabs it is split into the
    question, its answer and the supporting-fact ids; otherwise the text is a
    plain statement and the answer / supporting fields are left empty.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of ``[index, query, answer, supporting]`` lists of strings,
        in file order.

    Raises:
        ValueError: If a non-blank line has no space after the index, or a
            tab-separated line does not have exactly three fields.
    """
    data = []
    # Explicit encoding keeps parsing independent of the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materialising the file with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # Blank (or whitespace-only) lines carry no record; skipping
                # them avoids an unpacking error on the split below.
                continue
            index, context = line.split(' ', 1)
            if '\t' in context:
                query, answer, supporting = context.split('\t')
                data.append([index, query, answer, supporting])
            else:
                data.append([index, context, '', ''])
    return data

In [2]:
# Parse the qa17 positional-reasoning train / test splits into lists of
# [index, query, answer, supporting] rows.
train_data=data_sequencing('../bAbI/tasks_1-20_v1-2/en/qa17_positional-reasoning_train.txt')
test_data=data_sequencing('../bAbI/tasks_1-20_v1-2/en/qa17_positional-reasoning_test.txt')

In [3]:
import pandas as pd

# Wrap the parsed rows in DataFrames with named columns for easier inspection.
df_train = pd.DataFrame(data=train_data, columns=['Index', 'Query', 'Answer', 'Supporting'])
df_test = pd.DataFrame(data=test_data, columns=['Index', 'Query', 'Answer', 'Supporting'])

# Preview the first ten rows of the test split.
df_test.head(10)

Unnamed: 0,Index,Query,Answer,Supporting
0,1,The pink rectangle is to the left of the trian...,,
1,2,The triangle is to the left of the red square.,,
2,3,Is the pink rectangle to the right of the red ...,no,1 2
3,4,Is the pink rectangle to the left of the red s...,yes,1 2
4,5,Is the pink rectangle to the left of the red s...,yes,1 2
5,6,Is the pink rectangle to the left of the red s...,yes,1 2
6,7,Is the pink rectangle to the right of the red ...,no,1 2
7,8,Is the red square to the right of the pink rec...,yes,2 1
8,9,Is the pink rectangle to the left of the red s...,yes,1 2
9,10,Is the pink rectangle to the left of the red s...,yes,1 2


In [4]:
from keras.preprocessing.text import Tokenizer

# Build a vocabulary over the training queries. The filter string does not
# contain '.', so sentence-final tokens such as 'square.' are kept as entries
# distinct from 'square' (see the word_index output).
tokenizer=Tokenizer(filters='!?"#$%&()*+,-/:;<=>@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(df_train['Query'])

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
# Show the fitted vocabulary (word -> integer id) to see which words need a translation entry.
tokenizer.word_index

{'the': 1,
 'is': 2,
 'square': 3,
 'red': 4,
 'to': 5,
 'of': 6,
 'pink': 7,
 'rectangle': 8,
 'yellow': 9,
 'blue': 10,
 'sphere': 11,
 'triangle': 12,
 'above': 13,
 'below': 14,
 'right': 15,
 'left': 16,
 'square.': 17,
 'rectangle.': 18,
 'sphere.': 19,
 'triangle.': 20}

In [9]:
# English -> Korean lookup tables used when translating queries and answers.
dic_figure = dict(square='정사각형', triangle='삼각형', sphere='구', rectangle='직사각형')
dic_color = dict(red='빨간색', pink='분홍색', blue='파란색', yellow='노란색')
dic_position = dict(above='위', below='아래', left='왼쪽', right='오른쪽')
dic_answer = dict(yes='네', no='아니요')

In [15]:
# Imported in this cell so it also runs on a fresh kernel; without it the
# name is undefined at this point in a top-to-bottom run.
from keras.preprocessing.text import text_to_word_sequence

# Check how a single query is tokenized (lower-cased, punctuation stripped).
text_to_word_sequence(df_test['Query'][0])

['the',
 'pink',
 'rectangle',
 'is',
 'to',
 'the',
 'left',
 'of',
 'the',
 'triangle']

In [17]:
# Scratch check: list.index gives the zero-based position of the first match.
a = list(range(1, 4))
a.index(3)

2

In [46]:
from keras.preprocessing.text import text_to_word_sequence

def data_translation(data):
    """Translate the English queries and answers of a task frame into Korean.

    Each query is tokenized with ``text_to_word_sequence``. The first two
    shape words (each with the colour word directly in front of it, if any)
    and the position word are mapped through ``dic_figure``, ``dic_color``
    and ``dic_position`` and reassembled as one Korean sentence. A query
    whose first token is 'is' becomes a question ("... 에 있습니까?"); any other
    query becomes a statement ("... 에 있다").

    Args:
        data: DataFrame (or mapping of iterables) with 'Query' and 'Answer'
            entries.

    Returns:
        A tuple ``(query_tr, answer_tr)``: the translated queries and the
        translated answers, each a list in input order. Answers without an
        entry in ``dic_answer`` (e.g. the empty string) are passed through.

    Raises:
        ValueError: If a query does not contain two shape words and a
            position word, so no sentence can be assembled.
    """
    query_tr = []
    for query in data['Query']:
        tokenized = text_to_word_sequence(query)
        figure, color = [], []
        # Reset per query so a value from the previous sentence never leaks in.
        position = None
        for i, word in enumerate(tokenized):
            if word in dic_figure:
                figure.append(dic_figure[word])
                # Use the token's own position: list.index() would return the
                # first occurrence and pick the wrong colour when the same
                # shape word appears twice, and index 0 - 1 would wrap around.
                former = tokenized[i - 1] if i > 0 else ''
                color.append(dic_color.get(former, ''))
            elif word in dic_position:
                position = dic_position[word]

        if len(figure) < 2 or position is None:
            raise ValueError('cannot translate query: %r' % (query,))

        # Choose the topic particle from the subject's last syllable.
        # See https://github.com/myevan/pyjosa/blob/master/pyjosa.py
        if (ord(figure[0][-1]) - 0xac00) % 28 != 0:  # syllable has a final consonant
            josa = '은'
        else:
            josa = '는'

        # String equality, not identity: `is` against a literal depends on
        # interning and is not a reliable way to compare text.
        ending = '에 있습니까?' if tokenized[0] == 'is' else '에 있다'

        # Both sentence forms carry the position word. Empty colour slots are
        # dropped so they do not leave stray spaces in the sentence.
        parts = [color[0], figure[0] + josa, color[1], figure[1], position + ending]
        query_tr.append(' '.join(part for part in parts if part))

    answer_tr = [dic_answer.get(answer, answer) for answer in data['Answer']]

    return query_tr, answer_tr

In [47]:
# Translate both splits; each result is a (queries, answers) pair of lists.
train_tr=data_translation(df_train)
test_tr=data_translation(df_test)

In [48]:
def data_reconstruction(original_data,translated_data):
    """Merge translated text back into the original row layout.

    For every row of ``original_data`` the index (field 0) and supporting
    field (field 3) are kept, while the query and answer are taken from the
    same position of ``translated_data[0]`` and ``translated_data[1]``.

    Returns:
        A new list of ``[index, query, answer, supporting]`` lists.
    """
    rebuilt = []
    for pos, row in enumerate(original_data):
        index, supporting = row[0], row[3]
        rebuilt.append([index, translated_data[0][pos], translated_data[1][pos], supporting])
    return rebuilt

In [49]:
# Rebuild the train / test frames with the translated query and answer text,
# keeping the original index and supporting-fact fields.
df_train_tr=pd.DataFrame(data_reconstruction(train_data,train_tr),
                         columns=['Index','Query','Answer','Supporting'])
df_test_tr=pd.DataFrame(data_reconstruction(test_data,test_tr),
                        columns=['Index','Query','Answer','Supporting'])

In [12]:
# Name the outputs after the qa17 positional-reasoning inputs they come from,
# so they cannot be mistaken for (or overwrite) another task's translation.
df_train_tr.to_csv('./qa17_positional-reasoning_train_ko.csv',index=False)
df_test_tr.to_csv('./qa17_positional-reasoning_test_ko.csv',index=False)