In [1]:
import numpy as np
import pandas as pd
import re
import json

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Data Preprocessing

In [2]:
# Directory that holds the Quora question-pair CSV files.
DATA_IN_PATH = './quora_data/'

# Read the labelled training pairs into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer=DATA_IN_PATH + 'train.csv',
    encoding='utf-8',
)

# Bare last expression: the notebook renders the frame as the cell output.
train_data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [3]:
# Split the rows into duplicate pairs (pos) and non-duplicate pairs (neg), then
# down-sample the negatives so that both classes end up with the same row count.
train_pos_data = train_data.loc[train_data['is_duplicate'] == 1]
train_neg_data = train_data.loc[train_data['is_duplicate'] == 0]

# EDA showed that non-duplicate pairs outnumber duplicate pairs.
# class_difference / sample_frac are kept for reference: sample_frac is the share of
# negatives that has to be retained for the two classes to match.
class_difference = len(train_neg_data) - len(train_pos_data)
sample_frac = 1 - (class_difference / len(train_neg_data))

# Seed for the down-sampling so that Restart & Run All reproduces the same subset
# (and therefore the same vocabulary and saved arrays).
RANDOM_SEED = 42

# Sample an exact row count instead of a fraction: the result no longer depends on
# floating-point rounding of sample_frac. min() keeps the call valid even if the
# negatives were ever the smaller class.
balanced_neg_count = min(len(train_pos_data), len(train_neg_data))
train_neg_data = train_neg_data.sample(n=balanced_neg_count, random_state=RANDOM_SEED)

In [4]:
# After down-sampling, the duplicate and non-duplicate subsets should be the same size;
# print both counts to confirm.
num_duplicate_pairs = len(train_pos_data)
num_non_duplicate_pairs = len(train_neg_data)
print('중복 질문 개수: {}'.format(num_duplicate_pairs))
print('중복이 아닌 질문 개수: {}'.format(num_non_duplicate_pairs))

# Stack the two subsets back into a single training frame (negatives first).
train_data = pd.concat(objs=[train_neg_data, train_pos_data])

중복 질문 개수: 149263
중복이 아닌 질문 개수: 149263


In [5]:
# Pre-processing: remove punctuation/symbols and lower-case every question.
FILTERS = "([~.,!?\"':;)(])"
change_filter = re.compile(FILTERS)

# Coerce every entry to str so the regex can be applied to it.
# NOTE(review): a missing value (NaN) would become the literal text 'nan' here —
# confirm that is acceptable for the downstream vocabulary.
questions1 = list(map(str, train_data['question1']))
questions2 = list(map(str, train_data['question2']))

# Delete every character matched by the filter, then lower-case the remainder.
filtered_questions1 = [change_filter.sub("", text).lower() for text in questions1]
filtered_questions2 = [change_filter.sub("", text).lower() for text in questions2]

In [6]:
# Tokenise the cleaned question strings.
# The tokenizer is fitted once on both question lists concatenated, so the two sides
# share one vocabulary; the conversion to index sequences is then done per list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_questions1 + filtered_questions2)

# Replace every word of every question with its integer index.
questions1_sequence, questions2_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (filtered_questions1, filtered_questions2)
)

# Bring all sequences to a common length.
# MAX_SEQUENCE_LENGTH is 31, the 99th-percentile word count found during EDA.
# Longer sequences are cut down to that length (how they are cut is left at the
# pad_sequences default, since no truncating argument is passed); shorter ones get
# the default padding value 0 appended at the end (padding='post').
MAX_SEQUENCE_LENGTH = 31
q1_data, q2_data = (
    pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for sequence in (questions1_sequence, questions2_sequence)
)

In [7]:
# Inspect the size of the pre-processed data.

# Build the vocabulary from a *copy* of the tokenizer's word index: assigning the
# dict directly would alias it, and adding the padding entry below would then
# silently modify the tokenizer's own state.
word_vocab = dict(tokenizer.word_index)
# Reserve index 0 for the padding token (0 is the value pad_sequences fills with).
# The key was previously misspelled as "<PAD" (missing the closing bracket).
word_vocab["<PAD>"] = 0

# Target labels as an integer array, in the same row order as train_data.
labels = np.array(train_data["is_duplicate"], dtype=int)

print('Shape of question1 data: {}'.format(q1_data.shape))
print('Shape of question2 data: {}'.format(q2_data.shape))
print('Shape of label: {}'.format(labels.shape))
# Note: this count includes the padding entry added above.
print('Words in index: {}'.format(len(word_vocab)))

Shape of question1 data: (298526, 31)
Shape of question2 data: (298526, 31)
Shape of label: (298526,)
Words in index: 76467


In [8]:
# Bundle the vocabulary and its size into one dictionary so they can be stored together.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

In [9]:
# Persist every pre-processed training artefact.
DATA_OUT_PATH = './quora_data/'
TRAIN_Q1_DATA = 'q1_train.npy'
TRAIN_Q2_DATA = 'q2_train.npy'
TRAIN_LABEL_DATA = 'label_train.npy'
# NOTE(review): this file is written as JSON (see json.dump below) even though the
# name ends in .npy. The name is left unchanged so existing readers keep working.
DATA_CONFIGS = 'data_configs.npy'

# Pass the path to np.save instead of an open() handle: numpy then opens and closes
# the file itself, so no file descriptor is leaked. The names already end in .npy,
# so no extension is appended and the output paths are unchanged.
np.save(DATA_OUT_PATH + TRAIN_Q1_DATA, q1_data)
np.save(DATA_OUT_PATH + TRAIN_Q2_DATA, q2_data)
np.save(DATA_OUT_PATH + TRAIN_LABEL_DATA, labels)

# The context manager guarantees the JSON is flushed and the file closed.
with open(DATA_OUT_PATH + DATA_CONFIGS, 'w') as config_file:
    json.dump(data_configs, config_file)

In [10]:
# Apply the same pre-processing to the evaluation (test) data.
test_data = pd.read_csv(
    filepath_or_buffer=DATA_IN_PATH + 'test.csv',
    encoding='utf-8',
)

# Boolean mask: True only where the id is exactly a Python int.
# NOTE(review): ids that come back as strings (even numeric-looking ones) are
# treated as invalid and dropped — confirm this is the intended filter.
valid_ids = [type(raw_id) is int for raw_id in test_data['test_id']]

# Keep the valid rows, then remove exact duplicate rows.
test_data = test_data.loc[valid_ids].drop_duplicates()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
# Coerce every test question to str so the regex filter can be applied.
test_questions1 = list(map(str, test_data['question1']))
test_questions2 = list(map(str, test_data['question2']))

# Same cleaning as for the training text: drop the characters matched by
# change_filter, then lower-case what remains.
filtered_test_questions1 = [
    change_filter.sub("", text).lower() for text in test_questions1
]
filtered_test_questions2 = [
    change_filter.sub("", text).lower() for text in test_questions2
]

In [12]:
# Convert the cleaned test questions to index sequences, reusing the tokenizer that
# was fitted on the training questions.
test_questions1_sequence, test_questions2_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (filtered_test_questions1, filtered_test_questions2)
)

# Pad/cut to the same fixed length as the training data (zeros appended at the end).
test_q1_data, test_q2_data = (
    pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for sequence in (test_questions1_sequence, test_questions2_sequence)
)

In [13]:
# Ids of the retained test rows, as a numpy array aligned with the padded matrices.
test_id = np.array(test_data['test_id'])

# Report the shape of each test artefact.
q1_shape = test_q1_data.shape
q2_shape = test_q2_data.shape
id_shape = test_id.shape
print('Shape of question1 data: {}'.format(q1_shape))
print('Shape of question2 data:{}'.format(q2_shape))
print('Shape of ids: {}'.format(id_shape))

Shape of question1 data: (2345796, 31)
Shape of question2 data:(2345796, 31)
Shape of ids: (2345796,)


In [14]:
# Persist the pre-processed test data.
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'
TEST_ID_DATA = 'test_id.npy'

# Pass the path to np.save instead of an open() handle: numpy then opens and closes
# the file itself, so no file descriptor is leaked. The names already end in .npy,
# so no extension is appended and the output paths are unchanged.
# The files are written under DATA_IN_PATH, exactly as before.
np.save(DATA_IN_PATH + TEST_Q1_DATA, test_q1_data)
np.save(DATA_IN_PATH + TEST_Q2_DATA, test_q2_data)
np.save(DATA_IN_PATH + TEST_ID_DATA, test_id)