In [1]:
import pandas as pd
import numpy as np
import re
import json

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
# Directory prefix used for every input CSV read and every output file written
# by this notebook (note the trailing slash: paths are built by concatenation).
data_path = "C:/nlp/tensorflow-ml-nlp-tf2/5.TEXT_SIM/data_in/"

In [20]:
# Preprocessing constants:
#   FILTERS      - regex character class of punctuation removed from each question
#   max_sequence - fixed length every token sequence is padded/truncated to
FILTERS = "([~.,!?\"':;)(])"
max_sequence = 31

# Load the raw training CSV from the data directory.
train_data = pd.read_csv(data_path + 'train.csv', encoding='utf-8')


In [17]:
# Text preview of the first five rows, followed by the (rows, columns) shape
# as the cell's displayed value.
print(train_data.head(n=5))
train_data.shape

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


(404290, 6)

In [13]:
# Split the training rows by label: is_duplicate == 1 (duplicate pairs) and
# is_duplicate == 0 (non-duplicate pairs).
train_pos = train_data.loc[train_data['is_duplicate'] == 1]
train_neg = train_data.loc[train_data['is_duplicate'] == 0]

print(train_pos.shape)
print(train_neg.shape)

# Surplus of negatives over positives, and the fraction of negatives that
# corresponds to the positive count. Guarded so an empty negative set does not
# raise ZeroDivisionError.
class_dif = len(train_neg) - len(train_pos)
sample = 1 - (class_dif / len(train_neg)) if len(train_neg) else 0.0

# Down-sample the negatives to the size of the positive class.
#  - Sampling by exact row count (n=) avoids any float rounding of frac * len.
#  - min() keeps all negatives when they are already the smaller class, where
#    frac > 1 would make DataFrame.sample raise.
#  - A fixed random_state makes the balanced set identical on every
#    Restart & Run All.
train_neg = train_neg.sample(n=min(len(train_pos), len(train_neg)), random_state=42)

(149263, 6)
(255027, 6)


In [11]:
# Report the row count of each class (messages are in Korean: "number of
# duplicate questions" / "number of non-duplicate questions").
print("중복 질문 개수: {}".format(train_pos.shape[0]))
print("중복이 아닌 질문 개수: {}".format(train_neg.shape[0]))

중복 질문 개수: 149263
중복이 아닌 질문 개수: 149263


In [14]:
# Stack the negative rows on top of the positive rows and rebind train_data
# to the combined frame (original row indices are kept, not reset).
train_data = pd.concat([train_neg, train_pos], axis=0)

In [18]:
# Only the final expression of a cell is rendered, so the head() result is
# computed but not shown; the displayed value is the (rows, columns) shape.
train_data.head(n=5)
train_data.shape

(404290, 6)

In [21]:
# Compile the punctuation pattern once; it is reused for every question.
change_filter = re.compile(FILTERS)

# Cast every cell to str so that non-string values (presumably missing
# questions - TODO confirm) can still be passed through the regex.
questions1 = list(map(str, train_data['question1']))
questions2 = list(map(str, train_data['question2']))

# Strip the filtered punctuation and lower-case each question.
f_questions1 = [change_filter.sub("", text).lower() for text in questions1]
f_questions2 = [change_filter.sub("", text).lower() for text in questions2]

In [26]:
# Fit a single tokenizer on both question columns so that question1 and
# question2 share one word index.
token = Tokenizer()
token.fit_on_texts([*f_questions1, *f_questions2])


In [27]:
# Convert each cleaned question into its list of word indices, using the
# fitted tokenizer for both columns.
q1_sequence, q2_sequence = map(
    token.texts_to_sequences, (f_questions1, f_questions2)
)


In [30]:
# Bring every index sequence to exactly max_sequence entries; padding='post'
# appends the padding after the real tokens.
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=max_sequence, padding='post')
    for sequences in (q1_sequence, q2_sequence)
)

In [31]:
# Take a copy of the tokenizer's word -> index mapping. Assigning
# token.word_index directly would alias it, and the "<PAD>" entry added below
# would then be written into the tokenizer's own vocabulary as well.
word_vocab = dict(token.word_index)
# Reserve index 0 for padding (0 is the fill value pad_sequences uses by default).
word_vocab["<PAD>"] = 0

# Label vector taken from the is_duplicate column, as integers.
labels = np.array(train_data['is_duplicate'], dtype=int)

# Sanity-check the array shapes and the vocabulary size.
print(
    'Shape of question1 data: {}'.format(q1_data.shape),
    'Shape of question2 data:{}'.format(q2_data.shape),
    'Shape of label: {}'.format(labels.shape),
    "Words in index: {}".format(len(word_vocab)),
    sep='\n',
)

Shape of question1 data: (404290, 31)
Shape of question2 data:(404290, 31)
Shape of label: (404290,)
Words in index: 95569


In [35]:
# Bundle the vocabulary and its size into the config dict that is later
# written out as JSON.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

data_configs

{'vocab': {'the': 1,
  'what': 2,
  'is': 3,
  'how': 4,
  'i': 5,
  'a': 6,
  'to': 7,
  'in': 8,
  'do': 9,
  'of': 10,
  'are': 11,
  'and': 12,
  'can': 13,
  'for': 14,
  'you': 15,
  'why': 16,
  'my': 17,
  'best': 18,
  'it': 19,
  'on': 20,
  'does': 21,
  'or': 22,
  'be': 23,
  'which': 24,
  'if': 25,
  'some': 26,
  'have': 27,
  'that': 28,
  'with': 29,
  'get': 30,
  'should': 31,
  'an': 32,
  'from': 33,
  'your': 34,
  'india': 35,
  'will': 36,
  'when': 37,
  'people': 38,
  'who': 39,
  'like': 40,
  'at': 41,
  'good': 42,
  'would': 43,
  'there': 44,
  'as': 45,
  'about': 46,
  'not': 47,
  'between': 48,
  'one': 49,
  'most': 50,
  'we': 51,
  'make': 52,
  'way': 53,
  'quora': 54,
  'did': 55,
  'where': 56,
  'by': 57,
  'any': 58,
  'was': 59,
  'me': 60,
  'life': 61,
  'so': 62,
  'after': 63,
  'time': 64,
  'this': 65,
  'they': 66,
  'money': 67,
  'know': 68,
  'difference': 69,
  'has': 70,
  'learn': 71,
  'am': 72,
  'whats': 73,
  'new': 74,
  

In [36]:
# Output file names for the preprocessed training artifacts (joined onto
# data_path below).
train_q1_data = 'train_q1.npy'
train_q2_data = 'train_q2.npy'
train_label = 'train_label.npy'
data_config = 'data_configs.json'

# Pass the path to np.save so numpy opens AND closes the file itself; the
# names already end in ".npy", so no extension is appended.
np.save(data_path + train_q1_data, q1_data)
np.save(data_path + train_q2_data, q2_data)
# Save the label ARRAY (`labels`). `train_label` is only the file-name string;
# writing it here would store the text 'train_label.npy' instead of the labels.
np.save(data_path + train_label, labels)

# Write the vocabulary config as JSON; the context manager guarantees the
# file is flushed and closed.
with open(data_path + data_config, 'w') as config_file:
    json.dump(data_configs, config_file)

In [37]:
# Load the raw test CSV from the data directory.
test_data = pd.read_csv(data_path + 'test.csv', encoding='utf-8')

# Build a boolean mask that is True only where test_id is exactly a Python
# int, keep those rows, then remove rows that are exact duplicates.
valid_ids = [type(raw_id) is int for raw_id in test_data['test_id']]
test_data = test_data[valid_ids].drop_duplicates()

  interactivity=interactivity, compiler=compiler, result=result)


In [64]:
# Cast every test question to str so the regex can be applied to each cell.
test_q1 = list(map(str, test_data['question1']))
test_q2 = list(map(str, test_data['question2']))

# Same cleaning as the training questions: remove the filtered punctuation
# with the already-compiled change_filter pattern, then lower-case.
f_test_q1 = [change_filter.sub("", text).lower() for text in test_q1]
f_test_q2 = [change_filter.sub("", text).lower() for text in test_q2]

In [65]:
# Index the cleaned test questions with the tokenizer fitted on the training
# questions.
test_q1_sequence, test_q2_sequence = map(
    token.texts_to_sequences, (f_test_q1, f_test_q2)
)

# Pad/truncate to max_sequence entries, padding after the real tokens.
test_q1_data, test_q2_data = (
    pad_sequences(sequences, maxlen=max_sequence, padding='post')
    for sequences in (test_q1_sequence, test_q2_sequence)
)

In [66]:
# Collect the test ids into a numpy array alongside the padded question data.
test_id = np.array(test_data['test_id'])

# Sanity-check that the three arrays have matching row counts.
print(
    'Shape of question1 data: {}'.format(test_q1_data.shape),
    'Shape of question2 data:{}'.format(test_q2_data.shape),
    'Shape of ids: {}'.format(test_id.shape),
    sep='\n',
)

Shape of question1 data: (2345796, 31)
Shape of question2 data:(2345796, 31)
Shape of ids: (2345796,)


In [68]:
# Output file names for the preprocessed test artifacts (joined onto
# data_path below).
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'
TEST_ID_DATA = 'test_id.npy'

# Hand np.save the path instead of an inline open(...) handle: numpy then
# opens and closes each file itself, so no handle is left unclosed. The names
# already end in ".npy", so the written paths are unchanged.
np.save(data_path + TEST_Q1_DATA, test_q1_data)
np.save(data_path + TEST_Q2_DATA, test_q2_data)
np.save(data_path + TEST_ID_DATA, test_id)