# Quora Question Pairs _ simple LSTM

- Word2Vec으로 사전 학습된 Word Vector를 활용하여 LSTM 모형을 만들어봅니다. lystdo의 [해당 커널](https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings)을 주로 참조했습니다.

### 1. loading packages, import data

In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


LSTM모형의 기본 파라미터를 지정합니다.

In [2]:
# Preprocessing / model-input settings shared by the cells below.
MAX_SEQUENCE_LENGTH = 30       # maximum sequence length (in tokens)
MAX_NB_WORDS = 200 * 1000      # maximum vocabulary size considered for training
EMBEDDING_DIM = 300            # dimensionality of each word vector
VALIDATION_SPLIT = 0.1         # fraction of the training pairs held out for validation

In [3]:
# Randomly drawn hyperparameters; a fresh configuration is sampled on every run.
# The draw order (lstm size, dense size, lstm dropout, dense dropout) is kept
# so that a seeded generator yields the same configuration as before.
num_lstm = np.random.randint(low=175, high=275)    # LSTM hidden size
num_dense = np.random.randint(low=100, high=150)   # dense-layer width
rate_drop_lstm = 0.15 + 0.25 * np.random.rand()    # dropout rate used by the LSTM
rate_drop_dense = 0.15 + 0.25 * np.random.rand()   # dropout rate used around the dense layer

pretrained word2vec vector를 불러옵니다.

In [4]:
# Load the pretrained word vectors from the binary word2vec file.
word2vec = KeyedVectors.load_word2vec_format(
    fname='../../Analysis/WordVectors/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [7]:
# Text-cleaning helper applied to every question before tokenization.

# Ordered (compiled pattern, replacement) pairs.  Order matters: later rules
# rely on the output of earlier ones (e.g. "-" is padded with spaces before
# "e - mail" is merged back into "email").
_TEXT_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Replace every character outside the allowed set with a space.  The
        # hyphen is escaped so it is a literal "-" rather than the range
        # "+" .. "=", which silently let ";" and "<" through; ":" is listed
        # explicitly because a later rule pads it.
        (r"[^A-Za-z0-9^,!./'+\-=:]", " "),
        # Contractions.
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Punctuation: drop separators, isolate operators as their own tokens.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"-", " - "),
        (r"=", " = "),
        (r"'", " "),
        # "50k" -> "50000".  The word boundary keeps units such as "5kg" intact.
        (r"(\d+)k\b", r"\g<1>000"),
        (r":", " : "),
        # Abbreviations that were split apart by the punctuation rules above.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "1990s" -> "1990".  The previous pattern r"\0s" was an octal escape
        # for a NUL byte followed by "s" and could never match; this is the
        # presumed intent, given that the replacement is "0".
        (r"0s\b", "0"),
        # "9 11" -> "911" without swallowing the spaces around it (the old rule
        # fused it with the neighbouring words).
        (r"\b9 11\b", "911"),
        (r"e - mail", "email"),
        # Only merge the standalone letters "j k", not e.g. "raj kumar".
        (r"\bj k\b", "jk"),
        # Collapse runs of whitespace produced by the rules above.
        (r"\s{2,}", " "),
    )
)


def text_to_wordlist(text: str, remove_stopwords: bool = False,
                     stem_words: bool = False) -> str:
    """Normalize a raw question string for tokenization.

    The text is lower-cased, optionally stripped of English stopwords, passed
    through an ordered list of regex substitutions (contraction expansion,
    punctuation handling, a few abbreviation fixes) and optionally stemmed.

    Args:
        text: Raw question text.
        remove_stopwords: If true, drop NLTK English stopwords.  This happens
            before the regex clean-up, so only whitespace-delimited tokens
            that exactly equal a stopword are removed.
        stem_words: If true, stem every word with the Snowball English stemmer
            after the clean-up.

    Returns:
        The cleaned text as a single space-separated string.  Unless stemming
        is enabled, a leading or trailing space can remain; the whitespace
        collapse only merges runs of two or more whitespace characters.
    """
    # Lower-case and split on whitespace.
    words = text.lower().split()

    # Keep only the words that are not stopwords.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # Re-join with single spaces, then apply the clean-up rules in order.
    text = " ".join(words)
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)

    # Reduce each word to its stem and join the stems back together.
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [9]:
# Load the question pairs, cleaning each question with text_to_wordlist as it is read.

texts_1, texts_2, labels = [], [], []

# Training file: columns 3 and 4 hold the two questions, column 5 the integer label.
with codecs.open('../../Kaggle_IO/QuoraQuestionPairs/input/train.csv', encoding='utf-8') as train_file:
    train_rows = csv.reader(train_file, delimiter=',')
    header = next(train_rows)  # consume the header row

    for row in train_rows:
        texts_1.append(text_to_wordlist(row[3]))
        texts_2.append(text_to_wordlist(row[4]))
        labels.append(int(row[5]))

# Test file, handled the same way: column 0 is the row id, columns 1 and 2 the questions.
test_texts_1, test_texts_2, test_ids = [], [], []

with codecs.open('../../Kaggle_IO/QuoraQuestionPairs/input/test.csv', encoding='utf-8') as test_file:
    test_rows = csv.reader(test_file, delimiter=',')
    header = next(test_rows)  # consume the header row

    for row in test_rows:
        test_texts_1.append(text_to_wordlist(row[1]))
        test_texts_2.append(text_to_wordlist(row[2]))
        test_ids.append(row[0])

In [14]:
# Fit a Tokenizer on all cleaned questions (train and test) to build the
# word -> index dictionary.

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(
    texts_1 + texts_2 + test_texts_1 + test_texts_2
)

In [15]:
# Convert every question into its sequence of word indices.

sequences_1, sequences_2 = (
    tokenizer.texts_to_sequences(questions)
    for questions in (texts_1, texts_2)
)

test_sequences_1, test_sequences_2 = (
    tokenizer.texts_to_sequences(questions)
    for questions in (test_texts_1, test_texts_2)
)

In [18]:
# Grab the tokenizer's word -> index mapping; it is used below to size and
# fill the embedding matrix.

word_index = tokenizer.word_index 

In [20]:
# Pad every index sequence to MAX_SEQUENCE_LENGTH and convert the labels / ids
# to numpy arrays.

data_1, data_2, test_data_1, test_data_2 = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (sequences_1, sequences_2, test_sequences_1, test_sequences_2)
)

labels = np.array(labels)
test_ids = np.array(test_ids)

In [21]:
# Build the pretrained embedding matrix: row i holds the word2vec vector of the
# word whose tokenizer index is i.  Rows for words missing from word2vec (and
# row 0, which no word in word_index is mapped to here) stay all-zero.

nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Keras' Tokenizer keeps every fitted word in word_index regardless of
    # num_words, so once the vocabulary exceeds MAX_NB_WORDS some indices fall
    # outside the matrix; skip them instead of raising IndexError.
    if i >= nb_words:
        continue
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)

In [22]:
# Split the pair indices into train and validation subsets via a random permutation.

perm = np.random.permutation(len(data_1))
n_train = int(len(data_1) * (1 - VALIDATION_SPLIT))
idx_train, idx_valid = perm[:n_train], perm[n_train:]

In [23]:
# Build order-symmetric sets: each pair appears once as (question 1, question 2)
# and once swapped as (question 2, question 1), so the labels are repeated twice.

data_1_train = np.concatenate([data_1[idx_train], data_2[idx_train]], axis=0)
data_2_train = np.concatenate([data_2[idx_train], data_1[idx_train]], axis=0)
labels_train = np.tile(labels[idx_train], 2)

data_1_val = np.concatenate([data_1[idx_valid], data_2[idx_valid]], axis=0)
data_2_val = np.concatenate([data_2[idx_valid], data_1[idx_valid]], axis=0)
labels_val = np.tile(labels[idx_valid], 2)

In [24]:
# Per-sample validation weights (class re-weighting).

re_weight = True

if re_weight:
    # Label 0 gets 1.309028344, every other sample 0.472001959.
    weight_val = np.where(labels_val == 0, 1.309028344, 0.472001959)
else:
    weight_val = np.ones(len(labels_val))

In [25]:
# Class weights passed to model.fit; None leaves both classes unweighted.
class_weight = {0: 1.309028344, 1: 0.472001959} if re_weight else None

In [27]:
# Embedding layer initialised from the pretrained matrix and kept frozen.

embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [28]:
# A single LSTM layer object; the same dropout rate is used for the inputs and
# for the recurrent connections.
lstm_layer = LSTM(
    units=num_lstm,
    dropout=rate_drop_lstm,
    recurrent_dropout=rate_drop_lstm,
)

In [29]:
# First input branch: map the int32 index sequence through the embedding
# layer, then feed the embedded sequence to the LSTM.

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

In [32]:
# Second input branch: map the int32 index sequence through the embedding
# layer, then feed the embedded sequence to the LSTM.  The embedding_layer and
# lstm_layer objects are reused here rather than newly created.

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

In [34]:
# Classification head: concatenate the two LSTM outputs -> dropout -> batch
# normalization -> fully connected (ReLU) layer -> dropout -> batch
# normalization -> single sigmoid output.

# Join the two LSTM outputs into one feature vector.
merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Hidden dense layer of width num_dense, regularized the same way.
merged = Dense(num_dense, activation='relu')(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# One sigmoid unit for the binary target.
preds = Dense(1, activation='sigmoid')(merged)

In [35]:
# Assemble the two-input model and compile it for binary classification.
model = Model(
    inputs=[sequence_1_input, sequence_2_input],
    outputs=preds,
)
model.compile(
    optimizer='nadam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [40]:
# Output path prefix that encodes this run's sampled hyperparameters.
STAMP = (
    '../../Kaggle_IO/QuoraQuestionPairs/submission/lstm_%d_%d_%.2f_%.2f'
    % (num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)
)

In [41]:
# Training callbacks: stop once val_loss has not improved for 5 epochs, and
# keep only the best weights on disk.
bst_model_path = STAMP + '.h5'
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint(
    filepath=bst_model_path,
    save_best_only=True,
    save_weights_only=True,
)

In [42]:
# Train with class weighting; validation uses the per-sample weights in weight_val.
hist = model.fit(
    x=[data_1_train, data_2_train],
    y=labels_train,
    batch_size=2048,
    epochs=200,
    shuffle=True,
    class_weight=class_weight,
    validation_data=([data_1_val, data_2_val], labels_val, weight_val),
    callbacks=[early_stopping, model_checkpoint],
)

Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200


In [43]:
# Restore the best checkpoint before predicting.  Early stopping only halts
# after val_loss has failed to improve for several epochs, so the weights left
# in memory are not the ones that produced the best validation loss; the best
# ones were written to bst_model_path by the ModelCheckpoint callback.
model.load_weights(bst_model_path)

# Best (lowest) validation loss seen during training; used to tag the output files.
bst_val_score = min(hist.history['val_loss'])

In [45]:
# Predict with both question orders and average the two, so the result does
# not depend on which question is fed to which input.
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id': test_ids, 'is_duplicate': preds.ravel()})

# Separate the validation score from the hyperparameter stamp with "_"; the
# previous name fused the dropout rate and the score (e.g. "..._0.250.3148_.csv").
submission.to_csv('%s_%.4f.csv' % (STAMP, bst_val_score), index=False)

In [62]:
# Write a second copy whose file name starts with the validation score.  STAMP
# contains a directory prefix, so the score is inserted in front of the file
# name only; prefixing the whole path would produce a non-existent directory
# component such as "0.3148_..".
_out_dir, _stamp_name = os.path.split(STAMP)
submission.to_csv(
    os.path.join(_out_dir, '%.4f_' % bst_val_score + _stamp_name + '.csv'),
    index=False,
)

.31475 (1094 / 3307, 33%)