In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, Flatten, Embedding, LSTM, merge, TimeDistributed, concatenate, PReLU
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

from keras.callbacks import EarlyStopping, ModelCheckpoint

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is enabled with initial size: 75.0% of memory, cuDNN not available)


In [2]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [3]:
path = '/cinc/data/quora-question-pairs/'
save_path = '/cinc/data/quora-question-pairs/save_data/'
model_path = '/cinc/data/quora-question-pairs/model/'

word2vec_file = '/cinc/data/word2vec/GoogleNews-vectors-negative300.bin'

glove_file = '/cinc/data/glove/glove.840B.300d.txt'

MAX_SEQUENCE_LENGTH = 30
EMBEDDING_DIM = 300

# load data

In [4]:
train_sequence_1 = load_array(save_path + 'train_sequence_1')
train_sequence_2 = load_array(save_path + 'train_sequence_2')

test_sequence_1 = load_array(save_path + 'test_sequence_1')
test_sequence_2 = load_array(save_path + 'test_sequence_2')

train_labels = load_array(save_path + 'train_labels')

word_index = load_array(save_path + 'word_index')[0]

In [5]:
nb_words = len(word_index) + 1
nb_words

120501

In [6]:
train_sequence_1.shape, train_sequence_2.shape, test_sequence_1.shape, test_sequence_2.shape

((404290, 30), (404290, 30), (2345796, 30), (2345796, 30))

# building embedding

In [6]:
#embedding_matrix = load_array(save_path + 'embedding_matrix')
#embedding_matrix.shape

In [10]:
#word_index

In [9]:
embeddings_index = {}
f = open(glove_file)
count = 0
for line in f:
    count = count+1
    if (count % 100000) == 0:
        print 'processing ', count
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

processing  100000
processing  200000
processing  300000
processing  400000
processing  500000
processing  600000
processing  700000
processing  800000
processing  900000
processing  1000000
processing  1100000
processing  1200000
processing  1300000
processing  1400000
processing  1500000
processing  1600000
processing  1700000
processing  1800000
processing  1900000
processing  2000000
processing  2100000


In [10]:
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
save_array(save_path + 'embedding_matrix', embedding_matrix)

In [7]:
embedding_matrix = load_array(save_path + 'embedding_matrix')
embedding_matrix.shape

(120501, 300)

# leak input

In [8]:
train_data = pd.read_csv(save_path + '../save_data_tf_idf/train_data.csv')
train_data.shape

(404290, 28)

In [45]:
test_data = pd.read_csv(save_path + '../save_data_tf_idf/test_data.csv')
test_data.shape

(2345796, 28)

In [10]:
predictors = train_data.columns[6:]
predictors, len(predictors)

(Index([u'diff_len', u'ratio_len', u'cos_sim', u'log_diff_len', u'test_id',
        u'unigram_jaccard', u'unigram_jaccard_all', u'unigram_jaccard_max',
        u'bigram_jaccard', u'bigram_jaccard_all', u'bigram_jaccard_max',
        u'trigram_jaccard', u'trigram_jaccard_all', u'trigram_jaccard_max',
        u'intersection_count', u'fuzz_qratio', u'fuzz_wratio',
        u'fuzz_partial_ratio', u'fuzz_partial_token_set_ratio',
        u'fuzz_partial_token_sort_ratio', u'fuzz_token_set_ratio',
        u'fuzz_token_sort_ratio'],
       dtype='object'), 22)

In [11]:
train_data[predictors].head()

Unnamed: 0,diff_len,ratio_len,cos_sim,log_diff_len,test_id,unigram_jaccard,unigram_jaccard_all,unigram_jaccard_max,bigram_jaccard,bigram_jaccard_all,...,trigram_jaccard_all,trigram_jaccard_max,intersection_count,fuzz_qratio,fuzz_wratio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio
0,9,0.865672,0.895532,2.198335,-1,0.99995,0.463411,0.863623,0.906956,0.462806,...,0.462181,0.859362,0,93,95,100,100,88,100,93
1,37,0.593407,0.474331,3.611188,-1,0.666639,0.356641,0.554342,0.499993,0.319147,...,0.280574,0.389996,0,67,86,75,100,73,86,63
2,14,0.810811,0.380873,2.639771,-1,0.739098,0.386361,0.629622,0.430374,0.284613,...,0.234373,0.306119,0,43,60,47,100,71,63,63
3,13,0.793651,0.0,2.565718,-1,0.565193,0.315312,0.46052,0.108107,0.073394,...,0.0,0.0,0,9,27,14,33,30,28,24
4,35,0.533333,0.206084,3.555634,-1,0.666639,0.318581,0.467526,0.338023,0.225223,...,0.165136,0.1978,0,36,86,56,100,67,67,47


In [12]:
train_leak = train_data[predictors].as_matrix()
train_leak.shape

(404290, 22)

In [46]:
test_leak = test_data[predictors].as_matrix()
test_leak.shape

(2345796, 22)

# random split

In [19]:
nb_samples = train_sequence_1.shape[0]
#nb_samples = 1000
msk = np.random.rand(nb_samples) < 0.8
msk

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [27]:
(X_all_1, X_all_2, X_all_3) = (train_sequence_1, train_sequence_2, train_leak)
X_all_1.shape, X_all_2.shape, X_all_3.shape

((404290, 30), (404290, 30), (404290, 22))

In [28]:
y_all = np.array(train_labels)
y_all.shape, y_all[0]

((404290,), 0)

In [29]:
X_train_1 = X_all_1[msk]
X_train_2 = X_all_2[msk]
X_train_3 = X_all_3[msk]

X_valid_1 = X_all_1[~msk]
X_valid_2 = X_all_2[~msk]
X_valid_3 = X_all_3[~msk]

y_train = y_all[msk]
y_valid = y_all[~msk]

print (X_train_1.shape, X_train_1.shape, X_train_3.shape, y_train.shape)
print (X_valid_1.shape, X_valid_2.shape, X_valid_3.shape, y_valid.shape)

((323699, 30), (323699, 30), (323699, 22), (323699,))
((80591, 30), (80591, 30), (80591, 22), (80591,))


In [30]:
save_array(save_path + 'msk', msk)

save_array(save_path + 'X_train_1', X_train_1)
save_array(save_path + 'X_train_2', X_train_2)
save_array(save_path + 'X_train_3', X_train_3)

save_array(save_path + 'X_valid_1', X_valid_1)
save_array(save_path + 'X_valid_2', X_valid_2)
save_array(save_path + 'X_valid_3', X_valid_3)

save_array(save_path + 'y_train', y_train)
save_array(save_path + 'y_valid', y_valid)


In [31]:
msk = load_array(save_path + 'msk')

X_train_1 = load_array(save_path + 'X_train_1')
X_train_2 = load_array(save_path + 'X_train_2')
X_train_3 = load_array(save_path + 'X_train_3')

X_valid_1 = load_array(save_path + 'X_valid_1')
X_valid_2 = load_array(save_path + 'X_valid_2')
X_valid_3 = load_array(save_path + 'X_valid_3')

y_train = load_array(save_path + 'y_train')
y_valid = load_array(save_path + 'y_valid')

print (X_train_1.shape, X_train_1.shape, X_train_3.shape, y_train.shape)
print (X_valid_1.shape, X_valid_2.shape, X_valid_3.shape, y_valid.shape)

((323699, 30), (323699, 30), (323699, 22), (323699,))
((80591, 30), (80591, 30), (80591, 22), (80591,))


# assign weights

In [14]:
weight_valid = np.ones(len(y_valid))
weight_valid *= 0.472001959
weight_valid[y_valid==0] = 1.309028344
weight_valid[0:10]

array([ 1.30902834,  0.47200196,  1.30902834,  1.30902834,  0.47200196,
        0.47200196,  1.30902834,  1.30902834,  1.30902834,  1.30902834])

In [15]:
class_weight = {1.309028344, 0.472001959}

# build lstm model

In [32]:
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable = False
)

In [33]:
num_ltsm = 200
lstm_layer = LSTM(num_ltsm, dropout=0.3, recurrent_dropout=0.3)

In [34]:
input_sequence_1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_sequence_1 = embedding_layer(input_sequence_1)
x1 = lstm_layer(embedding_sequence_1)

In [35]:
input_sequence_2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_sequence_2 = embedding_layer(input_sequence_2)
x2 = lstm_layer(embedding_sequence_2)

In [36]:
dense_input = Input(shape=(len(predictors),))

x3 = Dense(256, kernel_initializer = 'he_normal')(dense_input)
x3 = PReLU()(x3)
x3 = BatchNormalization()(x3)
x3 = Dropout(0.4)(x3)

x3 = Dense(512, kernel_initializer = 'he_normal')(x3)
x3 = PReLU()(x3)
x3 = BatchNormalization()(x3)
x3 = Dropout(0.2)(x3)

x3 = Dense(512, kernel_initializer = 'he_normal')(x3)
x3 = PReLU()(x3)
x3 = Dropout(0.2)(x3)

In [37]:
merged = concatenate([x1, x2, x3])
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)

merged = Dense(128, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)

pred_layer = Dense(1, activation='sigmoid')(merged)

In [38]:
model = Model(inputs=[input_sequence_1, input_sequence_2, dense_input],
              outputs=pred_layer
             )
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 22)            0                                            
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 256)           5888        input_6[0][0]                    
____________________________________________________________________________________________________
p_re_lu_4 (PReLU)                (None, 256)           256         dense_6[0][0]                    
____________________________________________________________________________________________________
batch_normalization_5 (BatchNorm (None, 256)           1024        p_re_lu_4[0][0]                  
___________________________________________________________________________________________

In [39]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [40]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=0),
    ModelCheckpoint(model_path + 'glove_lstm_class_weight_leak_keras2.h5', monitor='val_loss', save_best_only=True, verbose=0)
]

In [41]:
hist = model.fit([X_train_1, X_train_2, X_train_3], y_train, 
                 validation_data=([X_valid_1, X_valid_2, X_valid_3], y_valid, weight_valid),
                 class_weight=class_weight,
                 batch_size=2048, epochs=200, verbose=2, shuffle=True,
                 callbacks=callbacks
                )

Train on 323699 samples, validate on 80591 samples
Epoch 1/200
73s - loss: 0.4174 - acc: 0.7895 - val_loss: 1.2273 - val_acc: 0.5935
Epoch 2/200
75s - loss: 0.3422 - acc: 0.8354 - val_loss: 0.6349 - val_acc: 0.7281
Epoch 3/200
75s - loss: 0.3170 - acc: 0.8515 - val_loss: 0.2746 - val_acc: 0.8630
Epoch 4/200
75s - loss: 0.3032 - acc: 0.8600 - val_loss: 0.4147 - val_acc: 0.8031
Epoch 5/200
75s - loss: 0.2940 - acc: 0.8657 - val_loss: 0.2467 - val_acc: 0.8732
Epoch 6/200
75s - loss: 0.2898 - acc: 0.8671 - val_loss: 0.2544 - val_acc: 0.8717
Epoch 7/200
75s - loss: 0.2833 - acc: 0.8706 - val_loss: 0.2946 - val_acc: 0.8621
Epoch 8/200
76s - loss: 0.2800 - acc: 0.8724 - val_loss: 0.2199 - val_acc: 0.8755
Epoch 9/200
76s - loss: 0.2762 - acc: 0.8743 - val_loss: 0.2012 - val_acc: 0.8675
Epoch 10/200
75s - loss: 0.2720 - acc: 0.8764 - val_loss: 0.2871 - val_acc: 0.8645
Epoch 11/200
75s - loss: 0.2686 - acc: 0.8783 - val_loss: 0.2113 - val_acc: 0.8772
Epoch 12/200
75s - loss: 0.2654 - acc: 0.8797

In [42]:
#model.save_weights(model_path + 'word2vec_lstm.h5')
model.load_weights(model_path + 'glove_lstm_class_weight_leak_keras2.h5', by_name=True)

In [50]:
model.optimizer = Adam(lr=0.0001)
hist2 = model.fit([X_train_1, X_train_2, X_train_3], y_train, 
                 validation_data=([X_valid_1, X_valid_2, X_valid_3], y_valid, weight_valid),
                 class_weight=class_weight,
                 batch_size=2048, epochs=200, verbose=2, shuffle=True,
                 callbacks=callbacks
                )

Train on 323699 samples, validate on 80591 samples
Epoch 1/200
76s - loss: 0.3608 - acc: 0.8274 - val_loss: 0.3290 - val_acc: 0.8215
Epoch 2/200
76s - loss: 0.3559 - acc: 0.8300 - val_loss: 0.3257 - val_acc: 0.8204
Epoch 3/200
76s - loss: 0.3520 - acc: 0.8322 - val_loss: 0.3330 - val_acc: 0.8204
Epoch 4/200
76s - loss: 0.3475 - acc: 0.8344 - val_loss: 0.3283 - val_acc: 0.8242
Epoch 5/200
76s - loss: 0.3439 - acc: 0.8373 - val_loss: 0.3407 - val_acc: 0.8241
Epoch 6/200
77s - loss: 0.3397 - acc: 0.8397 - val_loss: 0.3005 - val_acc: 0.8255
Epoch 7/200
76s - loss: 0.3357 - acc: 0.8417 - val_loss: 0.3244 - val_acc: 0.8244
Epoch 8/200
76s - loss: 0.3309 - acc: 0.8443 - val_loss: 0.3412 - val_acc: 0.8251
Epoch 9/200
76s - loss: 0.3285 - acc: 0.8451 - val_loss: 0.3279 - val_acc: 0.8263
Epoch 10/200
76s - loss: 0.3245 - acc: 0.8485 - val_loss: 0.3190 - val_acc: 0.8291
Epoch 11/200
76s - loss: 0.3203 - acc: 0.8507 - val_loss: 0.3122 - val_acc: 0.8290
Epoch 12/200
76s - loss: 0.3181 - acc: 0.8518

In [43]:
pred_train = model.predict([X_train_1, X_train_2, X_train_3], batch_size=8192, verbose=2)
sum(np.round(pred_train) == y_train.reshape(len(y_train), 1)) * 1.0 / len(y_train)

array([ 0.8862678])

In [44]:
pred_valid = model.predict([X_valid_1, X_valid_2, X_valid_3], batch_size=8192, verbose=2)
sum(np.round(pred_valid) == y_valid.reshape(len(y_valid), 1)) * 1.0 / len(y_valid)

array([ 0.8722438])

In [47]:
(X_test_1, X_test_2, X_test_3) = (test_sequence_1, test_sequence_2, test_leak)
X_test_1.shape, X_test_2.shape, X_test_3.shape

((2345796, 30), (2345796, 30), (2345796, 22))

In [51]:
pred_test = model.predict([X_test_1, X_test_2, X_test_3], batch_size=8192, verbose=1)



# generate submission file

In [52]:
df_submission = pd.read_csv(path + 'submission/sample_submission.csv')
print df_submission.shape
df_submission.head()

(2345796, 2)


Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [53]:
df_submission['is_duplicate'] = pred_test
df_submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.007666
1,1,0.066784
2,2,0.169738
3,3,0.002279
4,4,0.296542


In [54]:
df_submission.to_csv(path + 'submission/quora_glove_lstm_class_weight_leak_keras2_20170615.csv', index=False)