In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, Flatten, Embedding, LSTM, merge, TimeDistributed, concatenate
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from tqdm import tqdm

from keras.callbacks import EarlyStopping, ModelCheckpoint

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is enabled with initial size: 75.0% of memory, cuDNN not available)


In [2]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [3]:
path = '/cinc/data/quora-question-pairs/'
save_path = '/cinc/data/quora-question-pairs/save_data/'
model_path = '/cinc/data/quora-question-pairs/model/'

word2vec_file = '/cinc/data/word2vec/GoogleNews-vectors-negative300.bin'

glove_file = '/cinc/data/glove/glove.840B.300d.txt'

MAX_SEQUENCE_LENGTH = 30
EMBEDDING_DIM = 300

# load data

In [4]:
train_sequence_1 = load_array(save_path + 'train_sequence_1')
train_sequence_2 = load_array(save_path + 'train_sequence_2')

test_sequence_1 = load_array(save_path + 'test_sequence_1')
test_sequence_2 = load_array(save_path + 'test_sequence_2')

train_labels = load_array(save_path + 'train_labels')

word_index = load_array(save_path + 'word_index')[0]

In [5]:
nb_words = len(word_index) + 1
nb_words

120501

In [6]:
train_sequence_1.shape, train_sequence_2.shape, test_sequence_1.shape, test_sequence_2.shape

((404290, 30), (404290, 30), (2345796, 30), (2345796, 30))

# building embedding

In [6]:
#embedding_matrix = load_array(save_path + 'embedding_matrix')
#embedding_matrix.shape

In [10]:
#word_index

In [9]:
embeddings_index = {}
f = open(glove_file)
count = 0
for line in f:
    count = count+1
    if (count % 100000) == 0:
        print 'processing ', count
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

processing  100000
processing  200000
processing  300000
processing  400000
processing  500000
processing  600000
processing  700000
processing  800000
processing  900000
processing  1000000
processing  1100000
processing  1200000
processing  1300000
processing  1400000
processing  1500000
processing  1600000
processing  1700000
processing  1800000
processing  1900000
processing  2000000
processing  2100000


In [10]:
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
save_array(save_path + 'embedding_matrix', embedding_matrix)

In [7]:
embedding_matrix = load_array(save_path + 'embedding_matrix')
embedding_matrix.shape

(120501, 300)

# build X_all, y_all

In [12]:
(X_all_1, X_all_2) = (train_sequence_1, train_sequence_2)
X_all_1.shape, X_all_2.shape

((404290, 30), (404290, 30))

In [13]:
y_all = np.array(train_labels)
y_all.shape, y_all[0]

((404290,), 0)

# random split

In [30]:
nb_samples = train_sequence_1.shape[0]
#nb_samples = 1000
msk = np.random.rand(nb_samples) < 0.8
msk

array([ True, False, False, ...,  True,  True,  True], dtype=bool)

In [11]:
X_train_1 = X_all_1[msk]
X_train_2 = X_all_2[msk]

X_valid_1 = X_all_1[~msk]
X_valid_2 = X_all_2[~msk]

y_train = y_all[msk]
y_valid = y_all[~msk]

(X_train_1.shape, X_train_1.shape, y_train.shape, X_valid_1.shape, X_valid_2.shape, y_valid.shape)

((323814, 30), (323814, 30), (323814,), (80476, 30), (80476, 30), (80476,))

# assign weights

In [12]:
weight_valid = np.ones(len(y_valid))
weight_valid *= 0.472001959
weight_valid[y_valid==0] = 1.309028344
weight_valid[0:10]

In [11]:
class_weight = {1.309028344, 0.472001959}

# cross validation

In [18]:
def createModel():
    embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable = False
    )
    num_ltsm = 200
    lstm_layer = LSTM(num_ltsm, dropout=0.3, recurrent_dropout=0.3)

    input_sequence_1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedding_sequence_1 = embedding_layer(input_sequence_1)
    x1 = lstm_layer(embedding_sequence_1)

    input_sequence_2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedding_sequence_2 = embedding_layer(input_sequence_2)
    x2 = lstm_layer(embedding_sequence_2)

    merged = concatenate([x1, x2])
    merged = BatchNormalization()(merged)
    merged = Dropout(0.3)(merged)

    merged = Dense(256, activation='relu')(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(0.3)(merged)

    pred_layer = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[input_sequence_1, input_sequence_2],
                  outputs=pred_layer
                 )
    #model.summary()

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    return model


In [19]:
def trainModel(i, model, X_train_1, X_train_2, y_train, X_valid_1, X_valid_2, y_valid):
    model_filename = model_path + 'glove_lstm_class_weight_keras2_Dense256_cv_' + str(i) + '.h5'
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, verbose=0),
        ModelCheckpoint(model_filename, monitor='val_loss', save_best_only=True, verbose=0)
    ]
    
    class_weight = {1.309028344, 0.472001959}
    
    weight_valid = np.ones(len(y_valid))
    weight_valid *= 0.472001959
    weight_valid[y_valid==0] = 1.309028344
    #weight_valid[0:10]

    model.load_weights(model_path + 'glove_lstm_class_weight_keras2_Dense256.h5')
    #model.load_weights(model_filename)
    hist = model.fit([X_train_1, X_train_2], y_train, 
                     validation_data=([X_valid_1, X_valid_2], y_valid, weight_valid),
                     class_weight=class_weight,
                     batch_size=2048, epochs=200, verbose=2, shuffle=True,
                     callbacks=callbacks
                    )
    
    model.load_weights(model_filename)
    pred_valid = model.predict([X_valid_1, X_valid_2], batch_size=8192, verbose=2)
    #print sum(np.round(pred_valid) == y_valid.reshape(len(y_valid), 1)) * 1.0 / len(y_valid)
    score = log_loss(y_valid, pred_valid)
    print ('Score log_loss: ', score)

In [20]:
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, random_state=37, shuffle=True)

In [21]:
train_splits = []
valid_splits = []
for train_index, valid_index in skf.split(X_all_1, y_all):
    print train_index.shape, valid_index.shape, train_index[0:10], valid_index[0:4]
    train_splits.append(train_index)
    valid_splits.append(valid_index)

print len(train_splits), len(valid_splits)
for i in range(n_folds):
    train_index = train_splits[i]
    valid_index = valid_splits[i]
    print train_index.shape, valid_index.shape, train_index[0:10], valid_index[0:4]


(323431,) (80859,) [ 0  1  3  4  6  7  8  9 10 11] [ 2  5 17 33]
(323431,) (80859,) [ 2  3  4  5  6  7  8  9 12 13] [ 0  1 10 11]
(323432,) (80858,) [ 0  1  2  3  4  5  7  8  9 10] [ 6 12 18 23]
(323433,) (80857,) [ 0  1  2  3  4  5  6  7  8 10] [ 9 20 21 22]
(323433,) (80857,) [ 0  1  2  5  6  9 10 11 12 17] [3 4 7 8]
5 5
(323431,) (80859,) [ 0  1  3  4  6  7  8  9 10 11] [ 2  5 17 33]
(323431,) (80859,) [ 2  3  4  5  6  7  8  9 12 13] [ 0  1 10 11]
(323432,) (80858,) [ 0  1  2  3  4  5  7  8  9 10] [ 6 12 18 23]
(323433,) (80857,) [ 0  1  2  3  4  5  6  7  8 10] [ 9 20 21 22]
(323433,) (80857,) [ 0  1  2  5  6  9 10 11 12 17] [3 4 7 8]


In [22]:
models = []
#for i, (train, valid) in enumerate(skf):
for i in range(5):
    train_index = train_splits[i]
    valid_index = valid_splits[i]
    print i, train_index.shape, valid_index.shape, train_index[0:10], valid_index[0:4]
    model = createModel()
    models.append(model)
    
    trainModel(i, model, X_all_1[train_index], X_all_2[train_index], y_all[train_index], 
               X_all_1[valid_index], X_all_2[valid_index], y_all[valid_index])
    print

0 (323431,) (80859,) [ 0  1  3  4  6  7  8  9 10 11] [ 2  5 17 33]
Train on 323431 samples, validate on 80859 samples
Epoch 1/200
70s - loss: 0.4742 - acc: 0.7726 - val_loss: 0.3839 - val_acc: 0.7867
Epoch 2/200
73s - loss: 0.4682 - acc: 0.7759 - val_loss: 0.3846 - val_acc: 0.7864
Epoch 3/200
74s - loss: 0.4635 - acc: 0.7787 - val_loss: 0.3727 - val_acc: 0.7854
Epoch 4/200
74s - loss: 0.4591 - acc: 0.7812 - val_loss: 0.3865 - val_acc: 0.7853
Epoch 5/200
74s - loss: 0.4555 - acc: 0.7836 - val_loss: 0.3986 - val_acc: 0.7861
Epoch 6/200
74s - loss: 0.4512 - acc: 0.7862 - val_loss: 0.4032 - val_acc: 0.7845
Epoch 7/200
74s - loss: 0.4466 - acc: 0.7879 - val_loss: 0.3818 - val_acc: 0.7870
Epoch 8/200
74s - loss: 0.4431 - acc: 0.7914 - val_loss: 0.4026 - val_acc: 0.7851
Epoch 9/200
74s - loss: 0.4408 - acc: 0.7921 - val_loss: 0.3751 - val_acc: 0.7876
('Score log_loss: ', 0.45377006892675636)

1 (323431,) (80859,) [ 2  3  4  5  6  7  8  9 12 13] [ 0  1 10 11]
Train on 323431 samples, validate 

In [None]:
#models[0].optimizer.lr.get_value()

# prediction

In [36]:
(X_test_1, X_test_2) = (test_sequence_1, test_sequence_2)
X_test_1.shape, X_test_2.shape

((2345796, 30), (2345796, 30))

In [37]:
for i in tqdm(range(n_folds)):
    pred_test_i = models[i].predict([X_test_1, X_test_1], batch_size=8192, verbose=2)
    #print pred_test_i[0:5][:,0]
    if (i == 0):
        pred_test = pred_test_i
    else:
        pred_test += pred_test_i
pred_test = pred_test / 5
print pred_test[0:5][:,0]

100%|██████████| 5/5 [16:14<00:00, 194.88s/it]

[ 0.02602234  0.30488801  0.08215218  0.36361808  0.56934977]





# generate submission file

In [38]:
df_submission = pd.read_csv(path + 'submission/sample_submission.csv')
print df_submission.shape
df_submission.head()

(2345796, 2)


Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [39]:
df_submission['is_duplicate'] = pred_test
df_submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.026022
1,1,0.304888
2,2,0.082152
3,3,0.363618
4,4,0.56935


In [40]:
df_submission.to_csv(path + 'submission/quora_glove_lstm_class_weight_keras2_cv_attempt2_20170605.csv', index=False)