In [1]:
from __future__ import division, unicode_literals, print_function
import spacy

import plac
from pathlib import Path
import ujson as json
import numpy
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline

from keras_decomposable_attention_quora import build_model

try:
    import cPickle as pickle
except ImportError:
    import pickle
    
import pandas as pd
import numpy as np

from tqdm import tqdm

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
import bcolz
def save_array(fname, arr):
    """Write `arr` to disk as a bcolz carray rooted at directory `fname`.

    The carray is created with mode='w' and flushed before returning.
    Returns None.
    """
    carray = bcolz.carray(arr, rootdir=fname, mode='w')
    carray.flush()
def load_array(fname):
    """Open the bcolz store at `fname` and return everything in it (a `[:]` slice)."""
    stored = bcolz.open(fname)
    return stored[:]

In [3]:
# Hyper-parameters for the model and the training run.
tree_truncate = False
gru_encode = False
max_length = 100
nr_hidden = 100
dropout = 0.2
learn_rate = 0.001
batch_size = 1024
nr_epoch = 100

# Triple passed to build_model; its first entry is also used as the
# max_length argument of get_word_ids.
shape = (max_length, nr_hidden, 1)

# Options looked up by key further down (get_word_ids, build_model, model.fit).
settings = dict([
    ('lr', learn_rate),
    ('dropout', dropout),
    ('batch_size', batch_size),
    ('nr_epoch', nr_epoch),
    ('tree_truncate', tree_truncate),
    ('gru_encode', gru_encode),
])

In [4]:
# Quora question-pairs locations: data root, cached arrays, model checkpoints.
path = '/cinc/data/quora-question-pairs/'
save_path = '/cinc/data/quora-question-pairs/save_data/'
model_path = '/cinc/data/quora-question-pairs/model/'

# SNLI sample files (the format read_snli parses).
train_loc = '/cinc/data/snli/train_1000/snli_1.0_train.jsonl'
dev_loc = '/cinc/data/snli/dev_1000/snli_1.0_dev.jsonl'

# functions

In [5]:
# Integer class id for each SNLI gold label.
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}


def read_snli(path):
    """Read an SNLI-style JSON-lines file.

    Every non-blank line is parsed as a JSON object with the keys
    'gold_label', 'sentence1' and 'sentence2'. Examples whose gold label
    is '-' are skipped.

    Parameters
    ----------
    path : str
        Location of the .jsonl file.

    Returns
    -------
    (texts1, texts2, labels)
        Two parallel lists of sentences and the `to_categorical` encoding
        of the labels, always with len(LABELS) classes.

    Raises
    ------
    KeyError
        If a line carries a gold label that is neither '-' nor in LABELS.
    """
    texts1, texts2, labels = [], [], []
    with open(path) as file_:
        for line in file_:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            eg = json.loads(line)
            label = eg['gold_label']
            if label == '-':
                continue
            texts1.append(eg['sentence1'])
            texts2.append(eg['sentence2'])
            labels.append(LABELS[label])
    # Pass the class count explicitly: left to infer it from the largest label
    # present, to_categorical would return fewer columns for a file that
    # happens to lack the highest class. Given positionally so the call does
    # not depend on the keyword's name.
    one_hot = to_categorical(numpy.asarray(labels, dtype='int32'), len(LABELS))
    return texts1, texts2, one_hot

In [6]:
# Load the spaCy pipeline registered under the name 'en'. `nlp` is used below
# both to parse question text (nlp.pipe) and as the source of the vocabulary
# handed to get_embeddings (nlp.vocab).
nlp = spacy.load('en')

In [7]:
assert nlp.path is not None

# load training data

In [24]:
# Read the training CSV (decoded as UTF-8) from the train/ folder under `path`.
train_data = pd.read_csv(path + 'train/train.csv', encoding='utf-8')

In [25]:
#train_data = train_data[0:1000]
print (train_data.shape)

(404290, 6)


In [26]:
# Number of rows (question pairs) in the training frame.
nb_samples = train_data.shape[0]

In [27]:
# Sanity check: encode the first question pair and look at the resulting ids.
first_q1 = train_data['question1'][0]
first_q2 = train_data['question2'][0]
print(first_q1)
print(first_q2)

first_docs = list(nlp.pipe([unicode(first_q1), unicode(first_q2)]))
ids_0 = get_word_ids(
    first_docs,
    max_length=shape[0],
    rnn_encode=settings['gru_encode'],
    tree_truncate=settings['tree_truncate'],
)
ids_0

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?


array([[ 143,   10,    4, 1130,   67, 1130, 4418,    5, 2570,   15,  998,
         287,   15,   22,    1,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [ 143,   10,    4, 1130,   67, 1130, 4418,    5, 2570,   15,  998,
         287,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [28]:
# Coerce every question to a unicode string before handing it to spaCy.
# NOTE(review): unicode() of a missing value (NaN) is u'nan', so a missing
# question would be parsed as the word "nan" - confirm whether that is intended.
all_texts1 = train_data['question1'].apply(unicode)
all_texts2 = train_data['question2'].apply(unicode)
(all_texts1[0], all_texts2[0])

(u'What is the step by step guide to invest in share market in india?',
 u'What is the step by step guide to invest in share market?')

In [30]:
# Parse each question column with spaCy and turn the docs into id matrices,
# one matrix per column (question1 first, then question2).
Xs_all = [
    get_word_ids(
        list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
        max_length=shape[0],
        rnn_encode=settings['gru_encode'],
        tree_truncate=settings['tree_truncate'],
    )
    for texts in (all_texts1, all_texts2)
]
X1_all, X2_all = Xs_all

In [14]:
# Training target: the 'is_duplicate' column of the training frame.
y_all = train_data['is_duplicate']

In [17]:
X1_all.shape, X2_all.shape, y_all.shape

((404290, 100), (404290, 100), (404290,))

# augment the training data with swapped pairs (q1 <-> q2)

In [32]:
# Double the data set by appending every pair in swapped order: the second
# half of X1 is the original X2 and vice versa; labels are simply repeated.
X1_all_temp = np.concatenate((X1_all, X2_all), axis=0)
X2_all_temp = np.concatenate((X2_all, X1_all), axis=0)
y_all_temp = np.concatenate((y_all, y_all))

(X1_all_temp.shape, X2_all_temp.shape, y_all_temp.shape)

((808580, 100), (808580, 100), (808580,))

In [35]:
# Make the augmented arrays the working data set and update the row count
# (twice the number of rows in the training frame).
X1_all, X2_all, y_all = X1_all_temp, X2_all_temp, y_all_temp
nb_samples = 2 * train_data.shape[0]

(X1_all.shape, X2_all.shape, y_all.shape, nb_samples)

((808580, 100), (808580, 100), (808580,), 808580)

In [36]:
# Cache the full encoded data set under save_path, one store per array.
for _stem, _arr in (('X1_all', X1_all), ('X2_all', X2_all), ('y_all', y_all)):
    save_array(save_path + _stem, _arr)

In [37]:
# Restore the full encoded data set from the stores under save_path.
X1_all, X2_all, y_all = (
    load_array(save_path + stem) for stem in ('X1_all', 'X2_all', 'y_all')
)
(X1_all.shape, X2_all.shape, y_all.shape)

((808580, 100), (808580, 100), (808580,))

# random split

In [20]:
# Draw the 80/20 train/validation mask per question pair, not per row.
# After the q1 <-> q2 augmentation the *_all arrays hold the pairs in
# consecutive blocks of train_data.shape[0] rows (original order, then
# swapped). An independent draw per row would put a pair in training and its
# swapped copy in validation, leaking training examples into val_loss.
# Tiling one per-pair mask keeps every copy of a pair on the same side.
nb_pairs = train_data.shape[0]
assert nb_samples % nb_pairs == 0, 'nb_samples must be a whole number of copies of the pairs'

# Private, seeded generator: the split is reproducible and the global
# numpy random state is left untouched.
split_rng = np.random.RandomState(20170522)
pair_msk = split_rng.rand(nb_pairs) < 0.8
msk = np.tile(pair_msk, nb_samples // nb_pairs)

In [21]:
# Rows where msk is True go to training, the remaining rows to validation.
X1_train, X1_valid = X1_all[msk], X1_all[~msk]
X2_train, X2_valid = X2_all[msk], X2_all[~msk]
y_train, y_valid = y_all[msk], y_all[~msk]

(
    X1_train.shape, X2_train.shape, y_train.shape,
    X1_valid.shape, X2_valid.shape, y_valid.shape,
)

((647100, 100),
 (647100, 100),
 (647100,),
 (161480, 100),
 (161480, 100),
 (161480,))

In [22]:
# Cache the train/validation split under save_path, one store per array.
for _stem, _arr in (
    ('X1_train', X1_train), ('X2_train', X2_train), ('y_train', y_train),
    ('X1_valid', X1_valid), ('X2_valid', X2_valid), ('y_valid', y_valid),
):
    save_array(save_path + _stem, _arr)


In [11]:
# Restore the cached train/validation split from the stores under save_path.
X1_train, X2_train, y_train = (
    load_array(save_path + stem) for stem in ('X1_train', 'X2_train', 'y_train')
)
X1_valid, X2_valid, y_valid = (
    load_array(save_path + stem) for stem in ('X1_valid', 'X2_valid', 'y_valid')
)

print(X1_train.shape, X2_train.shape, y_train.shape)
print(X1_valid.shape, X2_valid.shape, y_valid.shape)


(647100, 100) (647100, 100) (647100,)
(161480, 100) (161480, 100) (161480,)


# build model

In [12]:
# Build the model from the embeddings derived from spaCy's vocabulary, the
# (max_length, nr_hidden, 1) shape triple and the settings dict.
model = build_model(get_embeddings(nlp.vocab), shape, settings)

In [13]:
# Both callbacks watch the validation loss: one stops training (patience=10),
# the other writes the checkpoint file (save_best_only=True).
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0)
checkpoint = ModelCheckpoint(
    model_path + 'spacy_attention_quora_q1221.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=0,
)
callbacks = [early_stopping, checkpoint]

In [17]:
# Train on the training split and evaluate on the validation split after each
# epoch; epoch count and batch size come from the settings dict.
train_inputs = [X1_train, X2_train]
valid_inputs = [X1_valid, X2_valid]

model.fit(
    train_inputs,
    y_train,
    validation_data=(valid_inputs, y_valid),
    nb_epoch=settings['nr_epoch'],
    batch_size=settings['batch_size'],
    callbacks=callbacks,
    verbose=2,
)

Train on 647100 samples, validate on 161480 samples
Epoch 1/100
375s - loss: 0.3937 - acc: 0.8099 - val_loss: 0.4092 - val_acc: 0.7994
Epoch 2/100
383s - loss: 0.3869 - acc: 0.8145 - val_loss: 0.3963 - val_acc: 0.8074
Epoch 3/100
384s - loss: 0.3826 - acc: 0.8175 - val_loss: 0.3962 - val_acc: 0.8061
Epoch 4/100
374s - loss: 0.3799 - acc: 0.8181 - val_loss: 0.4133 - val_acc: 0.7944
Epoch 5/100
374s - loss: 0.3772 - acc: 0.8202 - val_loss: 0.4026 - val_acc: 0.8009
Epoch 6/100
383s - loss: 0.3749 - acc: 0.8214 - val_loss: 0.3953 - val_acc: 0.8022
Epoch 7/100
383s - loss: 0.3726 - acc: 0.8232 - val_loss: 0.3875 - val_acc: 0.8098
Epoch 8/100
373s - loss: 0.3711 - acc: 0.8242 - val_loss: 0.3896 - val_acc: 0.8079
Epoch 9/100
373s - loss: 0.3687 - acc: 0.8251 - val_loss: 0.3899 - val_acc: 0.8080
Epoch 10/100
373s - loss: 0.3675 - acc: 0.8255 - val_loss: 0.4031 - val_acc: 0.7983
Epoch 11/100
383s - loss: 0.3659 - acc: 0.8267 - val_loss: 0.3804 - val_acc: 0.8134
Epoch 12/100
374s - loss: 0.3651 

<keras.callbacks.History at 0x7f27416f9250>

In [19]:
#model.save_weights(save_path + 'model/spacy_attention_quora_q1221.h5')

In [18]:
# Load weights from the file that the ModelCheckpoint callback writes
# (same path and filename as in the callbacks cell).
model.load_weights(model_path + 'spacy_attention_quora_q1221.h5')

In [39]:
all_pred = model.predict([X1_all[0:10], X2_all[0:10]])

In [40]:
all_pred.shape

(10, 1)

In [41]:
all_texts1[0:3], all_texts2[0:3], all_pred[0:3]

(0    What is the step by step guide to invest in sh...
 1    What is the story of Kohinoor (Koh-i-Noor) Dia...
 2    How can I increase the speed of my internet co...
 Name: question1, dtype: object,
 0    What is the step by step guide to invest in sh...
 1    What would happen if the Indian government sto...
 2    How can Internet speed be increased by hacking...
 Name: question2, dtype: object,
 array([[ 0.14724822],
        [ 0.34619609],
        [ 0.66504312]], dtype=float32))

In [26]:
#valid_pred[:,0]

# predict test in batches

In [42]:
get_word_ids(list(nlp.pipe(('I am ok.', 'I am good'))),
             max_length=shape[0],
             rnn_encode=settings['gru_encode'],
             tree_truncate=settings['tree_truncate']).shape

(2, 100)

In [43]:
# Read the test CSV (decoded as UTF-8) from the test/ folder under `path`.
test_data = pd.read_csv(path + 'test/test.csv', encoding='utf-8')

In [44]:
#test_data = test_data[0:82]
test_data.shape

(2345796, 3)

In [45]:
def predict_batch(df_data, start, end):
    """Predict duplicate scores for rows start..end-1 of `df_data`.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with 'question1' and 'question2' columns.
    start, end : int
        Positional row range to score, `end` exclusive.

    Returns
    -------
    The output of model.predict for the selected rows.

    Relies on the notebook globals `nlp`, `model`, `shape` and `settings`.
    """
    encoded = []
    for column in ('question1', 'question2'):
        # Read from the frame that was passed in. The previous version ignored
        # `df_data` and always sliced the global `test_data`.
        texts = df_data[column].iloc[start:end].apply(lambda x: unicode(x))
        encoded.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
                                    max_length=shape[0],
                                    rnn_encode=settings['gru_encode'],
                                    tree_truncate=settings['tree_truncate']))
    test_X1, test_X2 = encoded

    return model.predict([test_X1, test_X2], batch_size=1024)

a = predict_batch(test_data, 10, 12)

In [46]:
# Score the test set in chunks of rows so that only one chunk of parsed
# documents is held in memory at a time.
# - The chunk size has its own name so it no longer overwrites the training
#   `batch_size` defined in the hyper-parameter cell.
# - Stepping by chunk size visits exactly ceil(n / chunk) chunks; the previous
#   int(n / size) + 1 scheduled an extra, empty chunk whenever n was an exact
#   multiple of the size.
# - Chunk outputs are collected in a list and stacked once, instead of
#   re-copying the growing result with np.vstack on every iteration.
predict_chunk_size = 10000
nb_test = test_data.shape[0]

batch_preds = []
for start in tqdm(range(0, nb_test, predict_chunk_size)):
    end = min(start + predict_chunk_size, nb_test)
    batch_preds.append(predict_batch(test_data, start, end))

test_pred = np.vstack(batch_preds)


100%|██████████| 235/235 [30:33<00:00,  6.88s/it]


In [47]:
test_data.shape[0], len(test_pred)

(2345796, 2345796)

In [48]:
test_pred[0]

array([ 0.20613281], dtype=float32)

# generate submission file

In [49]:
# Load the sample submission file to use as the template for the output.
df_submission = pd.read_csv(path + 'submission/sample_submission.csv')
print (df_submission.shape)
df_submission.head()

(2345796, 2)


Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [50]:
# Overwrite the template's is_duplicate column with the model's predictions.
# NOTE(review): this assumes the rows of test_pred are in the same order as
# the rows of the sample submission - TODO confirm against test_id.
df_submission['is_duplicate'] = test_pred
df_submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.206133
1,1,0.286344
2,2,0.490187
3,3,0.153228
4,4,0.616452


In [51]:
# Write the submission CSV; index=False leaves the DataFrame index out of the file.
df_submission.to_csv(path + 'submission/spacy_attention_quora_q1221_20170522.csv', index=False)