In [1]:
from __future__ import division, unicode_literals, print_function
import spacy

import plac
from pathlib import Path
import ujson as json
import numpy
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline

from keras_decomposable_attention import build_model

try:
    import cPickle as pickle
except ImportError:
    import pickle
    
import pandas as pd
import numpy as np

from tqdm import tqdm
#import sys
#reload(sys)  
#sys.setdefaultencoding('utf8')

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
import bcolz
def save_array(fname, arr):
    """Persist `arr` as a bcolz carray rooted at directory `fname`.

    The carray is opened with mode='w' and flushed before returning.
    Returns None.
    """
    carray = bcolz.carray(arr, rootdir=fname, mode='w')
    carray.flush()
def load_array(fname):
    """Open the bcolz array stored at `fname` and return its full slice (`[:]`)."""
    stored = bcolz.open(fname)
    return stored[:]

In [3]:
# Encoding switches: forwarded to get_word_ids (gru_encode is passed as
# `rnn_encode`) and to build_model through `settings`.
tree_truncate = False
gru_encode = False

# Sizes. max_length is the per-sentence word-id length given to get_word_ids.
max_length = 100
nr_hidden = 100
# Presumably the number of output classes: it matches the three SNLI labels
# and the (N, 3) prediction shape -- confirm against build_model.
nr_class = 3

# Training hyper-parameters (used by model.fit via `settings`).
dropout = 0.2
learn_rate = 0.001
batch_size = 100
nr_epoch = 100

# Shape tuple handed to build_model; shape[0] is also used as max_length.
shape = (max_length, nr_hidden, nr_class)

settings = {
    'lr': learn_rate,
    'dropout': dropout,
    'batch_size': batch_size,
    'nr_epoch': nr_epoch,
    'tree_truncate': tree_truncate,
    'gru_encode': gru_encode
}

In [4]:
# Alternative SNLI locations (presumably 1000-example subsets, judging by the
# directory names); uncomment these instead of the full set for quick runs.
#train_loc = '/cinc/data/snli/train_1000/snli_1.0_train.jsonl'
#dev_loc   = '/cinc/data/snli/dev_1000/snli_1.0_dev.jsonl'

# SNLI train / dev files read by read_snli.
train_loc = '/cinc/data/snli/train/snli_1.0_train.jsonl'
dev_loc   = '/cinc/data/snli/dev/snli_1.0_dev.jsonl'

# `path`: root of the Quora question-pairs data (test set and submission files).
# `save_path`: where the encoded SNLI arrays are cached.
# `model_path`: where the best model checkpoint is written.
# NOTE(review): these are absolute, machine-specific paths.
path = '/cinc/data/quora-question-pairs/'
save_path = '/cinc/data/snli/save_data/'
model_path = '/cinc/data/snli/save_data/model/'

# functions

In [6]:
# Class index of each SNLI gold label; the index is also the column of the
# one-hot label matrix returned by read_snli.
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
def read_snli(path):
    """Read sentence pairs and one-hot labels from an SNLI ``.jsonl`` file.

    Each non-blank line is parsed as a JSON object with the keys
    ``sentence1``, ``sentence2`` and ``gold_label``. Examples whose
    ``gold_label`` is the placeholder ``'-'`` are skipped.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        ``(texts1, texts2, labels)`` where ``texts1`` and ``texts2`` are lists
        of sentences and ``labels`` is a one-hot matrix with one row per kept
        example and ``len(LABELS)`` columns.

    Raises:
        KeyError: if a line has a ``gold_label`` that is not in ``LABELS``
            (other than ``'-'``) or lacks one of the expected keys.
    """
    texts1 = []
    texts2 = []
    labels = []
    with open(path) as file_:
        for line in file_:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            eg = json.loads(line)
            label = eg['gold_label']
            if label == '-':
                continue
            texts1.append(eg['sentence1'])
            texts2.append(eg['sentence2'])
            labels.append(LABELS[label])
    label_ids = numpy.asarray(labels, dtype='int32')
    # Pin the number of classes instead of letting to_categorical infer it
    # from the data: a file that lacks the highest label (or is empty) would
    # otherwise give a matrix of the wrong width (or raise). Passed
    # positionally because the keyword name differs between Keras versions.
    return texts1, texts2, to_categorical(label_ids, len(LABELS))

In [7]:
# Load spaCy's 'en' pipeline. It is used below to process texts (nlp.pipe)
# and to supply the vocabulary passed to get_embeddings.
nlp = spacy.load('en')

In [8]:
# Fail fast if the loaded pipeline has no `path` set
# (presumably meaning the model data was not found -- TODO confirm).
assert nlp.path is not None

# load data

In [8]:
# Parse the SNLI training file into sentence1 texts, sentence2 texts and
# one-hot labels.
train_texts1, train_texts2, train_labels = read_snli(train_loc)

In [9]:
# Sanity check: the three collections should have equal length; then show the
# first training example with its one-hot label.
print (len(train_texts1), len(train_texts2), len(train_labels))
train_texts1[0], train_texts2[0], train_labels[0]

549367 549367 549367


(u'A person on a horse jumps over a broken down airplane.',
 u'A person is training his horse for a competition.',
 array([ 0.,  0.,  1.]))

In [10]:
# Parse the SNLI dev file the same way; it is used as validation data in fit.
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

In [11]:
# Sanity check: equal lengths, then show the first dev example and its label.
print (len(dev_texts1), len(dev_texts2), len(dev_labels))
dev_texts1[0], dev_texts2[0], dev_labels[0]

9842 9842 9842


(u'Two women are embracing while holding to go packages.',
 u'The sisters are hugging goodbye while holding to go packages after just eating lunch.',
 array([ 0.,  0.,  1.]))

In [14]:
# Run every text collection through spaCy and convert the parsed docs to
# word-id matrices, in the order: train side 1, train side 2, dev side 1,
# dev side 2.
Xs = [
    get_word_ids(
        list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
        max_length=shape[0],
        rnn_encode=settings['gru_encode'],
        tree_truncate=settings['tree_truncate'],
    )
    for texts in (train_texts1, train_texts2, dev_texts1, dev_texts2)
]
train_X1, train_X2, dev_X1, dev_X2 = Xs

In [19]:
# Each encoded matrix should be (number of examples, max_length).
train_X1.shape, train_X2.shape, dev_X1.shape, dev_X2.shape

((549367, 100), (549367, 100), (9842, 100), (9842, 100))

In [12]:
# save for later
save_array(save_path + 'train_X1', train_X1)
save_array(save_path + 'train_X2', train_X2)
save_array(save_path + 'train_labels', train_labels)

save_array(save_path + 'dev_X1', dev_X1)
save_array(save_path + 'dev_X2', dev_X2)
save_array(save_path + 'dev_labels', dev_labels)

In [12]:
# load back
train_X1 = load_array(save_path + 'train_X1')
train_X2 = load_array(save_path + 'train_X2')
train_labels = load_array(save_path + 'train_labels')

dev_X1 = load_array(save_path + 'dev_X1')
dev_X2 = load_array(save_path + 'dev_X2')
dev_labels = load_array(save_path + 'dev_labels')


# build model

In [9]:
# Build the model (from keras_decomposable_attention) using the embeddings
# derived from spaCy's vocabulary, the `shape` tuple and the `settings` dict.
model = build_model(get_embeddings(nlp.vocab), shape, settings)

In [15]:
# Stop training once val_loss has gone `patience` epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=0)

# Write a checkpoint only when val_loss improves, so the file on disk is
# always the best model seen so far.
best_checkpoint = ModelCheckpoint(
    model_path + 'spacy_attention_snli.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=0
)

callbacks = [early_stopping, best_checkpoint]

In [16]:
# Train on the SNLI training pairs and validate on the dev pairs after every
# epoch. settings['nr_epoch'] is only an upper bound: the EarlyStopping
# callback can end training sooner, and ModelCheckpoint keeps the best weights.
# NOTE(review): `nb_epoch` is the Keras 1.x keyword (later versions call it
# `epochs`) -- confirm against the installed Keras before upgrading.
model.fit(
    [train_X1, train_X2],
    train_labels,
    validation_data=([dev_X1, dev_X2], dev_labels),
    nb_epoch=settings['nr_epoch'],
    batch_size=settings['batch_size'],
    callbacks=callbacks,
    verbose=2
)

Train on 549367 samples, validate on 9842 samples
Epoch 1/100
674s - loss: 0.7730 - acc: 0.6607 - val_loss: 0.6204 - val_acc: 0.7465
Epoch 2/100
683s - loss: 0.6702 - acc: 0.7210 - val_loss: 0.6172 - val_acc: 0.7466
Epoch 3/100
681s - loss: 0.6488 - acc: 0.7321 - val_loss: 0.5911 - val_acc: 0.7597
Epoch 4/100
685s - loss: 0.6359 - acc: 0.7378 - val_loss: 0.5884 - val_acc: 0.7668
Epoch 5/100
687s - loss: 0.6265 - acc: 0.7421 - val_loss: 0.5701 - val_acc: 0.7677
Epoch 6/100
686s - loss: 0.6202 - acc: 0.7449 - val_loss: 0.5552 - val_acc: 0.7748
Epoch 7/100
687s - loss: 0.6146 - acc: 0.7477 - val_loss: 0.5471 - val_acc: 0.7776
Epoch 8/100
678s - loss: 0.6093 - acc: 0.7506 - val_loss: 0.5477 - val_acc: 0.7810
Epoch 9/100
687s - loss: 0.6081 - acc: 0.7509 - val_loss: 0.5467 - val_acc: 0.7779
Epoch 10/100
680s - loss: 0.6051 - acc: 0.7518 - val_loss: 0.5463 - val_acc: 0.7797
Epoch 11/100
669s - loss: 0.6035 - acc: 0.7523 - val_loss: 0.5493 - val_acc: 0.7780
Epoch 12/100
676s - loss: 0.6031 - 

<keras.callbacks.History at 0x7f4f33b2b850>

In [19]:
#model.save_weights(save_path + 'model/spacy_attention.h5')

In [10]:
# Restore the best checkpoint. The path is built from `model_path`, exactly
# as the ModelCheckpoint callback builds it, so the file that is loaded is
# always the file that training wrote.
model.load_weights(model_path + 'spacy_attention_snli.h5')

In [13]:
# Per-class scores for every dev pair; columns follow the LABELS indices.
valid_pred = model.predict([dev_X1, dev_X2])

In [14]:
# Expect one row per dev example and one column per label.
valid_pred.shape

(9842, 3)

In [16]:
# Spot-check one dev pair next to its predicted scores.
dev_texts1[1], dev_texts2[1], valid_pred[1]

array([ 0.95089561,  0.02457028,  0.02453408], dtype=float32)

In [26]:
#valid_pred[:,0]

# predict in batches

In [17]:
# Smoke test: two short sentences should encode to a (2, max_length) matrix.
get_word_ids(list(nlp.pipe(('I am ok.', 'I am good'))),
             max_length=shape[0],
             rnn_encode=settings['gru_encode'],
             tree_truncate=settings['tree_truncate']).shape

(2, 100)

In [18]:
# Quora question-pairs test set, decoded as UTF-8. predict_batch reads its
# 'question1' and 'question2' columns.
test_data = pd.read_csv(path + 'test/test.csv', encoding='utf-8')

In [19]:
# Debug aid: uncomment to work on a small slice. Note that it overwrites
# `test_data`, so re-run the read_csv cell to get the full set back.
#test_data = test_data[0:82]
print (test_data.shape)

(2345796, 3)


In [20]:
def predict_batch(df_data, start, end):
    """Predict label scores for rows ``start:end`` of a question-pair frame.

    Args:
        df_data: DataFrame with 'question1' and 'question2' columns.
        start: first row of the slice (inclusive).
        end: end of the slice (exclusive).

    Returns:
        The output of ``model.predict`` for the encoded question pairs
        (presumably one row per pair and one column per LABELS index).

    Non-string cells (e.g. missing values) are coerced to text, so a NaN
    question is encoded as the literal string 'nan'.
    """
    # Python 2 has a separate `unicode` type; on Python 3 `str` is the text type.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    Xs_test = []
    for column in ('question1', 'question2'):
        # Read from the frame passed in (not the notebook-level `test_data`)
        # so the function works on any frame with the same columns.
        texts = df_data[column][start:end].apply(to_text)
        Xs_test.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
                         max_length=shape[0],
                         rnn_encode=settings['gru_encode'],
                         tree_truncate=settings['tree_truncate']))
    test_X1, test_X2 = Xs_test

    return model.predict([test_X1, test_X2], batch_size=1024)

# Smoke test on a two-row slice (rows 10 and 11) before the full run.
a = predict_batch(test_data, 10, 12)

In [21]:
#test_pred = []
batch_size = 10000
for i in tqdm(range(int(test_data.shape[0] / batch_size)+1)):
    start = i*batch_size
    end   = min((i+1)*batch_size, test_data.shape[0])
    #print ('proceccinng: ', i, start, end)
    batch_pred = predict_batch(test_data, start, end)
    if (i == 0):
        test_pred = batch_pred
    else:
        test_pred = np.vstack( (test_pred, batch_pred) )


100%|██████████| 235/235 [59:36<00:00, 13.25s/it]


In [22]:
# The number of predictions must equal the number of test rows.
test_data.shape[0], len(test_pred)

(2345796, 2345796)

In [23]:
# Column 0 corresponds to 'entailment' (LABELS['entailment'] == 0).
test_pred[:,0]

array([ 0.15557578,  0.82303774,  0.94565886, ...,  0.31952131,
        0.75785476,  0.42257354], dtype=float32)

# generate submission file

In [24]:
# Start from the sample submission so the output keeps its columns and row
# count; the row count should match test_data.
df_submission = pd.read_csv(path + 'submission/sample_submission.csv')
print (df_submission.shape)
df_submission.head()

(2345796, 2)


Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [25]:
# Use the 'entailment' score (column 0, per LABELS) as the duplicate
# probability. This treats SNLI entailment as a proxy for "same question".
# NOTE(review): values are assigned by position, which assumes the sample
# submission rows are in the same order as test.csv -- confirm.
df_submission['is_duplicate'] = test_pred[:,0]
df_submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.155576
1,1,0.823038
2,2,0.945659
3,3,0.75488
4,4,0.756286


In [26]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
df_submission.to_csv(path + 'submission/spacy_attention_snli_20170512.csv', index=False)