In [56]:
from __future__ import division, unicode_literals, print_function
import spacy

import plac
from pathlib import Path
import ujson as json
import numpy
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline

from keras_decomposable_attention_quora import build_model

try:
    import cPickle as pickle
except ImportError:
    import pickle
    
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
# Hyper-parameters for the attention model and its training run.
tree_truncate = False
gru_encode = False
max_length = 100
nr_hidden = 100
dropout = 0.2
learn_rate = 0.001
batch_size = 100
nr_epoch = 5

# Handed to build_model: (sequence length, hidden width, 1).
# NOTE(review): the trailing 1 is presumably the number of output units
# (predictions come back with shape (n, 1)) -- confirm in build_model.
shape = (max_length, nr_hidden, 1)

# Snapshot of the values above; later cells read these through `settings`
# rather than through the bare names.
settings = dict(
    lr=learn_rate,
    dropout=dropout,
    batch_size=batch_size,
    nr_epoch=nr_epoch,
    tree_truncate=tree_truncate,
    gru_encode=gru_encode,
)

In [3]:
# SNLI sample files. Nothing in the cells shown here reads these two paths;
# the only visible consumer would be read_snli(), which is never called.
train_loc = '/cinc/data/snli/train_1000/snli_1.0_train.jsonl'
dev_loc   = '/cinc/data/snli/dev_1000/snli_1.0_dev.jsonl'

# Root of the Quora question-pairs data; the train/test CSVs are loaded from
# sub-folders of this directory.
path = '/cinc/data/quora-question-pairs/'
# Only referenced by the commented-out save_weights/load_weights cells.
# NOTE(review): these are absolute, machine-specific paths.
save_path = '/cinc/data/snli/save_data/'

# functions

In [4]:
# Mapping from SNLI gold labels to class indices.
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
def read_snli(path):
    """Read an SNLI ``.jsonl`` file into two text lists and one-hot labels.

    Each non-blank line is parsed as a JSON object with the keys
    ``sentence1``, ``sentence2`` and ``gold_label``. Examples whose gold
    label is ``'-'`` are skipped.

    Parameters
    ----------
    path : str
        Location of the ``.jsonl`` file.

    Returns
    -------
    (texts1, texts2, labels)
        Two parallel lists of sentences and the result of
        ``to_categorical`` on the label indices. The one-hot width is
        pinned to ``len(LABELS)`` so it does not shrink when the file
        happens to contain no example of the highest-numbered class.

    Raises
    ------
    KeyError
        If a gold label other than ``'-'`` is not a key of ``LABELS``.
    """
    texts1 = []
    texts2 = []
    labels = []
    with open(path) as file_:
        for line in file_:
            # A blank (e.g. trailing) line is not valid JSON; skip it.
            if not line.strip():
                continue
            eg = json.loads(line)
            label = eg['gold_label']
            if label == '-':
                continue
            texts1.append(eg['sentence1'])
            texts2.append(eg['sentence2'])
            labels.append(LABELS[label])
    label_ids = numpy.asarray(labels, dtype='int32')
    # The class count is passed positionally because the keyword is spelled
    # differently across Keras versions (nb_classes vs. num_classes).
    return texts1, texts2, to_categorical(label_ids, len(LABELS))

In [5]:
# Load spaCy's English pipeline. Later cells use it to parse the question
# texts (nlp.pipe) and pass its vocab to get_embeddings.
nlp = spacy.load('en')

In [6]:
# Fail fast when the loaded pipeline reports no path.
# NOTE(review): presumably this means the model data was not found -- confirm
# against the spaCy version in use.
assert nlp.path is not None

# load training data

In [11]:
# Quora training pairs live under <path>/train/.
train_csv = path + 'train/train.csv'
train_data = pd.read_csv(train_csv, encoding='utf-8')

In [12]:
# Keep only the first 1000 rows so the rest of the notebook runs quickly.
train_data = train_data.iloc[:1000]
print(train_data.shape)

(1000, 6)


In [19]:
# Number of training rows; the random split below is sized from this.
nb_samples = len(train_data)

In [13]:
# Sanity check on a single pair: show the raw questions, then the padded
# word-id rows that get_word_ids produces for them.
first_q1 = train_data['question1'][0]
first_q2 = train_data['question2'][0]
print(first_q1)
print(first_q2)

first_pair_docs = list(nlp.pipe((unicode(first_q1), unicode(first_q2))))
ids_0 = get_word_ids(
    first_pair_docs,
    max_length=shape[0],
    rnn_encode=settings['gru_encode'],
    tree_truncate=settings['tree_truncate'],
)
ids_0

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?


array([[ 143,   10,    4, 1130,   67, 1130, 4418,    5, 2570,   15,  998,
         287,   15,   22,    1,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [ 143,   10,    4, 1130,   67, 1130, 4418,    5, 2570,   15,  998,
         287,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [15]:
# Coerce every question to a unicode string before handing it to spaCy.
# NOTE(review): a missing value would be stringified here rather than
# dropped -- check whether the data contains any.
all_texts1 = train_data['question1'].apply(unicode)
all_texts2 = train_data['question2'].apply(unicode)
all_texts1[0], all_texts2[0]

(u'What is the step by step guide to invest in share market in india?',
 u'What is the step by step guide to invest in share market?')

In [24]:
# Parse each question column with spaCy and turn it into a matrix of word
# ids, one row per question, max_length columns wide.
Xs_all = [
    get_word_ids(
        list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
        max_length=shape[0],
        rnn_encode=settings['gru_encode'],
        tree_truncate=settings['tree_truncate'],
    )
    for texts in (all_texts1, all_texts2)
]
X1_all, X2_all = Xs_all

In [25]:
X1_all.shape, X2_all.shape

((1000, 100), (1000, 100))

In [31]:
#X1_all[0], X2_all[0]

In [27]:
# Target column: one value per question pair, taken as-is from the CSV.
y_all = train_data['is_duplicate']
# Displayed to confirm the row count matches the encoded inputs.
y_all.shape

(1000,)

# random split

In [23]:
# Boolean mask for the random train/validation split: True (about 80% of
# rows) goes to training, False to validation.
# A fixed seed keeps the split, and every metric computed from it, the same
# across kernel restarts. A private RandomState leaves NumPy's global random
# state untouched.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(nb_samples) < 0.8

In [28]:
# Apply the same boolean mask to both inputs and the target so the rows stay
# aligned; ~msk selects the complement for validation.
X1_train, X1_valid = X1_all[msk], X1_all[~msk]
X2_train, X2_valid = X2_all[msk], X2_all[~msk]
y_train, y_valid = y_all[msk], y_all[~msk]

(X1_train.shape, X2_train.shape, y_train.shape, X1_valid.shape, X2_valid.shape, y_valid.shape)

((814, 100), (814, 100), (814,), (186, 100), (186, 100), (186,))

# build model

In [29]:
# Build the attention model from the embeddings derived from spaCy's vocab,
# the (max_length, nr_hidden, 1) shape tuple and the hyper-parameter dict.
model = build_model(get_embeddings(nlp.vocab), shape, settings)

In [32]:
# Stop training once val_loss has failed to improve for 2 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=0)

# Checkpointing is currently disabled; `model_path` is not defined in the
# cells shown here, so it must be set before re-enabling this:
#ModelCheckpoint(model_path + 'word2vec_dropout.h5', monitor='val_loss', save_best_only=True, verbose=0)
callbacks = [early_stopping]

In [34]:
# Train on the masked-in rows and validate on the rest. The epoch count is an
# upper bound (nr_epoch * 10); the callbacks passed in may end training
# earlier.
fit_options = dict(
    validation_data=([X1_valid, X2_valid], y_valid),
    nb_epoch=settings['nr_epoch'] * 10,
    batch_size=settings['batch_size'],
    callbacks=callbacks,
    verbose=2,
)
model.fit([X1_train, X2_train], y_train, **fit_options)

Train on 814 samples, validate on 186 samples
Epoch 1/50
7s - loss: 0.6603 - acc: 0.6167 - val_loss: 0.6756 - val_acc: 0.6344
Epoch 2/50
7s - loss: 0.6566 - acc: 0.6167 - val_loss: 0.6679 - val_acc: 0.6344
Epoch 3/50
7s - loss: 0.6363 - acc: 0.6167 - val_loss: 0.6648 - val_acc: 0.6344
Epoch 4/50
8s - loss: 0.6235 - acc: 0.6167 - val_loss: 0.6678 - val_acc: 0.6344
Epoch 5/50
7s - loss: 0.6010 - acc: 0.6143 - val_loss: 0.6582 - val_acc: 0.6344
Epoch 6/50
7s - loss: 0.5702 - acc: 0.6216 - val_loss: 0.6466 - val_acc: 0.6344
Epoch 7/50
7s - loss: 0.5186 - acc: 0.6536 - val_loss: 0.6401 - val_acc: 0.6344
Epoch 8/50
8s - loss: 0.4552 - acc: 0.7629 - val_loss: 0.6290 - val_acc: 0.6237
Epoch 9/50
7s - loss: 0.3861 - acc: 0.8366 - val_loss: 0.6181 - val_acc: 0.6398
Epoch 10/50
7s - loss: 0.3067 - acc: 0.8747 - val_loss: 0.6198 - val_acc: 0.6237
Epoch 11/50
8s - loss: 0.2690 - acc: 0.8894 - val_loss: 0.6550 - val_acc: 0.6398
Epoch 12/50
7s - loss: 0.2310 - acc: 0.9017 - val_loss: 0.6707 - val_acc

<keras.callbacks.History at 0x7fd51ba4e890>

In [19]:
#model.save_weights(save_path + 'model/spacy_attention.h5')

In [None]:
#model.load_weights(save_path + 'model/spacy_attention.h5', by_name=True)

In [38]:
# Score every encoded pair. This includes the rows the model was trained on,
# so these predictions are for eyeballing, not a held-out estimate.
all_pred = model.predict([X1_all, X2_all])

In [39]:
all_pred.shape

(1000, 1)

In [42]:
all_texts1[0:3], all_texts2[0:3], all_pred[0:3]

(0    What is the step by step guide to invest in sh...
 1    What is the story of Kohinoor (Koh-i-Noor) Dia...
 2    How can I increase the speed of my internet co...
 Name: question1, dtype: object,
 0    What is the step by step guide to invest in sh...
 1    What would happen if the Indian government sto...
 2    How can Internet speed be increased by hacking...
 Name: question2, dtype: object,
 array([[ 0.23589465],
        [ 0.082427  ],
        [ 0.04486652]], dtype=float32))

In [26]:
#valid_pred[:,0]

# predict test

In [44]:
# Quick check that two ad-hoc sentences encode to one padded row each.
sample_docs = list(nlp.pipe(('I am ok.', 'I am good')))
get_word_ids(
    sample_docs,
    max_length=shape[0],
    rnn_encode=settings['gru_encode'],
    tree_truncate=settings['tree_truncate'],
).shape

(2, 100)

In [48]:
# Quora test pairs live under <path>/test/.
test_csv = path + 'test/test.csv'
test_data = pd.read_csv(test_csv, encoding='utf-8')

In [49]:
# Work on the first 82 rows only. 82 is not a multiple of the prediction
# batch size used further down, so the final batch is a partial one.
test_data = test_data.iloc[:82]
test_data.shape

(82, 3)

# predict in batches

In [50]:
def predict_batch(df_data, start, end):
    """Run the model on rows ``start:end`` of ``df_data``.

    Previously the ``df_data`` argument was ignored and the global
    ``test_data`` was always used; the frame passed in is now the one that is
    read.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with ``question1`` and ``question2`` columns.
    start, end : int
        Positional row bounds, half-open (``end`` is excluded).

    Returns
    -------
    The output of ``model.predict`` for the selected pairs, one row per pair.
    """
    encoded = []
    for column in ('question1', 'question2'):
        # .iloc makes the slice explicitly positional, whatever the index is.
        texts = df_data[column].iloc[start:end].apply(unicode)
        encoded.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
                                    max_length=shape[0],
                                    rnn_encode=settings['gru_encode'],
                                    tree_truncate=settings['tree_truncate']))
    test_X1, test_X2 = encoded

    return model.predict([test_X1, test_X2])

# Smoke test of predict_batch on two rows; `a` is not used by any later cell
# shown here.
a = predict_batch(test_data, 10, 12)

In [58]:
# Score the test set in chunks of `batch_size` rows and stack the results
# into one array with a row per test pair.
# NOTE: this rebinds the notebook-level `batch_size` from the config cell;
# settings['batch_size'] was captured earlier and is unaffected.
batch_size = 10
nb_test_rows = test_data.shape[0]

# Stepping by batch_size yields exactly ceil(n / batch_size) chunks. The old
# int(n / batch_size) + 1 count added an empty trailing batch whenever n was
# an exact multiple of batch_size.
batch_preds = [
    predict_batch(test_data, start, min(start + batch_size, nb_test_rows))
    for start in tqdm(range(0, nb_test_rows, batch_size))
]

if batch_preds:
    # One stack at the end instead of re-copying the array on every batch.
    test_pred = np.vstack(batch_preds)
else:
    # No rows to score. NOTE(review): (0, 1) float32 mirrors the prediction
    # shape and dtype seen elsewhere in this notebook -- confirm.
    test_pred = np.empty((0, 1), dtype='float32')


100%|██████████| 9/9 [00:02<00:00,  3.57it/s]


In [59]:
test_data.shape[0], len(test_pred)

(82, 82)

In [60]:
test_pred[0]

array([ 0.24061327], dtype=float32)