In [1]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, AveragePooling1D
from keras.datasets import imdb
import common
import time
import numpy as np
import pickle

Using TensorFlow backend.


In [2]:
# set parameters:
maxlen = 7
embedding_dims = 300
filters = 80
kernel_size_1 = 3
kernel_size_2 = maxlen-((kernel_size_1-1)/2) # will not work for even kernel_size_1
hidden_dims = 250
batch_size = 32
epochs = 20

In [3]:
google_model = common.load_model("google")

In [6]:
def generate_training_sample(file_name, num_records,is_training_data=True):
    if num_records!=0:
        lines = np.array(common.read_lines_from_file(file_name))
        sampled_lines = lines[np.random.randint(len(lines), size = num_records), :]
    else:
        sampled_lines = np.array(common.read_lines_from_file(file_name))
    return common.convert_lines_to_question_pairs(sampled_lines.tolist(),is_training_data)

def sentence2vec(words_in_sentence, model):
    array_of_vectors = map(lambda (x): common.vec(x, model), words_in_sentence)
    filtered = np.array([x for x in array_of_vectors if len(x) != 0])
    return filtered.transpose()

def get_embedded_sentence(question_pairs):
    dataset_x = []
    dataset_y = []
    for ind, question_pair in enumerate(question_pairs):
        v1 = sentence2vec(question_pair.question_1, google_model)
        v2 = sentence2vec(question_pair.question_2, google_model)
        v3 = sequence.pad_sequences(v1.tolist(), dtype='float', maxlen=maxlen)
        v4 = sequence.pad_sequences(v2.tolist(), dtype='float', maxlen=maxlen)
        if(len(v3)==embedding_dims and len(v4)==embedding_dims):
            v5 = np.concatenate((v3,v4),axis=1)
            dataset_x.append(v5.transpose())
            dataset_y.append(question_pair.is_duplicate)
    return (np.array(dataset_x),np.array(dataset_y))

def build_model():
    print('Building model...')
    model = Sequential()

    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters, kernel_size_1, padding='valid', activation='relu', strides=1, input_shape=(2*maxlen,embedding_dims)))
    model.add(Dropout(0.3))

    # TODO might want to add a pooling layer
    model.add(Conv1D(10, kernel_size_2, padding='valid', activation='relu', strides=kernel_size_2))
    model.add(Dropout(0.3))
    # using global max pooling:
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [78]:
startTime = time.time()
train_qn_pairs = pickle.load(open( "data/train_qn_pairs.p", "rb" ))
endTime = time.time()
print ("Loaded Pickle : %.2f min" % ((endTime - startTime)/60))

Loaded Pickle : 1.20 min


In [8]:
# training_questions = generate_training_sample("data/train_sample.csv", 0)
training_questions_1 = generate_training_sample("data/train_cleaned.csv", 40000)

In [9]:
startTime = time.time()
x_train, y_train = get_embedded_sentence(training_questions_1)
endTime = time.time()
print (endTime - startTime)
print (x_train.shape)

93.987071991
(39900, 14, 300)


In [10]:
model = build_model()
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=8,
          validation_split=0.3)

Building model...
Train on 27930 samples, validate on 11970 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x21ba39dd0>

In [12]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import RMSprop
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def cosine_distance(vects):
    x, y = vects
    # define the cosine function
    return 

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def get_embedded_sentence(question_pairs):
    dataset_x_1 = []
    dataset_x_2 = []
    dataset_y = []
    for ind, question_pair in enumerate(question_pairs):
        v1 = sentence2vec(question_pair.question_1, google_model)
        v2 = sentence2vec(question_pair.question_2, google_model)
        v3 = sequence.pad_sequences(v1.tolist(), dtype='float', maxlen=maxlen)
        v4 = sequence.pad_sequences(v2.tolist(), dtype='float', maxlen=maxlen)
        if(len(v3)==embedding_dims and len(v4)==embedding_dims):
            dataset_x_1.append(v3.transpose())
            dataset_x_2.append(v4.transpose())
            dataset_y.append(question_pair.is_duplicate)
    return (np.array(dataset_x_1),np.array(dataset_x_1),np.array(dataset_y))

def create_base_network():
    '''Base network to be shared (eq. to feature extraction).
    '''
    model = Sequential()

    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters, kernel_size_1, padding='valid', activation='relu', strides=1, input_shape=(maxlen,embedding_dims)))
    model.add(Dropout(0.3))

#     # TODO might want to add a pooling layer
#     model.add(Conv1D(10, kernel_size_2, padding='valid', activation='relu', strides=kernel_size_2))
#     model.add(Dropout(0.3))
    # using global max pooling:
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    return model


def compute_accuracy(predictions, labels):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()


# network definition
print ("creating base network")
base_network = create_base_network()

input_a = Input(shape=(maxlen,embedding_dims))
input_b = Input(shape=(maxlen,embedding_dims))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

print ("getting training data")
x_train_1, x_train_2, y_train = get_embedded_sentence(training_questions_1)

# train
print ("training")
rms = RMSprop()
# model.compile(loss=contrastive_loss, optimizer=rms)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.fit([x_train_1, x_train_2], y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.3)


creating base network
getting training data
training
Train on 27930 samples, validate on 11970 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x253c2c750>

In [15]:
model.compile(loss=contrastive_loss, optimizer=rms, metrics=['accuracy'])
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
x_tr_1 = x_train_1[:20000]
x_tr_2 = x_train_1[:20000]
y_tr = y_train[:20000]
x_va_1 = x_train_1[20000:30000]
x_va_2 = x_train_1[20000:30000]
y_va = y_train[20000:30000]
x_te_1 = x_train_1[30000:40000]
x_te_2 = x_train_1[30000:40000]
y_te = y_train[30000:40000]
model.fit([x_tr_1, x_tr_2], y_tr,
          batch_size=batch_size,
          epochs=8,
          validation_data=([x_va_1, x_va_2], y_va))

Train on 20000 samples, validate on 10000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x21c3db910>

In [16]:
# compute final accuracy on training and test sets
pred = model.predict([x_tr_1, x_tr_2])
tr_acc = compute_accuracy(pred, y_tr)
pred = model.predict([x_te_1, x_te_2])
te_acc = compute_accuracy(pred, y_te)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

TypeError: cannot perform reduce with flexible type