In [1]:
import os
# Pin the Keras backend to TensorFlow; this has to be in the environment before
# the first `import keras` further down the notebook.
os.environ.update({'KERAS_BACKEND': 'tensorflow'})

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import sys
sys.path.insert(0, '../scripts')
from utils import *

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
# Default size (width, height) applied to every matplotlib figure in this notebook.
plt.rcParams.update({'figure.figsize': (12, 10)})

In [5]:
# Read the cleaned train/test CSVs and discard every row containing a missing value.
train_data, test_data = (
    pd.read_csv(csv_path).dropna(axis=0)
    for csv_path in ('../data/clean-train.csv', '../data/clean-test.csv')
)

In [6]:
# Pull the two question columns of the training frame out as plain Python lists.
q1_list, q2_list = (train_data[column].tolist() for column in ('q1', 'q2'))

In [7]:
# Vocabulary cap handed to the Keras Tokenizer in the next cell.
vocab_size = 10000

In [8]:
# Fit one tokenizer on both question columns so q1 and q2 share a single word index.
# `num_words` is the Keras 2 name for the vocabulary cap; the Keras 1 spelling
# `nb_words` used previously is deprecated, while the rest of this notebook
# already relies on the Keras 2 API (e.g. `epochs=` in fit).
token = Tokenizer(num_words=vocab_size)
token.fit_on_texts(q1_list + q2_list)



In [9]:
# Encode every training question as a sequence of tokenizer ids.
question1_seq, question2_seq = (
    token.texts_to_sequences(questions) for questions in (q1_list, q2_list)
)

In [10]:
# Gather every distinct space-separated token found in either question column.
# The dict is only used for its keys; each value is the placeholder 1.0.
unique_words = dict.fromkeys(
    (piece for question in q1_list + q2_list for piece in question.split(' ')),
    1.,
)

In [11]:
# Value passed as `maxlen` to every pad_sequences call in this notebook.
MAX_SEQUENCE_LENGTH = 25

In [12]:
# Pad/truncate both sets of training sequences with maxlen=MAX_SEQUENCE_LENGTH.
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (question1_seq, question2_seq)
)

In [13]:
import gensim
from gensim.models import word2vec, Word2Vec

In [14]:
def get_vecs(q1, q2):
    """Sum the word vectors of each question in two parallel collections.

    Reads the module-level ``model`` (the loaded word-vector table) at call
    time, so ``model`` must be defined before this function is called.

    Args:
        q1: iterable of questions, each itself an iterable of tokens.
        q2: iterable of questions paired element-wise with ``q1``; pairing
            stops at the shorter of the two (``zip`` semantics).

    Returns:
        A tuple ``(vecs_q1, vecs_q2)`` of NumPy arrays holding one
        300-dimensional summed vector per question. Tokens that are not in
        ``model.vocab`` contribute a zero vector.
    """
    dim = 300

    def _sum_vectors(tokens):
        # Accumulate from a zero vector instead of the builtin sum() default
        # of int 0: a question with no tokens then still produces a (300,)
        # vector rather than the scalar 0, which would make the stacked
        # result ragged.
        total = np.zeros(dim)
        for tok in tokens:
            if tok in model.vocab:
                total = total + model.word_vec(tok)
        return total

    vecs_q1 = []
    vecs_q2 = []

    for tokens_1, tokens_2 in zip(q1, q2):
        vecs_q1.append(_sum_vectors(tokens_1))
        vecs_q2.append(_sum_vectors(tokens_2))

    return np.array(vecs_q1), np.array(vecs_q2)

In [19]:
# Load the pre-trained word vectors from a word2vec-format text file
# (presumably GloVe vectors converted to that layout, judging by the filename).
model = gensim.models.KeyedVectors.load_word2vec_format(fname='../data/glove.w2v.txt')

In [20]:
# Build the embedding matrix so that row i holds the pre-trained vector of the
# word whose tokenizer id is i. The ids stored in q1_data/q2_data come from
# `token.word_index`, so the rows must follow that index; enumerating the
# whitespace-split raw words instead puts vectors on rows unrelated to the ids
# the network actually looks up. Row 0 (the value pad_sequences pads with) and
# words without a pre-trained vector stay all-zero.
embed = np.zeros((vocab_size, 300))

for word, idx in token.word_index.items():
    # The tokenizer only emits ids below its vocabulary cap, so words ranked
    # beyond the cap never need a row.
    if idx < vocab_size and word in model.vocab:
        embed[idx, :] = model.word_vec(word)

In [26]:
from parikh import build_model

In [27]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [28]:
# Instantiate the network from parikh.build_model on top of the embedding matrix.
# `maxlen` is taken from MAX_SEQUENCE_LENGTH (instead of repeating the literal 25)
# so the model's sequence length cannot drift from the padding applied to the data.
# NOTE(review): num_class=1 / trainable=True presumably mean a single output unit
# and fine-tuned embeddings; confirm against parikh.build_model.
decomposable_model = build_model(
    embed, num_class=1, maxlen=MAX_SEQUENCE_LENGTH, trainable=True)

In [29]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the metric.
decomposable_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Checkpoint the model only when validation accuracy improves, and stop training
# once validation accuracy has gone two epochs without improving.
callbacks = [
    ModelCheckpoint('../Saved Models/decomposable-trainable', monitor='val_acc', save_best_only=True),
    EarlyStopping(monitor='val_acc', patience=2),
]
# Labels are handed over as a plain ndarray, matching how the test labels are
# extracted, rather than as a pandas Series whose index may no longer be
# contiguous after dropna().
decomposable_model.fit(
    [q1_data, q2_data],
    train_data['is_same'].values,
    validation_split=0.1,
    epochs=100,
    callbacks=callbacks)

Train on 291067 samples, validate on 32341 samples
Epoch 1/100
 41152/291067 [===>..........................] - ETA: 40:44 - loss: 0.5329 - acc: 0.7299

In [None]:
# Push the held-out questions through the same steps as the training questions:
# reuse the already-fitted tokenizer, then pad with the same maxlen.
q1_list_test, q2_list_test = (test_data[column].tolist() for column in ('q1', 'q2'))

question1_seq_test, question2_seq_test = (
    token.texts_to_sequences(questions)
    for questions in (q1_list_test, q2_list_test)
)

q1_data_test, q2_data_test = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (question1_seq_test, question2_seq_test)
)

# Ground-truth labels of the test split as a NumPy array.
test_labels = test_data['is_same'].values

In [None]:
# Score every test question pair with the trained model.
preds = decomposable_model.predict(x=[q1_data_test, q2_data_test])

In [None]:
# Threshold the predicted scores at 0.5 to get hard 0/1 labels as a flat int array.
classes = np.ravel(preds > 0.5).astype('int')

In [None]:
# Hand the true labels, the hard predictions and the raw scores (all flattened
# to 1-D) to generate_report. That helper is not defined in this notebook;
# presumably it arrives via `from utils import *`.
generate_report(
    test_labels,
    classes.flatten(),
    preds.flatten(),
)