In [1]:
import pandas as pd
import numpy as np
import keras
from keras.layers import Embedding, Input, LSTM, Dense, Subtract
from keras.models import Model
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
train_data[train_data.isnull().any(axis=1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [5]:
train_data = train_data.drop(train_data.index[[105780, 201841, 363362]])

In [6]:
test_data[test_data.isnull().any(axis=1)]

Unnamed: 0,test_id,question1,question2
379205,379205,How I can learn android app development?,
817520,817520,How real can learn android app development?,
943911,943911,How app development?,
1046690,1046690,,How I what can learn android app development?
1270024,1270024,How I can learn app development?,
1461432,1461432,,How distinct can learn android app development?


In [7]:
test_data = test_data.drop(test_data.index[[379205, 817520, 943911, 1046690, 1270024, 1461432]])

In [8]:
ques1_train = train_data['question1'].tolist()
ques2_train = train_data['question2'].tolist()
labels_train = train_data['is_duplicate']
num_samples_train = len(ques1_train)
print("Total number of valid train samples: %s" % num_samples_train)

Total number of valid train samples: 404287


In [9]:
ques1_test = test_data['question1'].tolist()
ques2_test = test_data['question2'].tolist()
num_samples_test = len(ques1_test)
print("Total number of valid test samples: %s" % num_samples_test)

Total number of valid test samples: 2345790


In [10]:
texts = ques1_train + ques2_train + ques1_test + ques2_test
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('Number of sentences tokenizer was trained on: %s' % tokenizer.document_count)

Found 137041 unique tokens.
Number of sentences tokenizer was trained on: 5500154


In [11]:
max_seq_len = 15
data = pad_sequences(sequences, maxlen=max_seq_len)
ques1_train = data[:num_samples_train]
ques2_train = data[num_samples_train: 2*num_samples_train]
ques1_test = data[2* num_samples_train: 2*num_samples_train + num_samples_test]
ques2_test = data[2*num_samples_train + num_samples_test:]

In [12]:
embeddings_index = {}
f = open('glove.42B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))


Found 1917494 word vectors.


In [13]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, 
                            EMBEDDING_DIM, 
                            weights=[embedding_matrix], 
                            input_length=max_seq_len, 
                            trainable=False)

In [14]:
ques1_input = Input(shape=(max_seq_len,), dtype='int32')
ques1_embedded = embedding_layer(ques1_input)
ques1_output = LSTM(50)(ques1_embedded)

ques2_input = Input(shape=(max_seq_len,), dtype='int32')
ques2_embedded = embedding_layer(ques2_input)
ques2_output = LSTM(50)(ques2_embedded)

difference_tensor = Subtract()([ques1_output, ques2_output])
predictions = Dense(1, activation='sigmoid')(difference_tensor)

model = Model(inputs=[ques1_input, ques2_input], outputs=predictions)

In [15]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
model.fit([ques1_train, ques2_train], np.array(labels_train), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1cb7ada588>

In [17]:
preds_test = model.predict([ques1_test, ques2_test])

In [20]:
print(preds_test[:,0].shape)

(2345790,)


In [22]:
result_df = pd.DataFrame({'test_id': test_data['test_id'], 'is_duplicate':preds_test[:,0]}, 
                         columns=['test_id', 'is_duplicate'])
result_df.to_csv('submissions.csv', header=['test_id', 'is_duplicate'], index=False)