In [1]:
import sys, os
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Lambda, TimeDistributed, Dense, BatchNormalization, Merge, Concatenate
from keras.layers.merge import concatenate
from keras import backend as K
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
cur_dir=os.getcwd()
data_dir=os.path.join(cur_dir, 'data')
save_dir=os.path.join(data_dir, 'save')
model_dir=os.path.join(os.path.dirname(cur_dir), 'models')
glove_file=os.path.join(model_dir, 'glove.6B/glove.6B.300d.txt')
if not os.path.exists(save_dir): os.mkdir(save_dir)
    
max_nb_words=200000
max_seq_length=25
embedding_dim=300
val_split=0.1
test_split=0.1

In [3]:
df=pd.read_csv(os.path.join(data_dir, 'train.csv'))
df['question1']=df['question1'].apply(str)
df['question2']=df['question2'].apply(str)

In [4]:
questions=list(df['question1'])+list(df['question2'])
tokenizer=Tokenizer(max_nb_words)
tokenizer.fit_on_texts(questions)
q1_seqs=tokenizer.texts_to_sequences(list(df['question1']))
q2_seqs=tokenizer.texts_to_sequences(list(df['question2']))
word_index=tokenizer.word_index


In [5]:
nb_words=min(max_nb_words, len(word_index))

In [6]:
embeddings_index={}
f=open(glove_file, encoding='utf-8')
for line in tqdm(f):
    values=line.split(' ')
    word=values[0]
    coefs=np.asarray(values[1:], dtype='float32')
    embeddings_index[word]=coefs
f.close()

400000it [00:37, 10732.98it/s]


In [7]:
embedding_matrix=np.zeros((nb_words+1, embedding_dim))
for word, i in word_index.items():
    if i>max_nb_words:
        continue
    embedding_vector=embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector
        
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 35260


In [13]:
pickle.dump(embedding_matrix, open('glovemat6B.300d.pickle', 'wb'))

In [6]:
embedding_matrix=pickle.load(open('glovemat840B.300d.pickle', 'rb'))
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 29276


In [7]:
q1_data=pad_sequences(q1_seqs, maxlen=max_seq_length)
q2_data=pad_sequences(q2_seqs, maxlen=max_seq_length)
labels=np.asarray(df['is_duplicate'], dtype=int)
print('Q1 shape: ', q1_data.shape)
print('Q2 shape: ', q2_data.shape)
print('Labels shape: ', labels.shape)

Q1 shape:  (404290, 25)
Q2 shape:  (404290, 25)
Labels shape:  (404290,)


In [9]:
X=np.stack((q1_data, q2_data), axis=1)
y=labels
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=val_split, random_state=13)
Q1_train=X_train[:,0]
Q2_train=X_train[:,1]
Q1_test=X_test[:,0]
Q2_test=X_test[:,1]

In [8]:
Q1=Sequential()
Q1.add(Embedding(nb_words+1, 
                 embedding_dim, 
                 weights=[embedding_matrix], 
                 input_length=max_seq_length, 
                 trainable=False))
Q1.add(TimeDistributed(Dense(embedding_dim, activation='relu')))
Q1.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim, )))

Q2=Sequential()
Q2.add(Embedding(nb_words+1, 
                 embedding_dim, 
                 weights=[embedding_matrix], 
                 input_length=max_seq_length, 
                 trainable=False))
Q2.add(TimeDistributed(Dense(embedding_dim, activation='relu')))
Q2.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim, )))

In [9]:
model=Sequential()
model.add(Merge([Q1, Q2], mode='concat'))
model.add(BatchNormalization())
model.add(Dense(200,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(200,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(200,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1,activation='sigmoid'))

  from ipykernel import kernelapp as app


In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 600)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 600)               2400      
_________________________________________________________________
dense_3 (Dense)              (None, 200)               120200    
_________________________________________________________________
batch_normalization_2 (Batch (None, 200)               800       
_________________________________________________________________
dense_4 (Dense)              (None, 200)               40200     
_________________________________________________________________
batch_normalization_3 (Batch (None, 200)               800       
_________________________________________________________________
dense_5 (Dense)              (None, 200)               40200     
__________

In [11]:
loss='binary_crossentropy'
optimizer='adam'
metrics=['accuracy']
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [12]:
model.load_weights("W4-{'val_loss': [0.37383292400570683], 'val_acc': [0.82675802024921841], 'loss': [0.32894619806340158], 'acc': [0.85018729679789484]}.h5")

In [None]:
model.fit([q1_data, q2_data], labels, epochs=1, validation_split=0.1)

In [13]:
dft=pd.read_csv(os.path.join(data_dir, 'test.csv'))
dft['question1']=dft['question1'].apply(str)
dft['question2']=dft['question2'].apply(str)

In [16]:
test_id=dft['test_id']

In [14]:
tq1_seqs=tokenizer.texts_to_sequences(list(dft['question1']))
tq2_seqs=tokenizer.texts_to_sequences(list(dft['question2']))

In [15]:
tq1_data=pad_sequences(tq1_seqs, maxlen=max_seq_length)
tq2_data=pad_sequences(tq2_seqs, maxlen=max_seq_length)

In [31]:
preds=model.predict_classes([tq1_data, tq2_data])
preds=np.apply_along_axis(int, -1, preds)
out_df = pd.DataFrame({"test_id":test_id[:100], "is_duplicate":preds})
out_df.to_csv("s1.csv", index=False)



In [30]:
np.apply_along_axis(int, -1, preds)

(100,)