In [1]:
import os
import warnings
from datetime import datetime
from glob import glob

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [2]:
# Root folder of every input file; it already ends with "/" so file names are
# appended to it directly.
DATA_PATH = "./data/"

# Labelled and unlabelled question pairs.
TRAIN_PATH = "{}train.csv".format(DATA_PATH)
TEST_PATH = "{}test.csv".format(DATA_PATH)

# Pretrained embedding tables (word level and character level).
WORD_EMBED_PATH = "{}word_embed.txt".format(DATA_PATH)
CHAR_EMBED_PATH = "{}char_embed.txt".format(DATA_PATH)

# Question lookup table.
QUEST_PATH = "{}question.csv".format(DATA_PATH)

In [3]:
# Pair tables and the question lookup are read with pandas' default CSV settings.
train_data, test_data, question_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, QUEST_PATH)
)

# Embedding files are space-separated, have no header row, and use their first
# column as the row index.
embed_read_options = dict(delimiter=" ", header=None, index_col=0)
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, **embed_read_options)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, **embed_read_options)

# Turn the space-joined token strings into Python lists of tokens.
for token_column in ("words", "chars"):
    question_data[token_column] = question_data[token_column].str.split(" ")

In [4]:
from keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the Keras Tokenizer as num_words (frequency based).
MAX_WORD_NUMS = 10000

# question_data["words"] already holds lists of tokens, so they must be used
# verbatim. lower=False matters because some Keras versions lowercase list input
# when lower=True (the default), which would rename the tokens.
# NOTE(review): this assumes the embedding file is keyed by the exact token
# strings found in question_data["words"] - confirm against the data.
word_tokenizer = Tokenizer(num_words=MAX_WORD_NUMS, lower=False)
word_tokenizer.fit_on_texts(question_data["words"])

Using TensorFlow backend.


In [5]:
# This cell rebinds word_embedding_data from the raw DataFrame to the final
# ndarray. Keep the DataFrame under its own name the first time through so the
# cell can be re-run without failing on `.loc` of an ndarray.
if isinstance(word_embedding_data, pd.DataFrame):
    word_embedding_table = word_embedding_data

# Order the vocabulary by token id explicitly instead of relying on the
# iteration order of the word_index dict: row i of the matrix must be the
# vector of the word whose id is i (ids start at 1).
vocab_words = sorted(word_tokenizer.word_index, key=word_tokenizer.word_index.get)[:MAX_WORD_NUMS]

# Fail loudly when a vocabulary word has no pretrained vector. Depending on the
# pandas version, `.loc` with missing labels either raises or returns NaN rows
# (whose warning is suppressed in this notebook), and NaN rows would poison training.
missing_words = [word for word in vocab_words if word not in word_embedding_table.index]
if missing_words:
    raise KeyError(
        "%d vocabulary words have no pretrained vector, e.g. %r"
        % (len(missing_words), missing_words[:5])
    )

word_embedding_data = np.concatenate(
    (
        # Row 0: all-zero vector for id 0, which no word uses.
        np.zeros(shape=(1, word_embedding_table.shape[1]), dtype=np.float64),
        word_embedding_table.loc[vocab_words].values
    ),
    axis=0
)
word_embedding_data.shape

(10001, 300)

In [6]:
from keras.preprocessing.sequence import pad_sequences

# Fixed number of word ids per question: gen_data pads/truncates every sequence
# to this length, and the model inputs, the Embedding layer and the pooling
# window are all sized with it.
WORD_SEQ_LEN = 30

def gen_data(data):
    """Turn a table of question pairs into two padded word-id matrices.

    Args:
        data: DataFrame with "q1" and "q2" columns holding question ids that
            are looked up in question_data["qid"].

    Returns:
        Tuple (q1_matrix, q2_matrix). Each matrix has one row per row of
        ``data`` (same order) and WORD_SEQ_LEN columns; shorter sequences are
        padded and longer ones truncated at the front.

    Raises:
        ValueError: if a question id matches no row, or more than one row,
            of question_data.
    """
    def _encode(qid_column):
        # A left merge yields exactly one output row per input row, in input
        # order, only when every qid is unique in question_data.
        merged = data.merge(question_data, how="left", left_on=qid_column, right_on="qid")
        if len(merged) != len(data):
            raise ValueError(
                "question_data has duplicate qid values: merging on %r changed the row count from %d to %d"
                % (qid_column, len(data), len(merged))
            )

        words = merged["words"]
        unmatched = words.isnull()
        if unmatched.any():
            # Without this check the NaN would only surface as an obscure
            # failure inside the tokenizer.
            raise ValueError(
                "%d value(s) of %r have no usable 'words' entry in question_data"
                % (int(unmatched.sum()), qid_column)
            )

        sequences = word_tokenizer.texts_to_sequences(words)
        return pad_sequences(sequences, maxlen=WORD_SEQ_LEN, padding="pre", truncating="pre")

    return _encode("q1"), _encode("q2")

# Padded id matrices for both questions of every labelled (train) pair ...
word1, word2 = gen_data(train_data)
# ... and of every unlabelled (test) pair.
test_word1, test_word2 = gen_data(test_data)

# Display the four shapes as a sanity check.
tuple(matrix.shape for matrix in (word1, word2, test_word1, test_word2))

((254386, 30), (254386, 30), (172956, 30), (172956, 30))

In [7]:
# Values of the training "label" column, in train_data row order.
label = train_data["label"].values

In [8]:
from keras.layers import Input, Embedding, Conv1D, GlobalAveragePooling1D, MaxPool1D, Lambda, Dropout, BatchNormalization, Dense, Flatten, K
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate

# Filter count of each convolution branch (one branch per kernel size 1..6).
CONV_LEN_1 = CONV_LEN_2 = CONV_LEN_3 = CONV_LEN_4 = CONV_LEN_5 = CONV_LEN_6 = 128

# Total width of one question's concatenated branch features.
CONV_LEN = sum((CONV_LEN_1, CONV_LEN_2, CONV_LEN_3, CONV_LEN_4, CONV_LEN_5, CONV_LEN_6))

# Classifier head.
DROP_RATE = 0.6
DENSE_SIZE = 300

# Training schedule.
BATCH_SIZE = 2048
NUM_EPOCHES = 50

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled pairs as a dev set. The split is seeded so that
# re-running the notebook trains and validates on the same rows; without a seed
# every run sees a different split and val_loss is not comparable across runs.
train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(
    word1, word2, train_data["label"].values,
    test_size=0.2,
    random_state=2018
)

# Two integer-id inputs of length WORD_SEQ_LEN, one per question of a pair.
word_input1, word_input2 = (
    Input(shape=(WORD_SEQ_LEN,), dtype="int32") for _ in range(2)
)

# A single frozen embedding layer, initialised from the pretrained matrix and
# applied to both inputs so the two questions share the same lookup.
vocab_rows, embed_dim = word_embedding_data.shape
embedding_layer = Embedding(
    input_dim=vocab_rows,
    output_dim=embed_dim,
    weights=[word_embedding_data],
    input_length=WORD_SEQ_LEN,
    trainable=False,
)

word_vector1, word_vector2 = embedding_layer(word_input1), embedding_layer(word_input2)

def cnn_layer(input1, input2, kernel_size, filters):
    """Run one shared Conv1D over both inputs and max-pool each result.

    The same convolution layer (hence the same weights) is applied to
    ``input1`` and ``input2``. Each output is max-pooled with a window and
    stride of WORD_SEQ_LEN and then flattened.

    Args:
        input1: embedded sequence tensor of the first question.
        input2: embedded sequence tensor of the second question.
        kernel_size: width of the convolution window.
        filters: number of convolution filters.

    Returns:
        Tuple ``(features1, features2)`` with the flattened pooled features
        of ``input1`` and ``input2`` respectively.
    """
    shared_conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")

    def encode(sequence):
        # Every call builds its own pooling and flatten layers; only the
        # convolution is shared between the two questions.
        feature_map = shared_conv(sequence)
        pooled = MaxPool1D(pool_size=WORD_SEQ_LEN, strides=WORD_SEQ_LEN, padding="same")(feature_map)
        return Flatten()(pooled)

    return encode(input1), encode(input2)

# One shared-convolution branch per kernel size 1..6. The "a" tensors belong to
# the first question, the "b" tensors to the second.
branch_filters = (CONV_LEN_1, CONV_LEN_2, CONV_LEN_3, CONV_LEN_4, CONV_LEN_5, CONV_LEN_6)
(
    (conv1a, conv1b), (conv2a, conv2b), (conv3a, conv3b),
    (conv4a, conv4b), (conv5a, conv5b), (conv6a, conv6b),
) = [
    cnn_layer(word_vector1, word_vector2, kernel_size=width, filters=n_filters)
    for width, n_filters in enumerate(branch_filters, start=1)
]

# Per-question feature vectors: all branches side by side.
merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])

# Pair features: element-wise absolute difference and element-wise product.
diff = Lambda(lambda pair: K.abs(pair[0] - pair[1]))([merge_a, merge_b])
mult = Lambda(lambda pair: pair[0] * pair[1])([merge_a, merge_b])
merge = concatenate([diff, mult])

# Classifier head: (dropout -> batch norm -> dense) followed by
# (dropout -> batch norm -> sigmoid unit).
hidden = Dropout(DROP_RATE)(merge)
hidden = BatchNormalization()(hidden)
hidden = Dense(DENSE_SIZE, activation="relu")(hidden)

hidden = Dropout(DROP_RATE)(hidden)
hidden = BatchNormalization()(hidden)
pred = Dense(1, activation="sigmoid")(hidden)

model = Model(inputs=[word_input1, word_input2], outputs=pred)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])

# Tag every checkpoint of this run with its start time, so that files left in
# ./log by earlier runs can be told apart from the ones written now.
run_tag = datetime.now().strftime("%Y%m%d-%H%M%S")

# Create the output folders up front: a missing folder would otherwise only
# fail when the first checkpoint or the final CSV is written, i.e. after
# training time has already been spent.
for out_dir in ("./log", "./result"):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

# Stop once val_loss has not improved for 10 epochs; write weights only on
# epochs where val_loss improves.
early_stop = EarlyStopping("val_loss", patience=10)
check_point = ModelCheckpoint(
    "./log/%s.cnn.{epoch:03d}.hdf5" % (run_tag,),
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True
)

model_res = model.fit(
    x=[train_word1, train_word2],
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHES,
    validation_data=([dev_word1, dev_word2], dev_y),
    shuffle=True,
    callbacks=[early_stop, check_point]
)

# Predictions from the weights as they are when training ends.
test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
pd.DataFrame(test_pred, columns=["y_pre"]).to_csv("./result/pred_last.csv", index=False)

# Predictions from the best checkpoint of THIS run. glob() returns paths in
# arbitrary order and "./log/*.hdf5" also matches other runs, so restrict the
# pattern to this run's tag and sort. Epoch numbers are zero-padded to three
# digits, so the lexicographically last file is the latest one, and with
# save_best_only=True the latest file is the one with the lowest val_loss.
run_checkpoints = sorted(glob("./log/%s.cnn.*.hdf5" % (run_tag,)))
if run_checkpoints:
    best_checkpoint = run_checkpoints[-1].replace("\\", "/")
    print("load model %s" % (best_checkpoint,))
    model.load_weights(best_checkpoint)
    test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
    pd.DataFrame(test_pred, columns=["y_pre"]).to_csv("./result/pred_best.csv", index=False)
else:
    # Nothing was saved (e.g. val_loss never produced a usable value); do not
    # fall back to some other run's weights.
    print("no checkpoint written for run %s; ./result/pred_best.csv not produced" % (run_tag,))

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 203508 samples, validate on 50878 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
load model ./log/20180703-234800.cnn.034.hdf5


KeyboardInterrupt: 

In [None]:
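# NOTE(review): disabled experiment kept for reference only. It is a two-input CNN variant in
# which every convolution output is pooled with GlobalAveragePooling1D. Delete this cell or
# re-enable it deliberately.
# from sklearn.model_selection import train_test_split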

# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(
#     word1, word2, train_data["label"].values,
#     test_size=0.2
# )

# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

# embedding_layer = Embedding(
#     input_dim=word_embedding_data.shape[0],
#     output_dim=word_embedding_data.shape[1],
#     weights=[word_embedding_data],
#     input_length=WORD_SEQ_LEN,
#     trainable=False
# )

# word_vector1 = embedding_layer(word_input1)
# word_vector2 = embedding_layer(word_input2)

# conv1 = Conv1D(filters=CONV_LEN_1, kernel_size=1, padding="same", activation="relu")
# conv1a = conv1(word_vector1)
# conv1a = GlobalAveragePooling1D()(conv1a)
# conv1b = conv1(word_vector2)
# conv1b = GlobalAveragePooling1D()(conv1b)

# conv2 = Conv1D(filters=CONV_LEN_2, kernel_size=2, padding="same", activation="relu")
# conv2a = conv2(word_vector1)
# conv2a = GlobalAveragePooling1D()(conv2a)
# conv2b = conv2(word_vector2)
# conv2b = GlobalAveragePooling1D()(conv2b)

# conv3 = Conv1D(filters=CONV_LEN_3, kernel_size=3, padding="same", activation="relu")
# conv3a = conv3(word_vector1)
# conv3a = GlobalAveragePooling1D()(conv3a)
# conv3b = conv3(word_vector2)
# conv3b = GlobalAveragePooling1D()(conv3b)

# conv4 = Conv1D(filters=CONV_LEN_4, kernel_size=4, padding="same", activation="relu")
# conv4a = conv4(word_vector1)
# conv4a = GlobalAveragePooling1D()(conv4a)
# conv4b = conv4(word_vector2)
# conv4b = GlobalAveragePooling1D()(conv4b)

# conv5 = Conv1D(filters=CONV_LEN_5, kernel_size=5, padding="same", activation="relu")
# conv5a = conv5(word_vector1)
# conv5a = GlobalAveragePooling1D()(conv5a)
# conv5b = conv5(word_vector2)
# conv5b = GlobalAveragePooling1D()(conv5b)

# conv6 = Conv1D(filters=CONV_LEN_6, kernel_size=6, padding="same", activation="relu")
# conv6a = conv6(word_vector1)
# conv6a = GlobalAveragePooling1D()(conv6a)
# conv6b = conv6(word_vector2)
# conv6b = GlobalAveragePooling1D()(conv6b)

# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
# # merge = concatenate([merge_a, merge_b])

# diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(CONV_LEN,))([merge_a, merge_b])
# mult = Lambda(lambda x: x[0] * x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])
# merge = concatenate([diff, mult])

# x = Dropout(DROP_RATE)(merge)
# x = BatchNormalization()(x)
# x = Dense(DENSE_SIZE, activation="relu")(x)

# x = Dropout(DROP_RATE)(x)
# x = BatchNormalization()(x)
# pred = Dense(1, activation="sigmoid")(x)

# model = Model(
#     inputs = [word_input1, word_input2],
#     outputs = pred
# )
# model.compile(
#     optimizer="adam",
#     loss="binary_crossentropy",
#     metrics=["acc"]
# )

# early_stop = EarlyStopping("val_loss", patience=10)
# check_point = ModelCheckpoint(
#     "./log/%s.cnn.{epoch:03d}.hdf5" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     monitor="val_loss",
#     save_best_only=True,
#     save_weights_only=True
# )

# model_res = model.fit(
#     x=[train_word1, train_word2],
#     y=train_y,
#     batch_size=BATCH_SIZE,
#     epochs=NUM_EPOCHES,
#     validation_data=([dev_word1, dev_word2], dev_y),
#     shuffle=True,
#     callbacks=[early_stop, check_point]
# )

# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
# pd.DataFrame(test_pred, columns=["y_pre"]).to_csv("./result/pred_last.csv", index=False)

# print("load model %s" % (glob("./log/*.hdf5")[-1].replace("\\", "/"),))
# model.load_weights(glob("./log/*.hdf5")[-1].replace("\\", "/"))
# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
# pd.DataFrame(test_pred, columns=["y_pre"]).to_csv("./result/pred_best.csv", index=False)

In [None]:
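# NOTE(review): disabled experiment kept for reference only. It is a 10-fold StratifiedKFold
# variant that trains one model per fold and averages the folds' test predictions; `diff` here
# is the signed difference (no K.abs). The glob(...)[-1] checkpoint lookup near the end is
# unsorted and not limited to the current fold, so fix it before re-enabling this cell.
# from sklearn.model_selection import StratifiedKFold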

# pred_collect = []

# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):
#     train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]
#     dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]
    
#     word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
#     word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
    
#     embedding_layer = Embedding(
#         input_dim=word_embedding_data.shape[0],
#         output_dim=word_embedding_data.shape[1],
#         weights=[word_embedding_data],
#         input_length=WORD_SEQ_LEN,
#         trainable=False
#     )
    
#     word_vector1 = embedding_layer(word_input1)
#     word_vector2 = embedding_layer(word_input2)
    
#     conv1 = Conv1D(filters=128, kernel_size=1, padding="same", activation="relu")
#     conv1a = conv1(word_vector1)
#     conv1a = GlobalAveragePooling1D()(conv1a)
#     conv1b = conv1(word_vector2)
#     conv1b = GlobalAveragePooling1D()(conv1b)
    
#     conv2 = Conv1D(filters=128, kernel_size=2, padding="same", activation="relu")
#     conv2a = conv2(word_vector1)
#     conv2a = GlobalAveragePooling1D()(conv2a)
#     conv2b = conv2(word_vector2)
#     conv2b = GlobalAveragePooling1D()(conv2b)
    
#     conv3 = Conv1D(filters=128, kernel_size=3, padding="same", activation="relu")
#     conv3a = conv3(word_vector1)
#     conv3a = GlobalAveragePooling1D()(conv3a)
#     conv3b = conv3(word_vector2)
#     conv3b = GlobalAveragePooling1D()(conv3b)
    
#     conv4 = Conv1D(filters=128, kernel_size=4, padding="same", activation="relu")
#     conv4a = conv4(word_vector1)
#     conv4a = GlobalAveragePooling1D()(conv4a)
#     conv4b = conv4(word_vector2)
#     conv4b = GlobalAveragePooling1D()(conv4b)
    
#     conv5 = Conv1D(filters=128, kernel_size=5, padding="same", activation="relu")
#     conv5a = conv5(word_vector1)
#     conv5a = GlobalAveragePooling1D()(conv5a)
#     conv5b = conv5(word_vector2)
#     conv5b = GlobalAveragePooling1D()(conv5b)
    
#     conv6 = Conv1D(filters=128, kernel_size=6, padding="same", activation="relu")
#     conv6a = conv6(word_vector1)
#     conv6a = GlobalAveragePooling1D()(conv6a)
#     conv6b = conv6(word_vector2)
#     conv6b = GlobalAveragePooling1D()(conv6b)
    
#     merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
#     merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
    
#     diff = Lambda(lambda x: x[0] - x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])
#     mult = Lambda(lambda x: x[0] * x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])
    
#     merge = concatenate([diff, mult])
    
#     x = Dropout(DROP_RATE)(merge)
#     x = BatchNormalization()(x)
#     x = Dense(DENSE_SIZE, activation="relu")(x)
    
#     x = Dropout(DROP_RATE)(x)
#     x = BatchNormalization()(x)
#     pred = Dense(1, activation="sigmoid")(x)
    
#     model = Model(
#         inputs = [word_input1, word_input2],
#         outputs = pred
#     )
#     model.compile(
#         optimizer="adam",
#         loss="binary_crossentropy",
#         metrics=["acc"]
#     )
    
#     early_stop = EarlyStopping("val_loss", patience=10)
#     check_point = ModelCheckpoint(
#         "./log/cnn_%02d.{epoch:02d}_{val_loss:.3f}.hdf5" % (i + 1),
#         monitor="val_loss",
#         save_best_only=True,
#         save_weights_only=True
#     )
    
#     model_res = model.fit(
#         x=[train_word1, train_word2],
#         y=train_y,
#         batch_size=BATCH_SIZE,
#         epochs=NUM_EPOCHES,
#         validation_data=([dev_word1, dev_word2], dev_y),
#         shuffle=True,
#         callbacks=[early_stop, check_point]
#     )
    
#     print("load model %s" % (glob("./log/*.hdf5")[-1].replace("\\", "/"),))
#     model.load_weights(glob("./log/*.hdf5")[-1].replace("\\", "/"))

#     test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
#     pred_collect.append(pd.DataFrame(test_pred, columns=["y_pre"]))

# pd.DataFrame(pd.concat(pred_collect, axis=1).mean(axis=1), columns=["y_pre"]).to_csv("./result/pred.csv", index=False)