# LAST: 0.217161
# BEST: 0.217487

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime

In [2]:
# All input files live directly under DATA_PATH.
DATA_PATH = "./data/"
TRAIN_PATH, TEST_PATH = (DATA_PATH + name for name in ("train.csv", "test.csv"))
WORD_EMBED_PATH, CHAR_EMBED_PATH = (
    DATA_PATH + name for name in ("word_embed.txt", "char_embed.txt")
)
QUEST_PATH = DATA_PATH + "question.csv"


def _read_embedding(path):
    """Read a headerless, space-separated embedding file, using its first column as the row index."""
    return pd.read_csv(path, sep=" ", header=None, index_col=0)


# Tabular inputs: the train/test pair tables and the question lookup table.
train_data, test_data, question_data = (
    pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH, QUEST_PATH)
)
word_embedding_data = _read_embedding(WORD_EMBED_PATH)
char_embedding_data = _read_embedding(CHAR_EMBED_PATH)

# Turn the space-joined token strings into Python lists of tokens.
for token_column in ("words", "chars"):
    question_data[token_column] = question_data[token_column].str.split(" ")

In [3]:
from keras.preprocessing.text import Tokenizer

# Vocabulary size handed to the Keras Tokenizer (its `num_words` argument).
MAX_WORD_NUMS = 10000

# Fit the word-level tokenizer on every question's token list.
word_tokenizer = Tokenizer(MAX_WORD_NUMS)
word_tokenizer.fit_on_texts(question_data["words"])

# Row k of the embedding matrix must hold the vector of the word whose tokenizer
# index is k, because the Embedding layer looks rows up by that index.  Order the
# vocabulary by index explicitly instead of relying on dict iteration order.
vocab_by_index = sorted(word_tokenizer.word_index, key=word_tokenizer.word_index.get)

# NOTE: this rebinds `word_embedding_data` from the DataFrame loaded earlier to a
# NumPy array, so this cell cannot be re-run on its own (the `.loc` lookup needs
# the DataFrame) -- re-run the data-loading cell first.
word_embedding_data = np.concatenate(
    (
        # Row 0: all-zero vector for index 0, which the tokenizer never assigns
        # to a word (word indices start at 1) and which is used for padding.
        np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),
        # Rows 1..MAX_WORD_NUMS: pretrained vectors of the top-ranked words.
        word_embedding_data.loc[vocab_by_index[:MAX_WORD_NUMS]].values
    ),
    axis=0
)
word_embedding_data.shape

Using TensorFlow backend.


(10001, 300)

In [4]:
from keras.preprocessing.sequence import pad_sequences

# Fixed length (in word tokens) that every question is padded/truncated to.
WORD_SEQ_LEN = 30


def gen_data(data):
    """Encode the two questions of every row of ``data`` as padded word-index arrays.

    ``data`` must carry the question ids in columns ``q1`` and ``q2``; each id is
    looked up in ``question_data`` (column ``qid``) with a left join, the
    question's ``words`` are mapped to indices by ``word_tokenizer`` and the
    result is pre-padded / pre-truncated to ``WORD_SEQ_LEN``.

    Returns a ``(q1_array, q2_array)`` tuple.
    """
    def encode(id_column):
        # Attach the token list of the question referenced by `id_column`.
        joined = data.merge(question_data, how="left", left_on=id_column, right_on="qid")
        sequences = word_tokenizer.texts_to_sequences(joined["words"])
        return pad_sequences(sequences, maxlen=WORD_SEQ_LEN, padding="pre", truncating="pre")

    return encode("q1"), encode("q2")


word1, word2 = gen_data(train_data)
test_word1, test_word2 = gen_data(test_data)

# Display the shapes of all four encoded arrays as the cell output.
tuple(encoded.shape for encoded in (word1, word2, test_word1, test_word2))

((254386, 30), (254386, 30), (172956, 30), (172956, 30))

In [5]:
# Target column of train_data as a NumPy array; used as `y` for the stratified
# fold split and for model fitting/validation below.
label = train_data["label"].values

In [6]:
from keras.layers import Input, Embedding, Dropout, Dense, BatchNormalization, K, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate

# Model / training hyper-parameters used by the model-building cells below.
DROP_RATE = 0.25     # rate for every Dropout layer and for the LSTM dropout / recurrent_dropout
LSTM_SIZE_1 = 256    # units of the first LSTM layer (return_sequences=True)
LSTM_SIZE_2 = 256    # units of the second LSTM layer (return_sequences=False)
DENSE_SIZE = 300     # units of the ReLU Dense layer before the sigmoid output
BATCH_SIZE = 2048    # batch size for both model.fit and model.predict
NUM_EPOCHES = 50     # upper bound on epochs per fit; EarlyStopping may stop sooner

In [7]:
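# NOTE: superseded single hold-out (80/20 train_test_split) variant of the K-fold
# training cell that follows; kept commented out for reference only.
# from sklearn.model_selection import train_test_split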

# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(
#     word1, word2, train_data["label"].values,
#     test_size=0.2
# )

# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

# embedding_layer = Embedding(
#     input_dim=word_embedding_data.shape[0],
#     output_dim=word_embedding_data.shape[1],
#     weights=[word_embedding_data],
#     input_length=WORD_SEQ_LEN,
#     trainable=False
# )

# word_vector1 = embedding_layer(word_input1)
# word_vector2 = embedding_layer(word_input2)

# lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
# word_first_1 = lstm_layer1(word_vector1)
# word_first_1 = Dropout(DROP_RATE)(word_first_1)
# word_first_2 = lstm_layer1(word_vector2)
# word_first_2 = Dropout(DROP_RATE)(word_first_2)

# lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=False)
# word_second_1 = lstm_layer2(word_first_1)
# word_second_2 = lstm_layer2(word_first_2)

# x = concatenate([word_second_1, word_second_2])
# x = Dropout(DROP_RATE)(x)
# x = BatchNormalization()(x)

# x = Dense(DENSE_SIZE, activation="relu")(x)
# x = Dropout(DROP_RATE)(x)
# x = BatchNormalization()(x)

# pred = Dense(1, activation="sigmoid")(x)

# model = Model(inputs=[word_input1, word_input2], outputs=pred)
# model.compile(
#     optimizer="nadam",
#     loss="binary_crossentropy",
#     metrics=["acc"]
# )

# early_stopping = EarlyStopping("val_loss", patience=10)
# check_point = ModelCheckpoint(
#     "./log/%s.multi_lstm.{epoch:03d}.hdf5" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     monitor="val_loss",
#     save_best_only=True,
#     save_weights_only=True
# )

# train_res = model.fit(
#     x=[train_word1, train_word2],
#     y=train_y,
#     batch_size=BATCH_SIZE,
#     epochs=NUM_EPOCHES,
#     validation_data=([dev_word1, dev_word2], dev_y),
#     shuffle=True,
#     callbacks=[early_stopping, check_point]
# )

# print("load model %s" % (glob("./log/*.hdf5")[-1].replace("\\", "/"),))
# model.load_weights(glob("./log/*.hdf5")[-1].replace("\\", "/"))

# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
# pd.DataFrame(test_pred, columns=["y_pre"]).to_csv("./result/pred.csv", index=False)

In [8]:
from sklearn.model_selection import StratifiedKFold

import os

# Output folders used below.  Create them up front so a missing directory cannot
# abort the run (in particular the final CSV write, after all folds have trained).
os.makedirs("./log", exist_ok=True)
os.makedirs("./result", exist_ok=True)


def build_model():
    """Build and compile the two-input, shared-weight stacked-LSTM classifier.

    Both question inputs go through the same frozen embedding (initialised from
    ``word_embedding_data``) and the same two LSTM layers; the two final LSTM
    states are concatenated and fed through a Dense block to a sigmoid output.

    Returns the compiled Keras ``Model``.
    """
    word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
    word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

    # Pretrained embedding, not updated during training, shared by both inputs.
    embedding_layer = Embedding(
        input_dim=word_embedding_data.shape[0],
        output_dim=word_embedding_data.shape[1],
        weights=[word_embedding_data],
        input_length=WORD_SEQ_LEN,
        trainable=False
    )

    word_vector1 = embedding_layer(word_input1)
    word_vector2 = embedding_layer(word_input2)

    # First LSTM keeps the full sequence so it can feed the second LSTM.
    lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
    word_first_1 = lstm_layer1(word_vector1)
    word_first_1 = Dropout(DROP_RATE)(word_first_1)
    word_first_2 = lstm_layer1(word_vector2)
    word_first_2 = Dropout(DROP_RATE)(word_first_2)

    # Second LSTM reduces each question to a single vector.
    lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=False)
    word_second_1 = lstm_layer2(word_first_1)
    word_second_2 = lstm_layer2(word_first_2)

    x = concatenate([word_second_1, word_second_2])
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)

    x = Dense(DENSE_SIZE, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)

    pred = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[word_input1, word_input2], outputs=pred)
    model.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model


# One DataFrame of test-set predictions per fold.
pred_collect = []

for train_index, dev_index in StratifiedKFold(n_splits=10).split(X=word1, y=label):
    train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]
    dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]

    # Drop the previous fold's graph/session so models do not pile up in memory.
    K.clear_session()
    model = build_model()

    # Every checkpoint written for this fold shares this timestamped prefix.
    checkpoint_prefix = "./log/%s.multi_lstm." % (datetime.now().strftime("%Y%m%d-%H%M%S"))

    early_stopping = EarlyStopping("val_loss", patience=10)
    check_point = ModelCheckpoint(
        checkpoint_prefix + "{epoch:03d}.hdf5",
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True
    )

    train_res = model.fit(
        x=[train_word1, train_word2],
        y=train_y,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHES,
        validation_data=([dev_word1, dev_word2], dev_y),
        shuffle=True,
        callbacks=[early_stopping, check_point]
    )

    # Restore this fold's best weights.  glob() returns files in arbitrary order
    # and "./log/*.hdf5" would also match other folds/runs, so restrict the
    # search to this fold's prefix and sort.  With save_best_only=True a file is
    # written only when val_loss improves, and the epoch number is zero-padded,
    # so the lexicographically last file is the best epoch.
    fold_checkpoints = sorted(glob(checkpoint_prefix + "*.hdf5"))
    if not fold_checkpoints:
        raise RuntimeError("no checkpoint was written for prefix %s" % (checkpoint_prefix,))
    best_checkpoint = fold_checkpoints[-1].replace("\\", "/")
    print("load model %s" % (best_checkpoint,))
    model.load_weights(best_checkpoint)

    test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
    pred_collect.append(pd.DataFrame(test_pred, columns=["y_pre"]))

# Average the per-fold test predictions row-wise and write the submission file.
pd.concat(pred_collect, axis=1).mean(axis=1).to_frame("y_pre").to_csv(
    "./result/%s-pred.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
    index=False
)

Train on 228946 samples, validate on 25440 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
load model ./log/20180703-074404.multi_lstm.017.hdf5
Train on 228946 samples, validate on 25440 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
load model ./log/20180703-083358.multi_lstm.018.hdf5
Train on 228947 samples, validate on 25439 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
load model ./log/20180703-103906.multi_lstm.018.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
load model ./log/20180703-113130.multi_lstm.022.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
load model ./log/20180703-123135.multi_lstm.022.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
load model ./log/20180703-133100.multi_lstm.016.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
load model ./log/20180703-142013.multi_lstm.016.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
load model ./log/20180703-150948.multi_lstm.029.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/50
Epoch 2/50


Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
load model ./log/20180703-162436.multi_lstm.018.hdf5
