In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime

In [2]:
# --- File locations (all relative to the notebook's working directory) ---
DATA_PATH = "./data/"
TRAIN_PATH = DATA_PATH + "train.csv"
TEST_PATH = DATA_PATH + "test.csv"
WORD_EMBED_PATH = DATA_PATH + "word_embed.txt"
CHAR_EMBED_PATH = DATA_PATH + "char_embed.txt"
QUEST_PATH = DATA_PATH + "question.csv"


def _load_embedding(path):
    """Read an embedding file: space-delimited, no header row, first column used as the index."""
    return pd.read_csv(path, delimiter=" ", header=None, index_col=0)


# --- Raw tables ---
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
question_data = pd.read_csv(QUEST_PATH)

# --- Pre-trained embeddings, indexed by token ---
word_embedding_data = _load_embedding(WORD_EMBED_PATH)
char_embedding_data = _load_embedding(CHAR_EMBED_PATH)

# Turn the space-joined token strings into Python lists of tokens.
for token_column in ("words", "chars"):
    question_data[token_column] = question_data[token_column].str.split(" ")

# Target vector for the training pairs.
label = train_data["label"].values

In [3]:
from keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizer and used to size the embedding matrix.
MAX_WORD_NUMS = 10000

# Fit the tokenizer on the already-split word lists of every question.
word_tokenizer = Tokenizer(MAX_WORD_NUMS)
word_tokenizer.fit_on_texts(question_data["words"])

# Keep the first MAX_WORD_NUMS vocabulary entries in word_index order.
# NOTE(review): Keras assigns word_index ids starting at 1, presumably by descending
# frequency, so row k of the matrix below lines up with token id k - confirm.
kept_words = list(word_tokenizer.word_index)[:MAX_WORD_NUMS]

# Row 0 is an all-zero vector for the padding id 0; rows 1.. hold the kept word vectors.
padding_row = np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64)
kept_vectors = word_embedding_data.loc[kept_words].values

# NOTE: this rebinds `word_embedding_data` from a DataFrame to a plain ndarray, so the
# cell cannot be re-run without first re-running the data-loading cell (`.loc` is gone).
word_embedding_data = np.vstack((padding_row, kept_vectors))
word_embedding_data.shape

Using TensorFlow backend.


(10001, 300)

In [4]:
from keras.preprocessing.sequence import pad_sequences

# Fixed number of word ids per question fed to the network.
WORD_SEQ_LEN = 30

def gen_data(data):
    """Encode both questions of every pair in `data` as fixed-length word-id sequences.

    Parameters
    ----------
    data : DataFrame with `q1` and `q2` columns; each is left-joined against
        `question_data["qid"]` to fetch the question's word list.

    Returns
    -------
    (seq_q1, seq_q2) : two arrays produced by `pad_sequences`, one row per row of
        `data`, each `WORD_SEQ_LEN` wide. Shorter sequences are padded at the front
        and longer ones lose their leading tokens.
    """
    def _encode(question_column):
        # Left join keeps the row order of `data`.
        joined = data.merge(question_data, how="left", left_on=question_column, right_on="qid")
        token_ids = word_tokenizer.texts_to_sequences(joined["words"])
        return pad_sequences(token_ids, maxlen=WORD_SEQ_LEN, padding="pre", truncating="pre")

    return _encode("q1"), _encode("q2")

# Encode the training pairs first, then the test pairs.
(word1, word2), (test_word1, test_word2) = gen_data(train_data), gen_data(test_data)

# Sanity check: display the four array shapes.
tuple(encoded.shape for encoded in (word1, word2, test_word1, test_word2))

((254386, 30), (254386, 30), (172956, 30), (172956, 30))

In [5]:
from keras.models import Model
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM, Bidirectional, TimeDistributed
from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K

In [6]:
# general
NUM_EPOCHES = 50      # upper bound on epochs per fit call
BATCH_SIZE = 1024     # used for both training and prediction
DENSE_SIZE = 300 # 512
DROP_RATE = 0.3       # dropout rate used throughout the network

# cnn: filter count for each of the six convolution branches (kernel sizes 1..6)
CONV_LEN_1 = CONV_LEN_2 = CONV_LEN_3 = 128
CONV_LEN_4 = CONV_LEN_5 = CONV_LEN_6 = 128
# Total width of the concatenated convolution features of one question.
CONV_LEN = sum((CONV_LEN_1, CONV_LEN_2, CONV_LEN_3, CONV_LEN_4, CONV_LEN_5, CONV_LEN_6))

# lstm: hidden sizes of the two stacked recurrent layers
LSTM_SIZE_1 = 256
LSTM_SIZE_2 = 256
# NOTE(review): not referenced by the cells visible in this notebook; the LSTM layers
# are constructed with DROP_RATE instead.
DROP_RATE_LSTM = 0.3

In [7]:
def cnn_layer_1(input1, input2, kernel_size, filters):
    """Apply one shared Conv1D to both inputs and average-pool each result over time.

    Parameters
    ----------
    input1, input2 : the two sequence tensors to convolve; both pass through the
        same Conv1D instance, so they share its weights.
    kernel_size : width of the convolution window.
    filters : number of convolution filters.

    Returns
    -------
    (pooled1, pooled2) : the globally average-pooled convolution features of
        `input1` and `input2`, in that order.
    """
    shared_conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")

    def _conv_then_pool(branch):
        # Each branch gets its own pooling layer instance.
        features = shared_conv(branch)
        return GlobalAveragePooling1D()(features)

    pooled1 = _conv_then_pool(input1)
    pooled2 = _conv_then_pool(input2)
    return pooled1, pooled2

In [8]:
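# NOTE: everything in this cell is disabled. It is the single hold-out (80/20
# train_test_split) variant of the pipeline; the next cell runs the same model
# with 10-fold cross-validation instead.
# from sklearn.model_selection import train_test_split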

# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(
#     word1, word2, train_data["label"].values,
#     test_size=0.2
# )

# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

# embedding_layer = Embedding(
#     input_dim=word_embedding_data.shape[0],
#     output_dim=word_embedding_data.shape[1],
#     weights=[word_embedding_data],
#     input_length=WORD_SEQ_LEN,
#     trainable=False
# )

# word_vector1 = embedding_layer(word_input1)
# word_vector2 = embedding_layer(word_input2)

# lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
# word_first_1 = lstm_layer1(word_vector1)
# word_first_1 = Dropout(DROP_RATE)(word_first_1)
# word_first_2 = lstm_layer1(word_vector2)
# word_first_2 = Dropout(DROP_RATE)(word_first_2)

# lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
# word_second_1 = lstm_layer2(word_first_1)
# word_second_2 = lstm_layer2(word_first_2)

# conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)
# conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)
# conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)
# conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)
# conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)
# conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)

# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
# merge = concatenate([diff, mult])

# x = Dropout(DROP_RATE)(merge)
# x = BatchNormalization()(x)

# x = Dense(DENSE_SIZE, activation="relu")(x)
# x = Dropout(DROP_RATE)(x)
# x = BatchNormalization()(x)

# pred = Dense(1, activation="sigmoid")(x)

# model = Model(inputs=[word_input1, word_input2], outputs=pred)
# model.compile(
#     optimizer="nadam",
#     loss="binary_crossentropy",
#     metrics=["acc"]
# )

# early_stopping = EarlyStopping("val_loss", patience=10)
# check_point = ModelCheckpoint(
#     "./log/%s.multi_lstm_cnn.{epoch:03d}.hdf5" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     monitor="val_loss",
#     save_best_only=True,
#     save_weights_only=True
# )

# train_res = model.fit(
#     x=[train_word1, train_word2],
#     y=train_y,
#     batch_size=BATCH_SIZE,
#     epochs=NUM_EPOCHES,
#     validation_data=([dev_word1, dev_word2], dev_y),
#     shuffle=True,
#     callbacks=[early_stopping, check_point]
# )

# pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
# pd.DataFrame(pred_last, columns=["y_pre"]).to_csv(
#     "./result/%s-multilstm_cnn_pred_last.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     index=False
# )

# print("load model %s" % (glob("./log/*.hdf5")[-1].replace("\\", "/"),))
# model.load_weights(glob("./log/*.hdf5")[-1].replace("\\", "/"))
# pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
# pd.DataFrame(pred_best, columns=["y_pre"]).to_csv(
#     "./result/%s-multilstm_cnn_pred_best.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     index=False
# )

In [9]:
import os

from sklearn.model_selection import StratifiedKFold


def build_model():
    """Build and compile a fresh siamese LSTM + multi-kernel CNN pair classifier.

    Both questions pass through the same frozen embedding, the same two stacked
    LSTM layers and the same six Conv1D branches (kernel sizes 1..6). The pooled
    features of the two questions are combined as |a - b| and a * b and fed to a
    dense head with a sigmoid output.

    Returns
    -------
    A compiled Keras `Model` taking `[word_ids_q1, word_ids_q2]`.
    """
    word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
    word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

    # Pre-trained vectors are loaded as fixed (non-trainable) weights.
    embedding_layer = Embedding(
        input_dim=word_embedding_data.shape[0],
        output_dim=word_embedding_data.shape[1],
        weights=[word_embedding_data],
        input_length=WORD_SEQ_LEN,
        trainable=False
    )

    word_vector1 = embedding_layer(word_input1)
    word_vector2 = embedding_layer(word_input2)

    lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
    word_first_1 = lstm_layer1(word_vector1)
    word_first_1 = Dropout(DROP_RATE)(word_first_1)
    word_first_2 = lstm_layer1(word_vector2)
    word_first_2 = Dropout(DROP_RATE)(word_first_2)

    lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
    word_second_1 = lstm_layer2(word_first_1)
    word_second_2 = lstm_layer2(word_first_2)

    conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)
    conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)
    conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)
    conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)
    conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)
    conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)

    merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
    merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
    # Symmetric pair features: element-wise distance and element-wise product.
    diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
    mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
    merge = concatenate([diff, mult])

    x = Dropout(DROP_RATE)(merge)
    x = BatchNormalization()(x)

    x = Dense(DENSE_SIZE, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)

    pred = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[word_input1, word_input2], outputs=pred)
    model.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model


def _latest_checkpoint(prefix):
    """Return the last checkpoint written under `prefix`, or None if there is none.

    glob() gives no ordering guarantee, so the matches are sorted explicitly.
    The epoch number is zero-padded to three digits, so lexicographic order is
    epoch order, and with save_best_only=True the last file written is the best.
    """
    matches = sorted(glob(prefix + ".*.hdf5"))
    return matches[-1].replace("\\", "/") if matches else None


# Fail fast on missing output folders instead of after hours of training.
for output_dir in ("./log", "./result"):
    os.makedirs(output_dir, exist_ok=True)

best_results = []
last_results = []

for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):
    train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]
    dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]

    # Drop the previous fold's graph so memory does not grow across the ten folds.
    K.clear_session()
    model = build_model()

    # Per-fold prefix: lets the best checkpoint be looked up for THIS fold only,
    # instead of picking an arbitrary file out of everything in ./log.
    checkpoint_prefix = "./log/%s.multi_lstm_cnn" % (datetime.now().strftime("%Y%m%d-%H%M%S"))

    early_stopping = EarlyStopping("val_loss", patience=10)
    check_point = ModelCheckpoint(
        checkpoint_prefix + ".{epoch:03d}.hdf5",
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True
    )

    train_res = model.fit(
        x=[train_word1, train_word2],
        y=train_y,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHES,
        validation_data=([dev_word1, dev_word2], dev_y),
        shuffle=True,
        callbacks=[early_stopping, check_point]
    )

    # Predictions from the weights at the final epoch.
    pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
    last_results.append(pd.DataFrame(pred_last, columns=["y_pre"]))

    # Predictions from the best (lowest val_loss) checkpoint of this fold.
    best_path = _latest_checkpoint(checkpoint_prefix)
    if best_path is None:
        # No checkpoint was written for this fold; fall back to the last-epoch weights.
        print("fold %d: no checkpoint found, using last-epoch weights" % (i,))
        pred_best = pred_last
    else:
        print("load model %s" % (best_path,))
        model.load_weights(best_path)
        pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
    best_results.append(pd.DataFrame(pred_best, columns=["y_pre"]))

# Average the per-fold predictions; one shared timestamp keeps the two files paired.
result_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
pd.concat(last_results, axis=1).mean(axis=1).to_frame("y_pre").to_csv(
    "./result/%s-multilstm_cnn_pred_last.csv" % (result_stamp,),
    index=False
)
pd.concat(best_results, axis=1).mean(axis=1).to_frame("y_pre").to_csv(
    "./result/%s-multilstm_cnn_pred_best.csv" % (result_stamp,),
    index=False
)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 228946 samples, validate on 25440 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

KeyboardInterrupt: 

In [None]:
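# NOTE: everything in this cell is disabled. It is a stacking experiment: rebuild the
# network for each saved weight file in ./models, take an intermediate layer's output
# as features, average them across models, and train an XGBoost classifier on top.
# train_encode = []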
# test_encode = []

# for model_name in glob("./models/*.hdf5"):
#     word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
#     word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

#     embedding_layer = Embedding(
#         input_dim=word_embedding_data.shape[0],
#         output_dim=word_embedding_data.shape[1],
#         weights=[word_embedding_data],
#         input_length=WORD_SEQ_LEN,
#         trainable=False
#     )

#     word_vector1 = embedding_layer(word_input1)
#     word_vector2 = embedding_layer(word_input2)

#     lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
#     word_first_1 = lstm_layer1(word_vector1)
#     word_first_1 = Dropout(DROP_RATE)(word_first_1)
#     word_first_2 = lstm_layer1(word_vector2)
#     word_first_2 = Dropout(DROP_RATE)(word_first_2)

#     lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)
#     word_second_1 = lstm_layer2(word_first_1)
#     word_second_2 = lstm_layer2(word_first_2)

#     conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)
#     conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)
#     conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)
#     conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)
#     conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)
#     conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)

#     merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
#     merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
#     diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
#     mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
#     merge = concatenate([diff, mult])

#     x = Dropout(DROP_RATE)(merge)
#     x = BatchNormalization()(x)

#     x = Dense(DENSE_SIZE, activation="relu")(x)
#     x = Dropout(DROP_RATE)(x)
#     x = BatchNormalization()(x)

#     pred = Dense(1, activation="sigmoid")(x)

#     model = Model(inputs=[word_input1, word_input2], outputs=pred)
#     model.load_weights(model_name.replace("\\", "/"))
    
#     encode_model = Model(inputs=[word_input1, word_input2], outputs=model.layers[-4].output)
#     train_feature = encode_model.predict([word1, word2], batch_size=BATCH_SIZE)
#     test_feature = encode_model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)
#     train_encode.append(train_feature)
#     test_encode.append(test_feature)

# train_dense = train_encode[0].copy()
# for t in train_encode[1:]:
#     train_dense += t
# train_dense = train_dense / 10
# pd.DataFrame(train_dense).to_csv("train_input.csv", index=False)

# test_dense = test_encode[0].copy()
# for t in test_encode[1:]:
#     test_dense += t
# test_dense = test_dense / 10
# pd.DataFrame(test_dense).to_csv("test_input.csv", index=False)

# import xgboost as xgb
# from sklearn.model_selection import train_test_split

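# FIXME: `mean_dense` is never defined in this notebook; `train_dense` (computed
# above) is presumably what was meant - confirm before re-enabling this cell.
# xgb_train_x, xgb_dev_x, xgb_train_y, xgb_dev_y = train_test_split(mean_dense, label, test_size=0.2, stratify=label)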

# train_data = xgb.DMatrix(xgb_train_x, xgb_train_y)
# dev_data = xgb.DMatrix(xgb_dev_x, xgb_dev_y)

# params = {
#     "objective": "binary:logistic",
#     "eval_metric": "logloss",
#     "eta": 0.01,
#     "max_depth": 5,
#     "subsample": 0.8,
#     "colsample_bytree": 0.8,
#     "lambda": 1,
# }


# boost = xgb.train(
#     params=params,
#     dtrain=train_data,
#     num_boost_round=200,
#     evals=[(dev_data, "dev")],
#     early_stopping_rounds=10,
# )

# test_pred = boost.predict(xgb.DMatrix(test_dense))
# pd.DataFrame(test_pred, columns=["y_pre"]).to_csv(
#     "./result/%s-xgb_multilstm_cnn_pred_best.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     index=False
# )