## 模型结构

- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)
- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中
- 卷积层为双层, 最后的池化层有Average和Max两种
- 对于每个问题, 将所有卷积核结果并起来
- 将两个问题各自并起来的结果, 分别计算[相减并取绝对值]和[逐元素相乘], 再把这两部分并起来
- 融入额外特征: 特征先经过BatchNormalization和一层全连接, 再与上面经过第一层全连接的结果并起来, 送入后面的全连接层

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import time
import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime

In [2]:
# Create the output directories used later in the notebook: ./log/ receives
# model checkpoints and ./result/ receives prediction CSVs.
# Each directory is created on its own with exist_ok=True, so an existing
# ./log/ no longer prevents ./result/ from being created (with the previous
# try/except, the first FileExistsError skipped the second mkdir).
for output_dir in ("./log/", "./result/"):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Every input file lives directly inside the data directory.
DATA_PATH = "./data/"

# Question-pair tables and the question-text lookup table.
TRAIN_PATH = os.path.join(DATA_PATH, "train.csv")
TEST_PATH = os.path.join(DATA_PATH, "test.csv")
QUEST_PATH = os.path.join(DATA_PATH, "question.csv")

# Pretrained embedding files (word level and char level).
WORD_EMBED_PATH = os.path.join(DATA_PATH, "word_embed.txt")
CHAR_EMBED_PATH = os.path.join(DATA_PATH, "char_embed.txt")

# Extra feature tables fed to the model next to the two questions.
TRAIN_FEATURE = os.path.join(DATA_PATH, "train_feature.csv")
TEST_FEATURE = os.path.join(DATA_PATH, "test_feature.csv")

In [4]:
# Question-pair tables; the training table also provides the target column.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
label = train_data["label"].values

# Question lookup table: "words" and "chars" are stored as space-separated
# tokens, so each cell is split into a list of tokens.
question_data = pd.read_csv(QUEST_PATH)
for text_column in ("words", "chars"):
    question_data[text_column] = question_data[text_column].str.split(" ")

# Pretrained embeddings: space-delimited text without a header row; the first
# field becomes the index and the remaining fields are the vector components.
embedding_read_kwargs = {"delimiter": " ", "header": None, "index_col": 0}
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, **embedding_read_kwargs)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, **embedding_read_kwargs)

# Extra features as plain arrays.
# NOTE(review): presumably row-aligned with train_data / test_data - confirm.
train_feature = pd.read_csv(TRAIN_FEATURE).values
test_feature = pd.read_csv(TEST_FEATURE).values

from keras.preprocessing.text import Tokenizer

# Vocabulary cap passed to both tokenizers and used to size the embedding
# matrices.
MAX_COUNT = 10000


def build_embedding_matrix(embedding_frame, tokenizer, max_count=MAX_COUNT):
    """Build the weight matrix for an ``Embedding`` layer.

    Row 0 is an all-zero vector (the Keras tokenizer never assigns index 0 to
    a token). Row ``i`` for ``i >= 1`` holds the pretrained vector of the
    token that ``tokenizer.word_index`` maps to ``i``.

    Parameters
    ----------
    embedding_frame : pandas.DataFrame
        Pretrained vectors, one row per token, indexed by the token string.
    tokenizer : object
        Fitted tokenizer exposing ``word_index`` (token -> 1-based index).
    max_count : int, optional
        Maximum number of tokens (lowest indices first) to keep.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(len(word_index), max_count) + 1, vector_dim)``.

    Raises
    ------
    ValueError
        If ``embedding_frame`` has duplicate index labels (raised by
        ``DataFrame.reindex``).
    """
    word_index = tokenizer.word_index
    # Order tokens explicitly by their assigned index rather than relying on
    # dict iteration order, so that row i always belongs to index i.
    tokens = sorted(word_index, key=word_index.get)[:max_count]
    # reindex (instead of .loc with a list) gives tokens that are missing from
    # the pretrained file a NaN row, which is then zero-filled. .loc would
    # either raise KeyError or silently leave NaN rows, depending on the
    # pandas version.
    vectors = embedding_frame.reindex(tokens).fillna(0.0).values
    padding_row = np.zeros(shape=(1, embedding_frame.shape[1]), dtype=np.float64)
    return np.concatenate((padding_row, vectors), axis=0)


# From here on the *_embedding_data names hold the numpy weight matrices
# (they replace the DataFrames read from disk earlier in this cell).
word_tokenizer = Tokenizer(MAX_COUNT)
word_tokenizer.fit_on_texts(question_data["words"])
word_embedding_data = build_embedding_matrix(word_embedding_data, word_tokenizer)

char_tokenizer = Tokenizer(MAX_COUNT)
char_tokenizer.fit_on_texts(question_data["chars"])
char_embedding_data = build_embedding_matrix(char_embedding_data, char_tokenizer)

word_embedding_data.shape, char_embedding_data.shape

Using TensorFlow backend.


((10001, 300), (3049, 300))

In [5]:
from keras.preprocessing.sequence import pad_sequences

# Fixed length of every encoded question. It is passed as `maxlen` to
# pad_sequences and as the pooling window of the max-pooling layer.
SEQ_LEN = 25

def gen_word_data(data):
    """Encode both questions of every pair as fixed-length word-index arrays.

    `data` must have the question-id columns "q1" and "q2". Each id is looked
    up in `question_data` (left join on "qid"), its "words" token list is
    converted to indices with `word_tokenizer`, and the result is padded or
    truncated at the front to SEQ_LEN.

    Returns a tuple (q1_array, q2_array).
    """
    def encode(question_column):
        joined = data.merge(question_data, how="left", left_on=question_column, right_on="qid")
        sequences = word_tokenizer.texts_to_sequences(joined["words"])
        return pad_sequences(sequences, maxlen=SEQ_LEN, padding="pre", truncating="pre")

    return encode("q1"), encode("q2")
    
def gen_char_data(data):
    """Encode both questions of every pair as fixed-length char-index arrays.

    `data` must have the question-id columns "q1" and "q2". Each id is looked
    up in `question_data` (left join on "qid"), its "chars" token list is
    converted to indices with `char_tokenizer`, and the result is padded or
    truncated at the front to SEQ_LEN.

    Returns a tuple (q1_array, q2_array).
    """
    padded = []
    for question_column in ("q1", "q2"):
        joined = data.merge(question_data, how="left", left_on=question_column, right_on="qid")
        padded.append(char_tokenizer.texts_to_sequences(joined["chars"]))
    first, second = (
        pad_sequences(sequences, maxlen=SEQ_LEN, padding="pre", truncating="pre")
        for sequences in padded
    )
    return first, second

# Encode the training pairs, then the test pairs, at word and char level.
(word1, word2), (char1, char2) = gen_word_data(train_data), gen_char_data(train_data)
(test_word1, test_word2), (test_char1, test_char2) = gen_word_data(test_data), gen_char_data(test_data)

# Display the shape of every encoded array as the cell output.
tuple(
    encoded.shape
    for encoded in (word1, word2, test_word1, test_word2, char1, char2, test_char1, test_char2)
)

((254386, 25),
 (254386, 25),
 (172956, 25),
 (172956, 25),
 (254386, 25),
 (254386, 25),
 (172956, 25),
 (172956, 25))

In [6]:
from keras.models import Model
from keras.layers.merge import concatenate
from keras.optimizers import Adam, SGD, Nadam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import LSTM, Bidirectional, TimeDistributed
from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation

In [7]:
# General training settings.
# NUM_EPOCHES is the `epochs` argument of model.fit, BATCH_SIZE is used for
# both fit and predict, and DROP_RATE is the Dropout rate in the dense head.
NUM_EPOCHES = 25
# NOTE(review): EPOCHES1-3 are not referenced in any visible cell - presumably
# leftovers from an earlier staged-training experiment; confirm before removing.
EPOCHES1 = 5
EPOCHES2 = 25 # 5
EPOCHES3 = 22
BATCH_SIZE = 1024
DROP_RATE = 0.3

# CNN: CONV_LEN<k> is the number of filters of the double conv block that is
# built with kernel_size=<k>.
CONV_LEN1 = 128
CONV_LEN2 = 128
CONV_LEN3 = 128
CONV_LEN4 = 128
CONV_LEN5 = 128
CONV_LEN6 = 128

# LSTM: units of the first and second LSTM layer. LSTM_DROP_RATE is used as
# their dropout and recurrent_dropout and as the Dropout rate between them.
LSTM_SIZE1 = 256
LSTM_SIZE2 = 256
LSTM_DROP_RATE = 0.3

# Dense head: DENSE_SIZE1 is applied to the merged question representation,
# DENSE_FEATURE to the extra features, DENSE_SIZE2 after both are concatenated.
DENSE_SIZE1 = 512
DENSE_SIZE2 = 256
DENSE_FEATURE = 32

In [8]:
def cnn_double_layer(inputa, inputb, filters, kernel_size):
    """Apply a two-layer convolution block to two inputs and pool each one.

    The two Conv1D layers are created once and applied to both inputs, so the
    convolution weights are shared. BatchNormalization, activation and pooling
    layers are created separately for each input.

    For each input the block is: conv -> BN -> relu -> conv -> BN -> relu,
    followed by global average pooling and by max pooling over a window of
    SEQ_LEN (then flattened); both pooled tensors are concatenated.

    Parameters:
        inputa, inputb: the two sequence tensors to process.
        filters: number of filters of both convolutions.
        kernel_size: kernel size of both convolutions.

    Returns:
        (output_a, output_b): the pooled representation of each input.
    """
    shared_convs = [
        Conv1D(filters=filters, kernel_size=kernel_size, padding="same")
        for _ in range(2)
    ]

    def tower(sequence):
        hidden = sequence
        for conv in shared_convs:
            hidden = conv(hidden)
            hidden = BatchNormalization()(hidden)
            hidden = Activation(activation="relu")(hidden)
        pooled_avg = GlobalAveragePooling1D()(hidden)
        pooled_max = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(hidden))
        return concatenate([pooled_avg, pooled_max])

    # Branch a is built completely before branch b, as in the original code.
    output_a = tower(inputa)
    output_b = tower(inputb)
    return output_a, output_b

# WORDS

In [9]:
# from sklearn.model_selection import StratifiedKFold

# best_results = []
# # last_results = []
# best_file_names = []
# dev_predictions = []

# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):  # word/char switch
#     print("-" * 60)
#     print("Fold {} training start...".format(i))
    
#     train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]  # word/char switch
#     dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]  # word/char switch
#     train_f, dev_f = train_feature[train_index, :], train_feature[dev_index, :]
    
#     input1 = Input(shape=(SEQ_LEN,), dtype="int32")
#     input2 = Input(shape=(SEQ_LEN,), dtype="int32")
#     inputf = Input(shape=(train_f.shape[1],), dtype="float32")

#     embedding_layer = Embedding(
#         input_dim=word_embedding_data.shape[0],  # word/char switch
#         output_dim=word_embedding_data.shape[1],  # word/char switch
#         weights=[word_embedding_data],  # word/char switch
#         input_length=SEQ_LEN,
#         trainable=False
#     )
    
#     vector1 = embedding_layer(input1)
#     vector2 = embedding_layer(input2)
    
#     lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
#     layer1a = lstm_layer1(vector1)
#     layer1a = Dropout(LSTM_DROP_RATE)(layer1a)
#     layer1b = lstm_layer1(vector2)
#     layer1b = Dropout(LSTM_DROP_RATE)(layer1b)

#     lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
#     layer2a = lstm_layer2(layer1a)
#     layer2b = lstm_layer2(layer1b)
    
#     conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)
#     conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)
#     conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)
#     conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)
#     conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)
#     conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)
    
#     merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
#     merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
#     diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
#     mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
#     merge = concatenate([diff, mult])
    
#     x = Dropout(DROP_RATE)(merge)
#     x = BatchNormalization()(x)
#     x = Dense(DENSE_SIZE1, activation="relu")(x)
    
#     fe = BatchNormalization()(inputf)
#     fe = Dense(DENSE_FEATURE, activation="relu")(fe)
    
#     x = concatenate([x, fe])
    
#     x = Dropout(DROP_RATE)(x)
#     x = BatchNormalization()(x)
#     x = Dense(DENSE_SIZE2, activation="relu")(x)
#     x = Dropout(DROP_RATE)(x)
#     x = BatchNormalization()(x)
#     pred = Dense(1, activation="sigmoid")(x)
    
#     model = Model(inputs=[input1, input2, inputf], outputs=pred)
#     model.compile(
#         optimizer="nadam",
#         loss="binary_crossentropy",
#         metrics=["acc"]
#     )
    
#     early_stopping = EarlyStopping("val_loss", patience=6)
#     lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0005)
#     check_point = ModelCheckpoint(
#         "./log/%s.Multi_LSTM_CNN_v5.word.{epoch:03d}.hdf5" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
#         monitor="val_loss",
#         save_best_only=True,
#     )
    
#     fit_res = model.fit(
#         x=[train_x1, train_x2, train_f],
#         y=train_y,
#         batch_size=BATCH_SIZE,
#         epochs=NUM_EPOCHES,
#         validation_data=([dev_x1, dev_x2, dev_f], dev_y),
#         shuffle=True,
#         callbacks=[early_stopping, lr_reducer, check_point]
#     )
    
# #     pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch
# #     last_results.append(pd.DataFrame(pred_last, columns=["y_pre"]))
    
#     best_model_file = glob("./log/*.hdf5")[-1].replace("\\", "/")
#     best_file_names.append(best_model_file)
#     print("load model %s" % (best_model_file,))
#     model.load_weights(best_model_file)
#     pred_best = model.predict([test_word1, test_word2, test_feature], batch_size=BATCH_SIZE)  # word/char switch
#     best_results.append(pd.DataFrame(pred_best, columns=["y_pre"]))
    
#     dev_pred = model.predict([dev_x1, dev_x2, dev_f], batch_size=BATCH_SIZE)
#     dev_result = pd.DataFrame({"pred": dev_pred.ravel(), "label": dev_y})
#     dev_predictions.append(dev_result)

# # pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=["y_pre"]).to_csv(
# #     "./result/%s-Multi_LSTM_CNN_v5_word_last.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
# #     index=False
# # )
# pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=["y_pre"]).to_csv(
#     "./result/%s-Multi_LSTM_CNN_v5_word_best.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
#     index=False
# )

# total_dev = pd.concat(dev_predictions, axis=0)
# total_dev.to_csv(
#     "./result/%s-Multi_LSTM_CNN_v5_word_dev_result.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),
#     index=False
# )

# model_path = "./log/" + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
# os.mkdir(model_path)
# for model_name in best_file_names:
#     abs_name = os.path.split(model_name)[1]
#     os.rename(model_name, model_path + abs_name)

# CHARS

In [10]:
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of the char-level model.
#
# For every fold a fresh model is built and trained, the best checkpoint of
# that fold (lowest val_loss) is reloaded, and predictions are made for the
# test set and for the fold's held-out part. Afterwards the test predictions
# are averaged over folds and written to ./result/, the held-out predictions
# are written to ./result/, and the best checkpoints are moved into a
# timestamped sub-directory of ./log/.
#
# Lines tagged "word/char switch" are the ones to edit to train the
# word-level variant instead.
best_results = []      # per fold: test-set predictions of the best checkpoint
best_file_names = []   # per fold: path of the best checkpoint
dev_predictions = []   # per fold: held-out predictions together with labels

for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=char1, y=label)):  # word/char switch
    print("-" * 60)
    print("Fold {} training start...".format(i))

    train_x1, train_x2, train_y = char1[train_index, :], char2[train_index, :], label[train_index]  # word/char switch
    dev_x1, dev_x2, dev_y = char1[dev_index, :], char2[dev_index, :], label[dev_index]  # word/char switch
    train_f, dev_f = train_feature[train_index, :], train_feature[dev_index, :]

    # --- inputs: the two encoded questions and the extra feature vector ---
    input1 = Input(shape=(SEQ_LEN,), dtype="int32")
    input2 = Input(shape=(SEQ_LEN,), dtype="int32")
    inputf = Input(shape=(train_f.shape[1],), dtype="float32")

    # Frozen pretrained embedding, shared by both questions.
    embedding_layer = Embedding(
        input_dim=char_embedding_data.shape[0],  # word/char switch
        output_dim=char_embedding_data.shape[1],  # word/char switch
        weights=[char_embedding_data],  # word/char switch
        input_length=SEQ_LEN,
        trainable=False
    )

    vector1 = embedding_layer(input1)
    vector2 = embedding_layer(input2)

    # --- two stacked LSTMs returning full sequences, shared by both questions ---
    lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
    layer1a = lstm_layer1(vector1)
    layer1a = Dropout(LSTM_DROP_RATE)(layer1a)
    layer1b = lstm_layer1(vector2)
    layer1b = Dropout(LSTM_DROP_RATE)(layer1b)

    lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
    layer2a = lstm_layer2(layer1a)
    layer2b = lstm_layer2(layer1b)

    # --- six double conv blocks with kernel sizes 1..6 on the LSTM outputs ---
    conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)
    conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)
    conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)
    conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)
    conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)
    conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)

    # --- compare the two question representations ---
    merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
    merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
    diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
    mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
    merge = concatenate([diff, mult])

    # --- dense head; the extra features join after the first dense layer ---
    x = Dropout(DROP_RATE)(merge)
    x = BatchNormalization()(x)
    x = Dense(DENSE_SIZE1, activation="relu")(x)

    fe = BatchNormalization()(inputf)
    fe = Dense(DENSE_FEATURE, activation="relu")(fe)

    x = concatenate([x, fe])

    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)
    x = Dense(DENSE_SIZE2, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[input1, input2, inputf], outputs=pred)
    model.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=["acc"]
    )

    # All checkpoints written during this fold share one timestamped prefix,
    # which is used after training to find exactly this fold's files.
    checkpoint_prefix = "./log/%s.Multi_LSTM_CNN_v5.char." % (datetime.now().strftime("%Y%m%d-%H%M%S"),)  # word/char switch

    early_stopping = EarlyStopping("val_loss", patience=6)
    lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0005)
    check_point = ModelCheckpoint(
        checkpoint_prefix + "{epoch:03d}.hdf5",
        monitor="val_loss",
        save_best_only=True,
    )

    fit_res = model.fit(
        x=[train_x1, train_x2, train_f],
        y=train_y,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHES,
        validation_data=([dev_x1, dev_x2, dev_f], dev_y),
        shuffle=True,
        callbacks=[early_stopping, lr_reducer, check_point]
    )

    # With save_best_only=True a file is written only when val_loss improves,
    # so the checkpoint with the highest epoch number is the best one. glob
    # returns paths in arbitrary order, so the list is restricted to this
    # fold's prefix and sorted (the epoch number is zero-padded) before the
    # last entry is taken.
    fold_checkpoints = sorted(glob(checkpoint_prefix + "*.hdf5"))
    if not fold_checkpoints:
        raise RuntimeError("no checkpoint was saved for fold %d (prefix %s)" % (i, checkpoint_prefix))
    best_model_file = fold_checkpoints[-1].replace("\\", "/")
    best_file_names.append(best_model_file)
    print("load model %s" % (best_model_file,))
    model.load_weights(best_model_file)
    pred_best = model.predict([test_char1, test_char2, test_feature], batch_size=BATCH_SIZE)  # word/char switch
    best_results.append(pd.DataFrame(pred_best, columns=["y_pre"]))

    dev_pred = model.predict([dev_x1, dev_x2, dev_f], batch_size=BATCH_SIZE)
    dev_result = pd.DataFrame({"pred": dev_pred.ravel(), "label": dev_y})
    dev_predictions.append(dev_result)

# Average the per-fold test predictions into a single submission column.
pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=["y_pre"]).to_csv(
    "./result/%s-Multi_LSTM_CNN_v5_char_best.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
    index=False
)

# Held-out predictions of all folds, stacked in fold order.
total_dev = pd.concat(dev_predictions, axis=0)
total_dev.to_csv(
    "./result/%s-Multi_LSTM_CNN_v5_char_dev_result.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")), # word/char switch
    index=False
)

# Move the best checkpoint of every fold into a timestamped sub-directory.
model_path = "./log/" + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
os.mkdir(model_path)
for model_name in best_file_names:
    abs_name = os.path.split(model_name)[1]
    os.rename(model_name, model_path + abs_name)

------------------------------------------------------------
Fold 0 training start...
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 228946 samples, validate on 25440 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
load model ./log/20180715-102931.Multi_LSTM_CNN_v5.char.023.hdf5
------------------------------------------------------------
Fold 1 training start...
Train on 228946 samples, validate on 25440 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25

Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25

KeyboardInterrupt: 