## 模型结构

- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)
- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中
- 卷积层为单层
- 对于每个问题, 每个卷积层的输出分别做最大池化和全局平均池化, 再将所有卷积层的池化结果拼接起来
- 对两个问题各自拼接后的向量, 分别计算[相减并取绝对值]和[逐元素相乘], 再将这两个结果拼接
- 输入到两层Dense隐藏层(每层之前有Dropout和BatchNormalization), 最后接一个sigmoid激活的单节点Dense层, 输出结果

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime

In [2]:
# Locations of the raw competition files.
DATA_PATH = "./data/"
TRAIN_PATH = DATA_PATH + "train.csv"
TEST_PATH = DATA_PATH + "test.csv"
WORD_EMBED_PATH = DATA_PATH + "word_embed.txt"
CHAR_EMBED_PATH = DATA_PATH + "char_embed.txt"
QUEST_PATH = DATA_PATH + "question.csv"

train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
question_data = pd.read_csv(QUEST_PATH)
# Embedding files are space separated with the token in the first column,
# which becomes the index so that rows can be looked up by token.
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=" ", header=None, index_col=0)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=" ", header=None, index_col=0)

# Each question is stored as a space separated token string; turn it into a token list.
question_data["words"] = question_data["words"].str.split(" ")
question_data["chars"] = question_data["chars"].str.split(" ")

label = train_data["label"].values

from keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizers and used to size the embedding matrices.
MAX_COUNT = 10000

# lower=False: tokens are looked up verbatim in the embedding index below, so the
# tokenizer must not change their case (some Keras versions lower-case list inputs).
word_tokenizer = Tokenizer(num_words=MAX_COUNT, lower=False)
word_tokenizer.fit_on_texts(question_data["words"])

# Order the vocabulary explicitly by token id so that matrix row k always belongs
# to token id k, instead of relying on dict iteration order.
word_vocab = sorted(word_tokenizer.word_index, key=word_tokenizer.word_index.get)[:MAX_COUNT]
word_embedding_data = np.concatenate(
    (
        # Row 0 is an all-zero vector for id 0, which pad_sequences uses as padding.
        np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),
        word_embedding_data.loc[word_vocab].values
    ),
    axis=0
)
# Warnings are silenced at the top of the notebook, so check explicitly that
# every vocabulary token was found in the embedding file.
assert not pd.isnull(word_embedding_data).any(), "some word tokens have no pretrained embedding"

char_tokenizer = Tokenizer(num_words=MAX_COUNT, lower=False)
char_tokenizer.fit_on_texts(question_data["chars"])

char_vocab = sorted(char_tokenizer.word_index, key=char_tokenizer.word_index.get)[:MAX_COUNT]
char_embedding_data = np.concatenate(
    (
        np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),
        char_embedding_data.loc[char_vocab].values
    ),
    axis=0
)
assert not pd.isnull(char_embedding_data).any(), "some char tokens have no pretrained embedding"

word_embedding_data.shape, char_embedding_data.shape

Using TensorFlow backend.


((10001, 300), (3049, 300))

In [3]:
from keras.preprocessing.sequence import pad_sequences

# Fixed number of token ids per question: pad_sequences below pads/truncates to
# this length and the model inputs are declared with the same length.
# NOTE(review): the recorded output of this cell shows arrays of width 30, so it
# appears to have been produced with a different SEQ_LEN - re-run to refresh it.
SEQ_LEN = 25

def gen_word_data(data):
    """Encode both questions of every pair in `data` as padded word-id arrays.

    `data` needs the columns "q1" and "q2"; each is left-joined against
    question_data on "qid" to fetch the token list in "words".

    Returns a tuple (q1_ids, q2_ids); each array has SEQ_LEN columns, padded
    and truncated at the front.
    """
    encoded = []
    for question_col in ("q1", "q2"):
        joined = data.merge(question_data, how="left", left_on=question_col, right_on="qid")
        token_ids = word_tokenizer.texts_to_sequences(joined["words"])
        encoded.append(
            pad_sequences(token_ids, maxlen=SEQ_LEN, padding="pre", truncating="pre")
        )
    return tuple(encoded)
    
def gen_char_data(data):
    """Encode both questions of every pair in `data` as padded char-id arrays.

    `data` needs the columns "q1" and "q2"; each is left-joined against
    question_data on "qid" to fetch the token list in "chars".

    Returns a tuple (q1_ids, q2_ids); each array has SEQ_LEN columns, padded
    and truncated at the front.
    """
    encoded = []
    for question_col in ("q1", "q2"):
        joined = data.merge(question_data, how="left", left_on=question_col, right_on="qid")
        token_ids = char_tokenizer.texts_to_sequences(joined["chars"])
        encoded.append(
            pad_sequences(token_ids, maxlen=SEQ_LEN, padding="pre", truncating="pre")
        )
    return tuple(encoded)

# Word-level encodings of the training and test pairs.
word1, word2 = gen_word_data(train_data)
test_word1, test_word2 = gen_word_data(test_data)
# Character-level encodings of the same pairs.
char1, char2 = gen_char_data(train_data)
test_char1, test_char2 = gen_char_data(test_data)

# Display the shape of every encoded array as a sanity check.
tuple(
    encoded.shape
    for encoded in (
        word1, word2, test_word1, test_word2,
        char1, char2, test_char1, test_char2,
    )
)

((254386, 30),
 (254386, 30),
 (172956, 30),
 (172956, 30),
 (254386, 30),
 (254386, 30),
 (172956, 30),
 (172956, 30))

In [4]:
from keras.models import Model
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM, Bidirectional, TimeDistributed
from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K

In [5]:
# --- training schedule / regularisation ---
NUM_EPOCHES = 30  # upper bound on epochs per fold
# Three-way split of the epoch budget; not referenced by the cells visible here.
EPOCHES1, EPOCHES2, EPOCHES3 = 3, 5, 22
BATCH_SIZE = 1024
DROP_RATE = 0.3  # dropout rate in the dense head
PATIENCE = 6  # early-stopping patience, in epochs

# --- convolution: number of filters for each of the six kernel sizes ---
CONV_LEN1 = CONV_LEN2 = CONV_LEN3 = CONV_LEN4 = CONV_LEN5 = CONV_LEN6 = 128
CONV_LEN = sum(
    (CONV_LEN1, CONV_LEN2, CONV_LEN3, CONV_LEN4, CONV_LEN5, CONV_LEN6)
)

# --- recurrent encoder: units of the two stacked LSTMs and their dropout rate ---
LSTM_SIZE1, LSTM_SIZE2 = 256, 256
LSTM_DROP_RATE = 0.3

# --- dense head: units of the two hidden layers ---
DENSE_SIZE1, DENSE_SIZE2 = 512, 256

In [6]:
def cnn_layer1(inputa, inputb, filters, kernel_size):
    """Convolve both inputs with one shared Conv1D, then average over time.

    The same Conv1D instance is applied to `inputa` and `inputb`, so both
    branches use the same kernel weights. Each result is reduced with its own
    GlobalAveragePooling1D layer.

    Returns the pair (pooled_a, pooled_b).
    """
    shared_conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")
    pooled_branches = []
    for branch_input in (inputa, inputb):
        feature_map = shared_conv(branch_input)
        pooled_branches.append(GlobalAveragePooling1D()(feature_map))
    pooled_a, pooled_b = pooled_branches
    return pooled_a, pooled_b
    
def cnn_layer2(inputa, inputb, filters, kernel_size):
    """Convolve both inputs with one shared Conv1D, then max-pool and flatten.

    The same Conv1D instance is applied to `inputa` and `inputb`, so both
    branches use the same kernel weights. Pooling uses a window of SEQ_LEN
    steps followed by Flatten.
    # assumes the inputs have exactly SEQ_LEN time steps, so the pool window
    # spans the whole sequence - TODO confirm for other callers

    Returns the pair (pooled_a, pooled_b).
    """
    shared_conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")
    pooled_branches = []
    for branch_input in (inputa, inputb):
        feature_map = shared_conv(branch_input)
        window_max = MaxPool1D(pool_size=SEQ_LEN)(feature_map)
        pooled_branches.append(Flatten()(window_max))
    pooled_a, pooled_b = pooled_branches
    return pooled_a, pooled_b

def cnn_layer3(inputa, inputb, filters, kernel_size):
    """Convolve both inputs with one shared Conv1D and pool each two ways.

    The same Conv1D instance is applied to `inputa` and `inputb`, so both
    branches use the same kernel weights. For each branch the convolution
    output is (1) max-pooled with a window of SEQ_LEN steps and flattened and
    (2) globally average-pooled; the two vectors are concatenated in that order.
    # assumes the inputs have exactly SEQ_LEN time steps, so the max-pool
    # window spans the whole sequence - TODO confirm for other callers

    Returns the pair (features_a, features_b).
    """
    shared_conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")

    def _max_and_mean(branch_input):
        # Max-pooled vector first, averaged vector second.
        feature_map = shared_conv(branch_input)
        window_max = MaxPool1D(pool_size=SEQ_LEN)(feature_map)
        max_vector = Flatten()(window_max)
        mean_vector = GlobalAveragePooling1D()(feature_map)
        return concatenate([max_vector, mean_vector])

    features_a = _max_and_mean(inputa)
    features_b = _max_and_mean(inputb)
    return features_a, features_b

# WORDS

In [7]:
import os

from sklearn.model_selection import StratifiedKFold


def build_model(embedding_matrix):
    """Build and compile one siamese LSTM + multi-width CNN matching model.

    Both questions pass through the same frozen embedding, the same two
    stacked LSTMs and the same six convolutions (kernel sizes 1-6, pooled by
    cnn_layer3). The two resulting question vectors are compared with
    |a - b| and a * b, and the comparison is classified by two hidden Dense
    layers followed by a single sigmoid unit.

    Args:
        embedding_matrix: 2-D array of pretrained vectors; row i is the
            vector used for token id i.

    Returns:
        A compiled keras Model that takes [question1_ids, question2_ids],
        each with SEQ_LEN ids per sample.
    """
    input1 = Input(shape=(SEQ_LEN,), dtype="int32")
    input2 = Input(shape=(SEQ_LEN,), dtype="int32")

    # Pretrained vectors are loaded as initial weights and kept frozen.
    embedding_layer = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=SEQ_LEN,
        trainable=False
    )

    vector1 = embedding_layer(input1)
    vector2 = embedding_layer(input2)

    # Each LSTM instance is applied to both questions, so its weights are shared.
    lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
    layer1a = lstm_layer1(vector1)
    layer1a = Dropout(LSTM_DROP_RATE)(layer1a)
    layer1b = lstm_layer1(vector2)
    layer1b = Dropout(LSTM_DROP_RATE)(layer1b)

    lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
    layer2a = lstm_layer2(layer1a)
    layer2b = lstm_layer2(layer1b)
    # (Disabled experiment: concatenate the raw embedding of every time step
    # onto the LSTM output before the convolutions.)

    conv1a, conv1b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)
    conv2a, conv2b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)
    conv3a, conv3b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)
    conv4a, conv4b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)
    conv5a, conv5b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)
    conv6a, conv6b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)

    merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
    merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
    # Element-wise comparison features of the two question vectors.
    diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
    mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
    merge = concatenate([diff, mult])

    x = Dropout(DROP_RATE)(merge)
    x = BatchNormalization()(x)
    x = Dense(DENSE_SIZE1, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)
    x = Dense(DENSE_SIZE2, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[input1, input2], outputs=pred)
    model.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model


# Create the output folders up front so that hours of training are not lost to
# a missing directory when a checkpoint or the final CSV is written.
for out_dir in ("./log", "./result"):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

best_results = []  # test predictions from each fold's best (lowest val_loss) checkpoint
last_results = []  # test predictions from each fold's final-epoch weights

for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):  # word/char switch
    train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]  # word/char switch
    dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]  # word/char switch

    # Drop the previous fold's graph; otherwise all ten models pile up in one
    # backend session for the lifetime of the loop.
    K.clear_session()
    model = build_model(word_embedding_data)  # word/char switch

    # One timestamp per fold: it names this fold's checkpoints and is reused
    # below to find exactly those files again.
    fold_tag = datetime.now().strftime("%Y%m%d-%H%M%S")

    early_stopping = EarlyStopping("val_loss", patience=PATIENCE)
    check_point = ModelCheckpoint(
        "./log/%s.Multi_LSTM_CNN_v1.word.{epoch:03d}.hdf5" % (fold_tag,),  # word/char switch
        monitor="val_loss",
        save_best_only=True,
    )

    fit_res = model.fit(
        x=[train_x1, train_x2],
        y=train_y,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHES,
        validation_data=([dev_x1, dev_x2], dev_y),
        shuffle=True,
        callbacks=[early_stopping, check_point]
    )

    pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch
    last_results.append(pd.DataFrame(pred_last, columns=["y_pre"]))

    # glob() returns matches in arbitrary order and "./log/*.hdf5" would also
    # match checkpoints of other runs, so restrict the pattern to this fold and
    # sort. The epoch is zero padded and only improving epochs are saved, hence
    # the last name in sorted order is this fold's best checkpoint.
    fold_checkpoints = sorted(glob("./log/%s.Multi_LSTM_CNN_v1.word.*.hdf5" % (fold_tag,)))  # word/char switch
    best_checkpoint = fold_checkpoints[-1].replace("\\", "/")
    print("load model %s" % (best_checkpoint,))
    model.load_weights(best_checkpoint)
    pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch
    best_results.append(pd.DataFrame(pred_best, columns=["y_pre"]))

# Average the per-fold test predictions and write one submission file for the
# last-epoch weights and one for the best checkpoints.
pd.concat(last_results, axis=1).mean(axis=1).to_frame("y_pre").to_csv(
    "./result/%s-Multi_LSTM_CNN_v1_word_last.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
    index=False
)
pd.concat(best_results, axis=1).mean(axis=1).to_frame("y_pre").to_csv(
    "./result/%s-Multi_LSTM_CNN_v1_word_best.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
    index=False
)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 228946 samples, validate on 25440 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
load model ./log/20180707-200651.Multi_LSTM_CNN_v1.word.015.hdf5
Train on 228946 samples, validate on 25440 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
load model ./log/20180707-211124.Multi_LSTM_CNN_v1.word.018.hdf5
Train on 228947 samples, validate on 25439 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/3

Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
load model ./log/20180708-010007.Multi_LSTM_CNN_v1.word.015.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
load model ./log/20180708-020528.Multi_LSTM_CNN_v1.word.018.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
load model ./log/20180708-031925.Multi_LSTM_CNN_v1.word.019.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
load model ./log/20180708-043614.Multi_LSTM_CNN_v1.word.023.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
load model ./log/20180708-060114.Multi_LSTM_CNN_v1.word.015.hdf5
Train on 228948 samples, validate on 25438 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
load model ./log/20180708-070633.Multi_LSTM_CNN_v1.word.017.hdf5
