## 模型结构

- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)
- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中
- 卷积层为双层, 最后的池化层有Average, Max和Min三种
- 对于每个问题, 将所有卷积核结果并起来
- 将两个问题并起来的结果, 分别[相减并取绝对值], [逐元素相乘], 再将两者拼接后送入全连接层

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import shutil
import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime

In [None]:
# Create the two output directories independently of each other.
# With both mkdir calls inside one try/except, an already existing ./log/
# raised FileExistsError before ./result/ was ever attempted, so a re-run
# could be left without a result directory.
os.makedirs("./log/", exist_ok=True)
os.makedirs("./result/", exist_ok=True)

In [None]:
# All input files are expected in a single data directory.
DATA_PATH = "./data/"
TRAIN_PATH = os.path.join(DATA_PATH, "train.csv")
TEST_PATH = os.path.join(DATA_PATH, "test.csv")
WORD_EMBED_PATH = os.path.join(DATA_PATH, "word_embed.txt")
CHAR_EMBED_PATH = os.path.join(DATA_PATH, "char_embed.txt")
QUEST_PATH = os.path.join(DATA_PATH, "question.csv")

# Plain CSV tables: training pairs, test pairs and the question lookup table.
train_data, test_data, question_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, QUEST_PATH)
)

# Embedding files are space separated, have no header row, and their first
# column is used as the row index.
_embed_read_options = dict(delimiter=" ", header=None, index_col=0)
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, **_embed_read_options)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, **_embed_read_options)

# Turn the space separated token strings into lists of tokens.
for column in ("words", "chars"):
    question_data[column] = question_data[column].str.split(" ")

# Target vector for the training pairs.
label = train_data["label"].values

from keras.preprocessing.text import Tokenizer

# Passed to Tokenizer and used as the number of vocabulary entries whose
# vectors are copied into the embedding matrices.
MAX_COUNT = 10000


def _embedding_matrix(tokenizer, embedding_table):
    """Return a zero row stacked on the vectors of the first MAX_COUNT tokens.

    Tokens are taken in the order of ``tokenizer.word_index`` and looked up by
    label in ``embedding_table``; row 0 of the result is all zeros
    (presumably reserved for the padding id 0 - TODO confirm).
    """
    kept_tokens = list(tokenizer.word_index)[:MAX_COUNT]
    zero_row = np.zeros(shape=(1, embedding_table.shape[1]), dtype=np.float64)
    return np.vstack((zero_row, embedding_table.loc[kept_tokens].values))


# Word level vocabulary and embedding matrix.
word_tokenizer = Tokenizer(MAX_COUNT)
word_tokenizer.fit_on_texts(question_data["words"])
word_embedding_data = _embedding_matrix(word_tokenizer, word_embedding_data)

# Character level vocabulary and embedding matrix.
char_tokenizer = Tokenizer(MAX_COUNT)
char_tokenizer.fit_on_texts(question_data["chars"])
char_embedding_data = _embedding_matrix(char_tokenizer, char_embedding_data)

# Notebook cell output: shapes of both embedding matrices.
word_embedding_data.shape, char_embedding_data.shape

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Every question is padded / truncated to this many token ids.
SEQ_LEN = 25


def _gen_padded_pair(data, tokenizer, column):
    """Build the padded id matrices for both questions of every pair.

    Shared implementation of ``gen_word_data`` and ``gen_char_data``, which
    previously were two copies differing only in tokenizer and column name.

    Args:
        data: DataFrame with the question id columns ``q1`` and ``q2``.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        column: column of ``question_data`` holding the token lists
            (``"words"`` or ``"chars"``).

    Returns:
        Tuple ``(padded_q1, padded_q2)`` as returned by ``pad_sequences``
        with ``maxlen=SEQ_LEN``.
    """
    padded = []
    for id_column in ("q1", "q2"):
        # A left join keeps one row per pair, in the order of ``data``.
        tokens = data.merge(question_data, how="left", left_on=id_column, right_on="qid")[column]
        sequences = tokenizer.texts_to_sequences(tokens)
        # Pad and truncate at the front, so the end of a question is kept.
        padded.append(pad_sequences(sequences, maxlen=SEQ_LEN, padding="pre", truncating="pre"))
    return tuple(padded)


def gen_word_data(data):
    """Return the padded word-id matrices ``(q1, q2)`` for the pairs in ``data``."""
    return _gen_padded_pair(data, word_tokenizer, "words")


def gen_char_data(data):
    """Return the padded char-id matrices ``(q1, q2)`` for the pairs in ``data``."""
    return _gen_padded_pair(data, char_tokenizer, "chars")

# Padded id matrices for the training pairs, then for the test pairs
# (word level and char level each time).
(word1, word2), (char1, char2) = gen_word_data(train_data), gen_char_data(train_data)
(test_word1, test_word2), (test_char1, test_char2) = gen_word_data(test_data), gen_char_data(test_data)

# Notebook cell output: the shapes of all eight matrices.
tuple(
    matrix.shape
    for matrix in (word1, word2, test_word1, test_word2, char1, char2, test_char1, test_char2)
)

In [None]:
from keras.models import Model
from keras.layers.merge import concatenate
from keras.optimizers import Adam, SGD, Nadam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import LSTM, Bidirectional, TimeDistributed
from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation

In [None]:
# ---- training ----
NUM_EPOCHES = 50      # upper bound on epochs per fold
# Not referenced in the part of the file reviewed here.
# NOTE(review): EPOCHES2 carried a trailing "# 5" - presumably an earlier value.
EPOCHES1, EPOCHES2, EPOCHES3 = 5, 25, 22
BATCH_SIZE = 1024
DROP_RATE = 0.3       # dropout rate of the dense head

# ---- cnn: filter count of the branch with kernel size 1..6 ----
CONV_LEN1 = CONV_LEN2 = CONV_LEN3 = CONV_LEN4 = CONV_LEN5 = CONV_LEN6 = 128

# ---- lstm: units of the first / second LSTM layer ----
LSTM_SIZE1 = LSTM_SIZE2 = 256
LSTM_DROP_RATE = 0.3

# ---- dense ----
DENSE_INPUT = 300     # width of the time-distributed projection after the embedding
DENSE_SIZE1, DENSE_SIZE2 = 512, 256

In [None]:
def cnn_layer1(inputa, inputb, filters, kernel_size):
    """Convolve both inputs with one shared Conv1D and average-pool the results.

    The same ``Conv1D`` instance (``padding="same"``, ReLU) is applied to
    ``inputa`` and ``inputb``; each output then goes through its own
    ``GlobalAveragePooling1D``.

    Returns:
        Tuple ``(pooled_a, pooled_b)``.
    """
    shared_conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")
    pooled = []
    for branch_input in (inputa, inputb):
        feature_map = shared_conv(branch_input)
        pooled.append(GlobalAveragePooling1D()(feature_map))
    return tuple(pooled)
    
def cnn_layer2(inputa, inputb, filters, kernel_size):
    """Convolve both inputs with one shared Conv1D and max-pool the results.

    The same ``Conv1D`` instance (``padding="same"``, ReLU) is applied to
    ``inputa`` and ``inputb``; each output is reduced with a global max over
    the step axis.

    Returns:
        Tuple ``(pooled_a, pooled_b)``.
    """
    conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")
    # GlobalMaxPooling1D replaces MaxPool1D(pool_size=SEQ_LEN) + Flatten: that
    # pair is only a global max when the input has exactly SEQ_LEN steps, which
    # silently tied this helper to a module-level constant.
    conv_outputa = GlobalMaxPooling1D()(conv(inputa))
    conv_outputb = GlobalMaxPooling1D()(conv(inputb))
    return conv_outputa, conv_outputb

def cnn_layer3(inputa, inputb, filters, kernel_size):
    """Convolve both inputs with one shared Conv1D, then max- and average-pool.

    The same ``Conv1D`` instance (``padding="same"``, ReLU) is applied to
    ``inputa`` and ``inputb``. For each branch the global max and the global
    average are concatenated, in that order.

    Returns:
        Tuple ``(features_a, features_b)``.
    """
    conv = Conv1D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu")

    def _max_then_avg(tensor):
        feature_map = conv(tensor)
        return concatenate([
            # GlobalMaxPooling1D replaces MaxPool1D(pool_size=SEQ_LEN) + Flatten,
            # which is only a global max for inputs of exactly SEQ_LEN steps.
            GlobalMaxPooling1D()(feature_map),
            GlobalAveragePooling1D()(feature_map),
        ])

    return _max_then_avg(inputa), _max_then_avg(inputb)

def cnn_double_layer(inputa, inputb, filters, kernel_size):
    """Apply two stacked, shared convolutions to both inputs and pool three ways.

    Each branch runs Conv1D -> BatchNormalization -> ReLU twice, then
    concatenates the global average, global max and global min over axis 1,
    in that order.

    The two ``Conv1D`` layers are shared between ``inputa`` and ``inputb``.
    NOTE(review): the BatchNormalization layers are created per branch, so
    unlike the convolutions they are not shared - confirm this is intended.

    Returns:
        Tuple ``(output_a, output_b)``.
    """
    conv1 = Conv1D(filters=filters, kernel_size=kernel_size, padding="same")
    conv2 = Conv1D(filters=filters, kernel_size=kernel_size, padding="same")

    def _branch(tensor):
        hidden = tensor
        for conv in (conv1, conv2):
            hidden = conv(hidden)
            hidden = BatchNormalization()(hidden)
            hidden = Activation(activation="relu")(hidden)
        return concatenate([
            GlobalAveragePooling1D()(hidden),
            # GlobalMaxPooling1D replaces MaxPool1D(pool_size=SEQ_LEN) + Flatten,
            # which is only a global max for inputs of exactly SEQ_LEN steps.
            GlobalMaxPooling1D()(hidden),
            Lambda(lambda x: K.min(x, axis=1))(hidden),
        ])

    return _branch(inputa), _branch(inputb)


def sim_l1(v1, v2):
    """Return a Lambda layer output holding the L1 distance of ``v1`` and ``v2``.

    The absolute differences are summed over axis 1.
    """
    def _manhattan(pair):
        left, right = pair[0], pair[1]
        return K.sum(K.abs(left - right), axis=1)

    return Lambda(_manhattan)([v1, v2])

def sim_l2(v1, v2):
    """Return a Lambda layer output holding the L2 distance of ``v1`` and ``v2``.

    Squared differences are summed over axis 1. The sum is clamped to at
    least ``K.epsilon()`` before the square root: the derivative of sqrt at
    exactly 0 is infinite, so two identical vectors would otherwise produce
    NaN gradients during training.
    """
    def _euclidean(pair):
        squared = K.sum(K.square(pair[0] - pair[1]), axis=1)
        return K.sqrt(K.maximum(squared, K.epsilon()))

    return Lambda(_euclidean)([v1, v2])

def sim_cos(v1, v2):
    """Return a Lambda layer output holding the cosine similarity of ``v1`` and ``v2``.

    Dot product and norms are taken over axis 1. Each squared norm is clamped
    to at least ``K.epsilon()``, so an all-zero vector gives a similarity of 0
    instead of the NaN that 0 / 0 produced before.
    """
    def _cosine(pair):
        left, right = pair[0], pair[1]
        dot = K.sum(left * right, axis=1)
        norm_left = K.sqrt(K.maximum(K.sum(left * left, axis=1), K.epsilon()))
        norm_right = K.sqrt(K.maximum(K.sum(right * right, axis=1), K.epsilon()))
        return dot / (norm_left * norm_right)

    return Lambda(_cosine)([v1, v2])

def sim_vec(v1, v2):
    """Return the L1, L2 and cosine measures of ``v1`` and ``v2`` side by side.

    Each measure is reshaped to a single column and the three columns are
    concatenated along axis 1, in the order L1, L2, cosine.
    """
    def _as_column(x):
        return K.reshape(x, shape=(-1, 1))

    # Compute all measures first, then reshape them.
    measures = [measure(v1, v2) for measure in (sim_l1, sim_l2, sim_cos)]
    columns = []
    for value in measures:
        columns.append(Lambda(_as_column)(value))
    return concatenate(columns, axis=1)

def similarity_mpcnn(s1, s2):
    """Build similarity features between two lists of sequence tensors.

    Every tensor of ``s1`` / ``s2`` is reduced over axis 1 with average, max
    and min pooling. For each pooling type the pooled vectors of one side are
    stacked into a ``(-1, len(s1), CONV_LEN1)`` tensor. Two groups of
    ``sim_vec`` features are then computed:

    * per pooling type and per filter index ``f``: slice ``[:, :, f]`` of
      side 1 against the same slice of side 2;
    * per pooling type and per pair ``(k1, k2)``: slice ``[:, k1, :]`` of
      side 1 against slice ``[:, k2, :]`` of side 2.

    Args:
        s1: list of tensors for the first input - assumed to have
            ``CONV_LEN1`` channels each, as the reshape requires it
            (TODO confirm against the caller).
        s2: list of tensors for the second input, indexed in parallel with
            ``s1``.

    Returns:
        All features concatenated along axis 1.
    """
    def _three_pools(tensor):
        return [
            GlobalAveragePooling1D()(tensor),
            GlobalMaxPooling1D()(tensor),
            Lambda(lambda x: K.min(x, axis=1))(tensor),
        ]

    num_inputs = len(s1)
    out1, out2 = [], []
    for i in range(num_inputs):
        out1.append(_three_pools(s1[i]))
        out2.append(_three_pools(s2[i]))

    def _stack(pooled, p):
        # Give every pooled vector a length-1 middle axis and stack along it.
        return concatenate(
            [Lambda(lambda x: K.reshape(x, shape=(-1, 1, CONV_LEN1)))(t[p]) for t in pooled],
            axis=1,
        )

    output1 = [_stack(out1, p) for p in range(3)]  # one entry per pooling type
    output2 = [_stack(out2, p) for p in range(3)]

    # The loop variables are bound as lambda defaults (f=f, k=k1, ...). A plain
    # closure looks the variable up when the lambda runs, so any later
    # re-invocation of the Lambda layer would use the last loop value.
    fea_h = []
    for p in range(3):
        for f in range(CONV_LEN1):
            fea_h.append(sim_vec(
                Lambda(lambda x, f=f: x[:, :, f])(output1[p]),
                Lambda(lambda x, f=f: x[:, :, f])(output2[p]),
            ))

    fea_a = []
    for p in range(3):
        for k1 in range(num_inputs):
            for k2 in range(num_inputs):
                fea_a.append(sim_vec(
                    Lambda(lambda x, k=k1: x[:, k, :])(output1[p]),
                    Lambda(lambda x, k=k2: x[:, k, :])(output2[p]),
                ))

    fea = concatenate(fea_h + fea_a, axis=1)
    return fea

# WORDS

In [None]:
from sklearn.model_selection import StratifiedKFold

best_results = []     # per-fold test predictions made with the best checkpoint
last_results = []     # per-fold test predictions made with the final-epoch weights
best_file_names = []  # checkpoint file that was loaded for each fold

# 10-fold stratified cross validation; a fresh model is built and trained per fold.
for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):  # word/char switch
    print("fold {} start".format(i + 1))
    train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]  # word/char switch
    dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]  # word/char switch

    input1 = Input(shape=(SEQ_LEN,), dtype="int32")
    input2 = Input(shape=(SEQ_LEN,), dtype="int32")

    # Frozen pre-trained embeddings, shared by both questions.
    embedding_layer = Embedding(
        input_dim=word_embedding_data.shape[0],  # word/char switch
        output_dim=word_embedding_data.shape[1],  # word/char switch
        weights=[word_embedding_data],  # word/char switch
        input_length=SEQ_LEN,
        trainable=False
    )

    vector1 = embedding_layer(input1)
    vector2 = embedding_layer(input2)

    # Shared per-step projection; each branch gets its own BatchNormalization.
    input_layer = TimeDistributed(Dense(DENSE_INPUT))
    vector1 = input_layer(vector1)
    vector1 = BatchNormalization()(vector1)
    vector2 = input_layer(vector2)
    vector2 = BatchNormalization()(vector2)

    # Two stacked LSTMs shared by both branches; both return full sequences.
    lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
    layer1a = lstm_layer1(vector1)
    layer1a = Dropout(LSTM_DROP_RATE)(layer1a)
    layer1b = lstm_layer1(vector2)
    layer1b = Dropout(LSTM_DROP_RATE)(layer1b)
    lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
    layer2a = lstm_layer2(layer1a)
    layer2b = lstm_layer2(layer1b)

    # Six convolution branches with kernel sizes 1..6 on top of the LSTM output.
    conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)
    conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)
    conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)
    conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)
    conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)
    conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)

    # Per-question feature vector, then |a - b| and a * b as pair features.
    merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
    merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
    diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
    mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
    merge = concatenate([diff, mult])

    # Dense head ending in a single sigmoid unit.
    x = Dropout(DROP_RATE)(merge)
    x = BatchNormalization()(x)
    x = Dense(DENSE_SIZE1, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)
    x = Dense(DENSE_SIZE2, activation="relu")(x)
    x = Dropout(DROP_RATE)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[input1, input2], outputs=pred)
    model.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=["acc"]
    )

    early_stopping = EarlyStopping("val_loss", patience=8)
    # NOTE(review): if Nadam starts at Keras' default learning rate of 0.002
    # (TODO confirm), factor 0.5 reaches min_lr after a single reduction.
    lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.001)
    # One timestamped prefix per fold, so this fold's checkpoints can be told
    # apart from anything else in ./log/.
    checkpoint_prefix = "./log/%s.Multi_LSTM_CNN_v4.word." % (datetime.now().strftime("%Y%m%d-%H%M%S"))  # word/char switch
    check_point = ModelCheckpoint(
        checkpoint_prefix + "{epoch:03d}.hdf5",
        monitor="val_loss",
        save_best_only=True,
    )

    fit_res = model.fit(
        x=[train_x1, train_x2],
        y=train_y,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHES,
        validation_data=([dev_x1, dev_x2], dev_y),
        shuffle=True,
        callbacks=[early_stopping, lr_reducer, check_point]
    )

    # Predictions with the weights of the last trained epoch.
    pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch
    last_results.append(pd.DataFrame(pred_last, columns=["y_pre"]))

    # With save_best_only=True the highest saved epoch of this fold is the
    # best one. glob() returns names in arbitrary order, and "./log/*.hdf5"
    # also matched files of other folds and runs, so restrict the pattern to
    # this fold's prefix and sort (epochs are zero padded) before taking the
    # last entry.
    fold_checkpoints = sorted(glob(checkpoint_prefix + "*.hdf5"))
    best_model_file = fold_checkpoints[-1].replace("\\", "/")
    best_file_names.append(best_model_file)
    print("load model %s" % (best_model_file,))
    model.load_weights(best_model_file)
    pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch
    best_results.append(pd.DataFrame(pred_best, columns=["y_pre"]))

# Average the per-fold test predictions row-wise and write one CSV for the
# final-epoch weights and one for the best checkpoints. Each file gets its
# own timestamp.
for fold_frames, csv_pattern in (
    (last_results, "./result/%s-Multi_LSTM_CNN_v4_word_last.csv"),  # word/char switch
    (best_results, "./result/%s-Multi_LSTM_CNN_v4_word_best.csv"),  # word/char switch
):
    fold_mean = pd.concat(fold_frames, axis=1).mean(axis=1)
    averaged = pd.DataFrame(fold_mean, columns=["y_pre"])
    csv_path = csv_pattern % (datetime.now().strftime("%Y%m%d-%H%M%S"))
    averaged.to_csv(csv_path, index=False)

# Move the checkpoints that were loaded for each fold into a fresh,
# timestamped sub-directory of ./log/.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model_path = "".join(("./log/", run_stamp, "/"))
os.mkdir(model_path)
for model_name in best_file_names:
    abs_name = os.path.basename(model_name)  # file name without its directory
    os.rename(model_name, os.path.join(model_path, abs_name))

# CHARS

In [None]:
# from sklearn.model_selection import StratifiedKFold

# best_results = []
# last_results = []
# best_file_names = []

# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=char1, y=label)):  # word/char switch
#     print("fold {} start".format(i + 1))
#     train_x1, train_x2, train_y = char1[train_index, :], char2[train_index, :], label[train_index]  # word/char switch
#     dev_x1, dev_x2, dev_y = char1[dev_index, :], char2[dev_index, :], label[dev_index]  # word/char switch
    
#     input1 = Input(shape=(SEQ_LEN,), dtype="int32")
#     input2 = Input(shape=(SEQ_LEN,), dtype="int32")

#     embedding_layer = Embedding(
#         input_dim=char_embedding_data.shape[0],  # word/char switch
#         output_dim=char_embedding_data.shape[1],  # word/char switch
#         weights=[char_embedding_data],  # word/char switch
#         input_length=SEQ_LEN,
#         trainable=False
#     )
    
#     vector1 = embedding_layer(input1)
#     vector2 = embedding_layer(input2)
    
#     input_layer = TimeDistributed(Dense(DENSE_INPUT))
#     vector1 = input_layer(vector1)
#     vector1 = BatchNormalization()(vector1)
#     vector2 = input_layer(vector2)
#     vector2 = BatchNormalization()(vector2)
    
#     lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
#     layer1a = lstm_layer1(vector1)
#     layer1a = Dropout(LSTM_DROP_RATE)(layer1a)
#     layer1b = lstm_layer1(vector2)
#     layer1b = Dropout(LSTM_DROP_RATE)(layer1b)
#     lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)
#     layer2a = lstm_layer2(layer1a)
#     layer2b = lstm_layer2(layer1b)
    
#     conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)
#     conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)
#     conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)
#     conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)
#     conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)
#     conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)

#     merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])
#     merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])
#     diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])
#     mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])
#     merge = concatenate([diff, mult])
    
#     x = Dropout(DROP_RATE)(merge)
#     x = BatchNormalization()(x)
#     x = Dense(DENSE_SIZE1, activation="relu")(x)
#     x = Dropout(DROP_RATE)(x)
#     x = BatchNormalization()(x)
#     x = Dense(DENSE_SIZE2, activation="relu")(x)
#     x = Dropout(DROP_RATE)(x)
#     x = BatchNormalization()(x)
#     pred = Dense(1, activation="sigmoid")(x)
    
#     model = Model(inputs=[input1, input2], outputs=pred)
#     model.compile(
#         optimizer="nadam",
#         loss="binary_crossentropy",
#         metrics=["acc"]
#     )
    
#     early_stopping = EarlyStopping("val_loss", patience=8)
#     lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.001)
#     check_point = ModelCheckpoint(
#         "./log/%s.Multi_LSTM_CNN_v4.char.{epoch:03d}.hdf5" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
#         monitor="val_loss",
#         save_best_only=True,
#     )
    
#     fit_res = model.fit(
#         x=[train_x1, train_x2],
#         y=train_y,
#         batch_size=BATCH_SIZE,
#         epochs=NUM_EPOCHES,
#         validation_data=([dev_x1, dev_x2], dev_y),
#         shuffle=True,
#         callbacks=[early_stopping, lr_reducer, check_point]
#     )
    
#     pred_last = model.predict([test_char1, test_char2], batch_size=BATCH_SIZE)  # word/char switch
#     last_results.append(pd.DataFrame(pred_last, columns=["y_pre"]))
    
#     best_model_file = glob("./log/*.hdf5")[-1].replace("\\", "/")
#     best_file_names.append(best_model_file)
#     print("load model %s" % (best_model_file,))
#     model.load_weights(best_model_file)
#     pred_best = model.predict([test_char1, test_char2], batch_size=BATCH_SIZE)  # word/char switch
#     best_results.append(pd.DataFrame(pred_best, columns=["y_pre"]))

# pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=["y_pre"]).to_csv(
#     "./result/%s-Multi_LSTM_CNN_v4_char_last.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
#     index=False
# )
# pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=["y_pre"]).to_csv(
#     "./result/%s-Multi_LSTM_CNN_v4_char_best.csv" % (datetime.now().strftime("%Y%m%d-%H%M%S")),  # word/char switch
#     index=False
# )

# model_path = "./log/" + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
# os.mkdir(model_path)
# for model_name in best_file_names:
#     abs_name = os.path.split(model_name)[1]
#     os.rename(model_name, model_path + abs_name)