In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from glob import glob

In [2]:
# Locations of all input files, relative to the notebook's working directory.
# DATA_PATH keeps its trailing slash so file names can be appended directly.
DATA_PATH = "./data/"

# Labelled / unlabelled question pairs.
TRAIN_PATH = f"{DATA_PATH}train.csv"
TEST_PATH = f"{DATA_PATH}test.csv"

# Space-delimited embedding tables (token in the first column).
WORD_EMBED_PATH = f"{DATA_PATH}word_embed.txt"
CHAR_EMBED_PATH = f"{DATA_PATH}char_embed.txt"

# Question table that is joined to the pairs by question id.
QUEST_PATH = f"{DATA_PATH}question.csv"

In [3]:
# Question-pair tables (train and test are read the same way).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Embedding files are space-delimited with no header row; the first column
# (the token) becomes the index so vectors can be looked up by token.
embedding_read_options = dict(sep=" ", header=None, index_col=0)
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, **embedding_read_options)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, **embedding_read_options)

# Question table: turn the space-separated "words" / "chars" strings into token lists.
question_data = pd.read_csv(QUEST_PATH)
for token_column in ("words", "chars"):
    question_data[token_column] = question_data[token_column].str.split(" ")

In [4]:
from keras.preprocessing.text import Tokenizer

# Number of most-frequent words that get their own row in the embedding matrix.
MAX_WORD_NUMS = 10000

# Keras' Tokenizer only emits word indices strictly below `num_words`, i.e. it
# keeps the `num_words - 1` most frequent words (index 0 is never assigned to a
# word).  Passing MAX_WORD_NUMS + 1 keeps exactly MAX_WORD_NUMS words
# (indices 1..MAX_WORD_NUMS), matching the embedding matrix built from the
# first MAX_WORD_NUMS vocabulary entries plus one all-zero row for index 0.
# NOTE(review): Tokenizer defaults to lower=True; depending on the Keras version
# this may lowercase pre-split tokens -- confirm they still match the embedding index.
word_tokenizer = Tokenizer(num_words=MAX_WORD_NUMS + 1)

# `question_data["words"]` holds one list of tokens per question.
word_tokenizer.fit_on_texts(question_data["words"])

Using TensorFlow backend.


In [5]:
# Build the embedding weight matrix: row i holds the pre-trained vector of the
# word whose tokenizer index is i; row 0 is all zeros (index 0 is used for padding).
#
# NOTE: this cell rebinds `word_embedding_data` from the loaded DataFrame to a
# NumPy array, so the loading cell must be re-run before running this one again.

# Order the vocabulary explicitly by tokenizer index instead of relying on the
# iteration order of the `word_index` dict.
ranked_words = sorted(
    word_tokenizer.word_index, key=word_tokenizer.word_index.get
)[:MAX_WORD_NUMS]

# `reindex` needs a unique index; keep the first vector of any duplicated token.
unique_vectors = word_embedding_data[~word_embedding_data.index.duplicated(keep="first")]

# Words without a pre-trained vector get a zero vector instead of raising a
# KeyError (recent pandas) or silently producing NaN rows (older pandas).
missing_count = int((~pd.Index(ranked_words).isin(unique_vectors.index)).sum())
if missing_count:
    print("%d of %d vocabulary words have no pre-trained vector; using zeros"
          % (missing_count, len(ranked_words)))

word_embedding_data = np.concatenate(
    (
        np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),
        unique_vectors.reindex(ranked_words).fillna(0.0).values
    ),
    axis=0
)
word_embedding_data.shape

(10001, 300)

In [6]:
from keras.preprocessing.sequence import pad_sequences

# Every question is padded / truncated to this many word indices.
WORD_SEQ_LEN = 30

def gen_data(data):
    """Encode both questions of every pair in `data` as fixed-length index arrays.

    `data` must have "q1" and "q2" columns whose values are looked up against
    `question_data["qid"]` with a left merge.  The matched "words" token lists
    are converted to indices by `word_tokenizer` and then padded / truncated at
    the front to WORD_SEQ_LEN entries.

    Returns a tuple `(q1_sequences, q2_sequences)` as produced by `pad_sequences`.
    """
    def _padded_word_ids(qid_column):
        # Left merge keeps the rows of `data` and attaches each question's tokens.
        joined = data.merge(question_data, how="left", left_on=qid_column, right_on="qid")
        word_ids = word_tokenizer.texts_to_sequences(joined["words"])
        return pad_sequences(word_ids, maxlen=WORD_SEQ_LEN, padding="pre", truncating="pre")

    first_question = _padded_word_ids("q1")
    second_question = _padded_word_ids("q2")
    return first_question, second_question

In [7]:
# Encode the question pairs of the training and test tables as padded index arrays.
(word1, word2), (test_word1, test_word2) = gen_data(train_data), gen_data(test_data)

In [8]:
from keras.layers import Input, Embedding, LSTM, Dropout, BatchNormalization, Dense
from keras.layers.merge import concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [9]:
import os

from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------
lstm_size = 256
drop_rate = 0.5
dense_size = 300
num_epoch = 100
batch_size = 2048
split_seed = 42  # fixed so the train/dev split is reproducible across runs

# Make sure the output locations used below exist before training starts.
os.makedirs("./log", exist_ok=True)
os.makedirs("./result", exist_ok=True)

# ---------------------------------------------------------------------------
# Hold out 20% of the labelled pairs as a dev set
# ---------------------------------------------------------------------------
train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(
    word1, word2, train_data["label"].values,
    test_size=0.2,
    random_state=split_seed
)

print(train_word1.shape, dev_word1.shape)

# ---------------------------------------------------------------------------
# Model: the same embedding and LSTM layers encode both questions
# ---------------------------------------------------------------------------
word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

# Frozen embedding initialised from the pre-built weight matrix.
embedding_layer = Embedding(
    input_dim=word_embedding_data.shape[0],
    output_dim=word_embedding_data.shape[1],
    weights=[word_embedding_data],
    input_length=WORD_SEQ_LEN,
    trainable=False
)

word_vector1 = embedding_layer(word_input1)
word_vector2 = embedding_layer(word_input2)

# The LSTM outputs get their own names: binding them to `word1` / `word2` would
# overwrite the padded input arrays and break any re-run of this cell.
lstm_layer = LSTM(lstm_size, dropout=drop_rate, recurrent_dropout=drop_rate)
word_output1 = lstm_layer(word_vector1)
word_output2 = lstm_layer(word_vector2)
word = concatenate([word_output1, word_output2])

x = Dropout(drop_rate)(word)
x = BatchNormalization()(x)

x = Dense(dense_size, activation="relu")(x)
x = Dropout(drop_rate)(x)
x = BatchNormalization()(x)

# Single sigmoid unit: one probability per question pair.
pred = Dense(1, activation="sigmoid")(x)

model = Model(inputs=[word_input1, word_input2], outputs=pred)
model.compile(
    optimizer=Adam(lr=0.001),
    loss="binary_crossentropy",
    metrics=["acc"]
)

# ---------------------------------------------------------------------------
# Training callbacks, all driven by the dev-set loss
# ---------------------------------------------------------------------------
lr_reducer = ReduceLROnPlateau(monitor='val_loss', mode="min", factor=0.33, patience=3, min_lr=0.0001)
early_stop = EarlyStopping("val_loss", patience=6)
# save_best_only=True: a file is written only when val_loss improves.
check_point = ModelCheckpoint("./log/lstm.{epoch:02d}_{val_loss:.3f}.hdf5", monitor="val_loss", save_best_only=True, save_weights_only=True)

train_res = model.fit(
    x=[train_word1, train_word2],
    y=train_y,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_data=([dev_word1, dev_word2], dev_y),
    shuffle=True,
    callbacks=[lr_reducer, early_stop, check_point]
)

# ---------------------------------------------------------------------------
# Restore the best weights and predict on the test pairs
# ---------------------------------------------------------------------------
# glob() returns files in arbitrary order, so `[-1]` is not guaranteed to be the
# best checkpoint.  Because checkpoints are only written on improvement, the most
# recently modified `lstm.*` file is the best one of the run that just finished.
checkpoint_paths = glob("./log/lstm.*.hdf5")
if not checkpoint_paths:
    raise RuntimeError("no checkpoint matching ./log/lstm.*.hdf5 was found")
best_checkpoint = max(checkpoint_paths, key=os.path.getmtime).replace("\\", "/")

print("load model %s" % (best_checkpoint,))
model.load_weights(best_checkpoint)

test_pred = model.predict([test_word1, test_word2], batch_size=batch_size)
pd.DataFrame(test_pred, columns=["y_pre"]).to_csv("./result/pred.csv", index=False)

Train on 203508 samples, validate on 50878 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
 36864/203508 [====>.........................] - ETA: 26s - loss: 0.2563 - acc: 0.8888

KeyboardInterrupt: 

In [None]:
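# NOTE: disabled experiment, kept for reference only -- a 10-fold stratified
# cross-validation variant of the model above that averages the per-fold test predictions.
# NOTE(review): `glob("./log/*.hdf5")[-1]` below does not reliably select the current
# fold's best checkpoint (glob order is arbitrary); fix before re-enabling.
# from keras.layers import Input, Embedding, LSTM, Dropout, BatchNormalization, Dense, Lambda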
# from keras.layers.merge import concatenate
# from keras.models import Model
# from keras.optimizers import Adam
# from keras.callbacks import EarlyStopping, ModelCheckpoint

# from sklearn.model_selection import StratifiedKFold

# lstm_size = 256
# drop_rate = 0.3
# dense_size = 300
# num_epoch = 100
# batch_size = 2048

# label = train_data["label"].values

# preds = []

# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):
#     train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]
#     dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]
    
#     word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")
#     word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype="int32")

#     embedding_layer = Embedding(
#         input_dim=word_embedding_data.shape[0],
#         output_dim=word_embedding_data.shape[1],
#         weights=[word_embedding_data],
#         input_length=WORD_SEQ_LEN,
#         trainable=False
#     )

#     word_vector1 = embedding_layer(word_input1)
#     word_vector2 = embedding_layer(word_input2)

#     lstm_layer = LSTM(lstm_size, dropout=drop_rate, recurrent_dropout=drop_rate)
#     word_output1 = lstm_layer(word_vector1)
#     word_output2 = lstm_layer(word_vector2)
#     word = concatenate([word_output1, word_output2])
    
# #     diff = Lambda(lambda x: x[0] - x[1], output_shape=(lstm_size,))([word_output1, word_output2])
# #     mult = Lambda(lambda x: x[0] * x[1], output_shape=(lstm_size,))([word_output1, word_output2])
# #     word = concatenate([diff, mult])

#     x = Dropout(drop_rate)(word)
#     x = BatchNormalization()(x)

#     x = Dense(dense_size, activation="relu")(x)
#     x = Dropout(drop_rate)(x)
#     x = BatchNormalization()(x)

#     pred = Dense(1, activation="sigmoid")(x)

#     model = Model(inputs=[word_input1, word_input2], outputs=pred)
#     model.compile(optimizer="nadam", loss="binary_crossentropy", metrics=["acc"])

#     early_stop = EarlyStopping("val_loss", patience=10)
#     check_point = ModelCheckpoint(
#         "./log/lstm_%02d.{epoch:02d}_{val_loss:.3f}.hdf5" % (i + 1),
#         monitor="val_loss",
#         save_best_only=True,
#         save_weights_only=True
#     )

#     train_res = model.fit(
#         x=[train_word1, train_word2],
#         y=train_y,
#         batch_size=batch_size,
#         epochs=num_epoch,
#         validation_data=([dev_word1, dev_word2], dev_y),
#         shuffle=True,
#         callbacks=[early_stop, check_point]
#     )
    
#     print("load model %s" % (glob("./log/*.hdf5")[-1].replace("\\", "/"),))
#     model.load_weights(glob("./log/*.hdf5")[-1].replace("\\", "/"))

#     test_pred = model.predict([test_word1, test_word2], batch_size=batch_size)
#     preds.append(pd.DataFrame(test_pred, columns=["y_pre"]))

# pd.DataFrame(pd.concat(preds, axis=1).mean(axis=1), columns=["y_pre"]).to_csv("./result/pred.csv", index=False)