In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
DATA_PATH = "./data/"
TRAIN_PATH = DATA_PATH + "train.csv"
TEST_PATH = DATA_PATH + "test.csv"
WORD_EMBED_PATH = DATA_PATH + "word_embed.txt"
CHAR_EMBED_PATH = DATA_PATH + "char_embed.txt"
QUEST_PATH = DATA_PATH + "question.csv"

SEED = 2018

In [3]:
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
question_data = pd.read_csv(QUEST_PATH)
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=" ", header=None, index_col=0)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=" ", header=None, index_col=0)

question_data["words"] = question_data["words"].str.split(" ")
question_data["chars"] = question_data["chars"].str.split(" ")

In [4]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(question_data["words"])
word_index = word_tokenizer.word_index

In [5]:
WORD_SEQ_LEN = 8

word_texts_q1 = train_data.merge(question_data, how="left", left_on="q1", right_on="qid")["words"]
word_texts_q2 = train_data.merge(question_data, how="left", left_on="q2", right_on="qid")["words"]

word_seq_q1 = word_tokenizer.texts_to_sequences(word_texts_q1)
pad_word_seq_q1 = pad_sequences(word_seq_q1, maxlen=WORD_SEQ_LEN)
word_seq_q2 = word_tokenizer.texts_to_sequences(word_texts_q2)
pad_word_seq_q2 = pad_sequences(word_seq_q2, maxlen=WORD_SEQ_LEN)

In [6]:
label = train_data["label"].values
train_seq1, dev_seq1, train_seq2, dev_seq2, train_y, dev_y = train_test_split(pad_word_seq_q1, pad_word_seq_q2, label, test_size=0.05)

In [7]:
word_embedding_data = word_embedding_data.reindex(list(word_index.keys())).values
word_embedding_data = np.concatenate((np.zeros((1, word_embedding_data.shape[1]), dtype=np.float32), word_embedding_data), axis=0)

In [None]:
KEEP_PROB = 0.8
NUM_UNITS = 256
DENSE_SIZE = 128
INIT_STD = 0.1
INIT_LEARN = 0.5

NUM_EPOCH = 3
BATCH_SIZE = 128

num_train = len(train_seq1)
num_batches = num_train // BATCH_SIZE

graph = tf.Graph()

with graph.as_default():
    with tf.variable_scope("input_layer"):
        word1 = tf.placeholder(shape=(None, WORD_SEQ_LEN), dtype=tf.int64, name="word1_input")
        word2 = tf.placeholder(shape=(None, WORD_SEQ_LEN), dtype=tf.int64, name="word2_input")
        y = tf.placeholder(shape=(None,), dtype=tf.int64, name="true_label")
        y_ = tf.cast(y, dtype=tf.float64)
        
    with tf.variable_scope("embedding_layer"):
        word_embedding = tf.Variable(word_embedding_data, trainable=False, name="word_embedding")
        word_vector1 = tf.nn.embedding_lookup(word_embedding, word1, name="embedding_looking")
        word_vector2 = tf.nn.embedding_lookup(word_embedding, word2, name="embedding_looking")
    
    with tf.variable_scope("lstm_layer"):
        cell = tf.contrib.rnn.BasicLSTMCell(num_units=NUM_UNITS, name="word_lstm")
        lstm_output, _ = tf.nn.dynamic_rnn(cell, word_vector1, dtype=tf.float64)
        final_output = lstm_output[:, -1, :]
    
    with tf.variable_scope("output_layer"):
        input_norm = tf.layers.batch_normalization(final_output, axis=1, training=True)
        dense = tf.layers.dense(input_norm, DENSE_SIZE, activation=tf.nn.relu,
                                 kernel_initializer=tf.random_normal_initializer(stddev=INIT_STD),
                                 bias_initializer=tf.zeros_initializer())
        dense_norm = tf.layers.batch_normalization(dense, axis=1, training=True)
        pred = tf.layers.dense(dense_norm, 1, activation=tf.nn.sigmoid,
                               kernel_initializer=tf.random_normal_initializer(stddev=INIT_STD),
                               bias_initializer=tf.zeros_initializer())
    
    with tf.variable_scope("loss"):
        loss = -tf.reduce_mean(y_ * tf.log(pred) + (1 - y_) * tf.log(1 - pred))
    
    with tf.variable_scope("train"):
        optimizer = tf.train.GradientDescentOptimizer(INIT_LEARN)
        grads, variables = zip(*optimizer.compute_gradients(loss, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        
        global_step = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step")
        
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        print(update_ops)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(zip(grads, variables), global_step=global_step)
        
with tf.Session(graph=graph) as sess:
    sess.run([tf.global_variables_initializer()])
    
    for epoch in range(NUM_EPOCH):
        epoch_index = np.random.permutation(num_train)
        for i in range(num_batches):
            batch_index = epoch_index[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
            data_w1 = train_seq1[batch_index, :]
            data_w2 = train_seq2[batch_index, :]
            data_y = train_y[batch_index]
            
            train_feed = {word1: data_w1, word2: data_w2, y: data_y}
            _, train_loss, cur_step = sess.run([train_op, loss, global_step], feed_dict=train_feed)
            dev_feed = {word1: dev_seq1, word2: dev_seq2, y: dev_y}
            dev_loss = sess.run(loss, feed_dict=dev_feed)
            
            print(f"[epoch {epoch + 1} step {cur_step}] train loss: {train_loss}, dev loss: {dev_loss}")

[<tf.Tensor 'output_layer/batch_normalization/AssignMovingAvg:0' shape=(256,) dtype=float64_ref>, <tf.Tensor 'output_layer/batch_normalization/AssignMovingAvg_1:0' shape=(256,) dtype=float64_ref>, <tf.Tensor 'output_layer/batch_normalization_1/AssignMovingAvg:0' shape=(128,) dtype=float64_ref>, <tf.Tensor 'output_layer/batch_normalization_1/AssignMovingAvg_1:0' shape=(128,) dtype=float64_ref>]
[epoch 1 step 1] train loss: 0.8574914609992914, dev loss: 1.0217921001652481
[epoch 1 step 2] train loss: 0.9848272175404253, dev loss: 0.8940810427182642
[epoch 1 step 3] train loss: 0.8662045926183246, dev loss: 0.7703268891158105
[epoch 1 step 4] train loss: 0.7687002173375244, dev loss: 0.7474213034287281
[epoch 1 step 5] train loss: 0.746488186044315, dev loss: 0.7216810961452014
[epoch 1 step 6] train loss: 0.7255934421976289, dev loss: 0.7185147816864392
