In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
# Input file locations, all built by string concatenation from the data directory.
DATA_PATH = "./data/"
TRAIN_PATH = DATA_PATH + "train.csv"  # question pairs (q1, q2) with a "label" column
TEST_PATH = DATA_PATH + "test.csv"  # question pairs (q1, q2) to convert the same way
WORD_EMBED_PATH = DATA_PATH + "word_embed.txt"  # space-delimited word vectors, token in column 0
CHAR_EMBED_PATH = DATA_PATH + "char_embed.txt"  # space-delimited char vectors, token in column 0
QUEST_PATH = DATA_PATH + "question.csv"  # per-question "qid", "words" and "chars" columns

# Seed constant intended for the notebook's random operations.
SEED = 2018

In [3]:
# Load the pair tables and the question table with pandas' default CSV settings.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
question_data = pd.read_csv(QUEST_PATH)
# The embedding files have no header row and are space-delimited; column 0 (the token)
# becomes the index so rows can later be looked up / reindexed by token.
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=" ", header=None, index_col=0)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=" ", header=None, index_col=0)

In [4]:
# Tokenise the space-separated "words" and "chars" columns into Python lists.
# The isinstance guard keeps this cell idempotent: on a second run the cells already
# hold lists and are left untouched (a plain `.str.split` would turn them into NaN).
# Missing values (NaN) pass through unchanged, as they did with `.str.split`.
for _col in ("words", "chars"):
    question_data[_col] = question_data[_col].map(
        lambda cell: cell.split(" ") if isinstance(cell, str) else cell
    )

In [5]:
from gensim.corpora import Dictionary

# Build one gensim Dictionary (token <-> integer id mapping) per tokenisation level
# from the tokenised question columns.
word_dict = Dictionary(question_data["words"])
char_dict = Dictionary(question_data["chars"])

# Vocabulary sizes, measured as the number of entries in each dictionary's `dfs` table;
# the trailing bare expression displays them as the cell output.
num_words, num_chars = len(word_dict.dfs), len(char_dict.dfs)
num_words, num_chars

(20890, 3048)

In [6]:
def adjust_embedding(embedding, corp_dict):
    """Return the embedding rows reordered to follow the ids in ``corp_dict``.

    Args:
        embedding: DataFrame indexed by token, one embedding vector per row.
        corp_dict: object exposing a ``token2id`` mapping of token -> integer id.

    Returns:
        A 2-D array whose row ``i`` is the vector of the token with id ``i``.
        Tokens absent from ``embedding`` produce an all-NaN row (``reindex``
        semantics); tokens absent from ``corp_dict`` are dropped.
    """
    token_ids = corp_dict.token2id
    # Sort the tokens by their assigned id so that row position == token id.
    ordered_tokens = sorted(token_ids, key=token_ids.get)
    return embedding.reindex(ordered_tokens).values

In [7]:
# Reorder both embedding tables into vocabulary-id order (row i <-> token id i).
# Note: adjust_embedding returns `.values`, so these names are rebound from DataFrames to
# plain arrays; re-running this cell requires reloading the embeddings first.
word_embedding_data = adjust_embedding(word_embedding_data, word_dict)
char_embedding_data = adjust_embedding(char_embedding_data, char_dict)

In [8]:
def pad_embedding(embedding, method="mean"):
    """Append one extra row to a 2-D embedding matrix.

    Args:
        embedding: 2-D array of shape (num_tokens, dim).
        method: ``"mean"`` appends the column-wise mean of ``embedding``;
            any other value appends an all-zero row.

    Returns:
        A new array of shape (num_tokens + 1, dim); the input is not modified.
    """
    width = embedding.shape[1]
    if method == "mean":
        extra_row = embedding.mean(axis=0)
    else:
        extra_row = np.zeros(width, dtype=np.float32)
    # Turn the 1-D vector into a single-row matrix so it can be stacked underneath.
    extra_row = extra_row.reshape((1, -1))
    return np.concatenate([embedding, extra_row], axis=0)

In [9]:
# Append one all-zero row (method="zero") to each embedding matrix and print the shapes
# before and after. The new row sits at index == vocabulary size, which is the value
# padding_all passes as the pad token id.
# Note: every re-run of this cell appends another row to the already-padded matrices.
print(f"former word embedding matrix shape: {word_embedding_data.shape}")
word_embedding_data = pad_embedding(word_embedding_data, method="zero")
print(f"final word embedding matrix shape: {word_embedding_data.shape}")

print(f"former char embedding matrix shape: {char_embedding_data.shape}")
char_embedding_data = pad_embedding(char_embedding_data, method="zero")
print(f"final char embedding matrix shape: {char_embedding_data.shape}")

former word embedding matrix shape: (20890, 300)
final word embedding matrix shape: (20891, 300)
former char embedding matrix shape: (3048, 300)
final char embedding matrix shape: (3049, 300)


In [10]:
# Tokens whose `dfs` count reaches T_STOP are treated as stop tokens;
# USE_STOP is the default for whether they are filtered out during encoding.
T_STOP = 100000
USE_STOP = True

# (token, dfs count) pairs for each vocabulary, most frequent first.
word_count = sorted(
    ((token, word_dict.dfs[token_id]) for token, token_id in word_dict.token2id.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
char_count = sorted(
    ((token, char_dict.dfs[token_id]) for token, token_id in char_dict.token2id.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Keep only the tokens at or above the threshold, preserving the frequency ordering.
word_stop = [token for token, freq in word_count if freq >= T_STOP]
char_stop = [token for token, freq in char_count if freq >= T_STOP]
word_stop, char_stop

(['W17378', 'W19355', 'W16319', 'W18238', 'W18103'],
 ['L0104',
  'L2214',
  'L1861',
  'L2582',
  'L3019',
  'L0143',
  'L2218',
  'L1132',
  'L1128',
  'L0362',
  'L1187'])

In [11]:
def pair2vec(pair, question, use_stop=USE_STOP):
    """Encode both questions of every pair as word-id and char-id sequences.

    Args:
        pair: DataFrame with question-id columns ``q1`` and ``q2``.
        question: DataFrame with a ``qid`` column plus tokenised ``words`` and
            ``chars`` columns.
        use_stop: when truthy, stop tokens are dropped from the sequences.

    Returns:
        A 4-tuple ``(word1, word2, char1, char2)`` of lists of id lists, in the
        order in which callers unpack it.
    """
    word1, char1 = ques2vec(pair[["q1"]].rename(columns={"q1": "qid"}), question, use_stop)
    word2, char2 = ques2vec(pair[["q2"]].rename(columns={"q2": "qid"}), question, use_stop)
    # Concatenating the two (words, chars) tuples would yield
    # (word1, char1, word2, char2); callers unpack word1, word2, char1, char2,
    # so return the values explicitly in that order.
    return word1, word2, char1, char2
    
def ques2vec(ques, question, use_stop=True):
    """Look up each question id and encode its words and chars as id sequences.

    Args:
        ques: DataFrame with a ``qid`` column.
        question: DataFrame with ``qid``, ``words`` and ``chars`` columns.
        use_stop: forwarded to ``seq2vec`` to control stop-token filtering.

    Returns:
        A ``(word_ids, char_ids)`` tuple of lists of id lists.
    """
    # Left join so every row of `ques` is kept.
    merged = ques.merge(question, on="qid", how="left")
    word_ids = seq2vec(merged, "words", use_stop)
    char_ids = seq2vec(merged, "chars", use_stop)
    return word_ids, char_ids
    
def seq2vec(ques, col, use_stop=True):
    """Map every token sequence in ``ques[col]`` to a list of integer ids.

    Args:
        ques: DataFrame whose ``col`` column holds iterables of tokens.
        col: ``"words"`` selects the word vocabulary and word stop list; any
            other value selects the char vocabulary and char stop list.
        use_stop: when truthy, tokens found in the stop list are skipped.

    Returns:
        A list with one list of ids per row of ``ques``.

    Raises:
        KeyError: if a kept token is missing from the selected vocabulary.
    """
    sequences = ques[col]
    if col == "words":
        vocab, stop_tokens = word_dict.token2id, word_stop
    else:
        vocab, stop_tokens = char_dict.token2id, char_stop

    encoded = []
    for tokens in sequences:
        if use_stop:
            tokens = [token for token in tokens if token not in stop_tokens]
        encoded.append([vocab[token] for token in tokens])
    return encoded

In [12]:
# Encode every question pair of the train and test tables as token-id lists
# (one list per question, at word level and at char level).
train_word1, train_word2, train_char1, train_char2 = pair2vec(train_data, question_data)
test_word1, test_word2, test_char1, test_char2 = pair2vec(test_data, question_data)

In [None]:
# Target sequence lengths (in tokens) that padding_all passes to sample_padding
# for word-id and char-id sequences respectively.
T_WORD = 6
T_CHAR = 10

def padding(text, length, pad_index, cut_method):
    """Pad or truncate one token-id list to ``length`` entries.

    Args:
        text: a one-element container; ``text[0]`` is the list of ids to process.
        length: target number of ids.
        pad_index: id appended to sequences shorter than ``length``.
        cut_method: ``"direct"`` truncates over-long sequences to their first
            ``length`` ids; any other value leaves them unchanged.

    Returns:
        The padded / truncated list (the original list object when no change
        is needed).
    """
    tokens = text[0]
    missing = length - len(tokens)
    if missing > 0:
        return tokens + [pad_index] * missing
    if missing < 0 and cut_method == "direct":
        return tokens[:length]
    return tokens

def sample_padding(texts, length, pad_index, cut_method="direct"):
    """Pad / truncate every id sequence in ``texts`` and stack the results.

    Args:
        texts: iterable of token-id sequences (one per sample).
        length: target length of each sequence.
        pad_index: id used to fill sequences shorter than ``length``.
        cut_method: forwarded to ``padding``; ``"direct"`` truncates over-long
            sequences, any other value leaves them uncut (rows may then have
            different lengths and cannot form a rectangular array).

    Returns:
        With ``cut_method="direct"``, an integer array of shape
        ``(len(texts), length)``; an empty ``(0, length)`` array for no samples.
    """
    # Build the rows directly instead of going through np.array(texts).reshape(-1, 1):
    # that conversion only works for ragged input (object array) and silently turns
    # equal-length input into a 2-D int array, handing padding() single ints.
    # padding() unwraps its first argument with text[0], hence the one-element wrapper.
    rows = [padding([list(text)], length, pad_index, cut_method) for text in texts]
    if not rows:
        return np.empty((0, length), dtype=np.int64)
    return np.array(rows)

def padding_all(all_words, all_chars):
    """Pad every word-level and char-level sequence collection to its fixed length.

    Args:
        all_words: list of word-id sequence collections, each padded to ``T_WORD``.
        all_chars: list of char-id sequence collections, each padded to ``T_CHAR``.

    Returns:
        A tuple holding the padded word collections followed by the padded char
        collections, in input order.
    """
    # The pad id is the vocabulary size, i.e. one past the largest token id
    # (presumably the index of the extra row appended to each embedding matrix).
    word_pad_id = len(word_dict.dfs)
    char_pad_id = len(char_dict.dfs)
    padded = [sample_padding(seqs, T_WORD, word_pad_id) for seqs in all_words]
    padded.extend(sample_padding(seqs, T_CHAR, char_pad_id) for seqs in all_chars)
    return tuple(padded)

In [None]:
# Pad / truncate all train and test sequences to T_WORD / T_CHAR tokens.
# Note: the same names are rebound from lists of id lists to the padded results.
train_word1, train_word2, train_char1, train_char2 = padding_all([train_word1, train_word2], [train_char1, train_char2])
test_word1, test_word2, test_char1, test_char2 = padding_all([test_word1, test_word2], [test_char1, test_char2])

In [None]:
# Hold out 5% of the training pairs as a dev set. All five inputs are split with the same
# row permutation; random_state=SEED makes the split reproducible across kernel restarts
# (without it every run produced a different train/dev partition).
train_word1, dev_word1, \
train_word2, dev_word2, \
train_char1, dev_char1, \
train_char2, dev_char2, \
train_y, dev_y = train_test_split(
    train_word1, train_word2, train_char1, train_char2, train_data["label"],
    test_size=0.05, random_state=SEED,
)

In [None]:
# Training hyper-parameters read by the graph-building and training cells.
epoch = 10  # number of passes over the training data (passed to dataset.repeat)
batch_size = 64  # mini-batch size; also used to size the shuffle buffer
keep_prob = 0.8  # dropout keep probability fed during training steps
unit_size = 128  # number of units in each LSTM cell
dense_size = 64  # width of the hidden dense layer
learning_rate = 0.01  # learning rate given to the Adam optimizer
clip_norm = 2  # NOTE(review): gradient-clipping norm; confirm the training op actually applies it
weight_init = 1.0  # stddev of the truncated-normal initializer for the dense kernels


def lstm_cell(unit_size, keep):
    """Create a basic LSTM cell whose outputs pass through dropout.

    Args:
        unit_size: number of units in the LSTM cell.
        keep: keep probability applied to the cell's outputs (value or tensor).

    Returns:
        The ``DropoutWrapper`` wrapping a ``BasicLSTMCell``.
    """
    return tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(unit_size),
        output_keep_prob=keep,
    )

# Build the training graph: input pipeline, frozen embeddings, one LSTM encoder per
# tokenisation level (word / char), a dense classifier, loss, optimizer and summaries.
graph = tf.Graph()
with graph.as_default():
    # Graph-level seed so weight initialisation, dropout and shuffling are reproducible.
    tf.set_random_seed(SEED)

    with tf.variable_scope("dataset"):
        dataset = tf.data.Dataset.from_tensor_slices((train_word1, train_word2, train_char1, train_char2, train_y))
        # Shuffle *before* repeating so each epoch visits every example once before any
        # example is seen again (repeat-then-shuffle blends neighbouring epochs together).
        dataset = dataset.shuffle(batch_size * 3 + 1000).repeat(epoch).batch(batch_size)
        iterator = dataset.make_initializable_iterator()
        word1, word2, char1, char2, label = iterator.get_next()

    with tf.variable_scope("input"):
        # Placeholders fed from the training loop; sequence lengths are fixed by padding.
        w1 = tf.placeholder(shape=(None, T_WORD), dtype=tf.int64, name="w1")
        w2 = tf.placeholder(shape=(None, T_WORD), dtype=tf.int64, name="w2")
        c1 = tf.placeholder(shape=(None, T_CHAR), dtype=tf.int64, name="c1")
        c2 = tf.placeholder(shape=(None, T_CHAR), dtype=tf.int64, name="c2")
        y = tf.placeholder(shape=(None,), dtype=tf.float64, name="y_true")
        keep = tf.placeholder(shape=(), dtype=tf.float64, name="keep_prob")

    with tf.variable_scope("embedding"):
        # Pre-trained embedding matrices are kept frozen (trainable=False).
        word_embedding = tf.Variable(word_embedding_data, trainable=False, name="word_embedding")
        char_embedding = tf.Variable(char_embedding_data, trainable=False, name="char_embedding")
        w1_embed = tf.nn.embedding_lookup(word_embedding, w1, name="word1_embed")
        w2_embed = tf.nn.embedding_lookup(word_embedding, w2, name="word2_embed")
        c1_embed = tf.nn.embedding_lookup(char_embedding, c1, name="char1_embed")
        c2_embed = tf.nn.embedding_lookup(char_embedding, c2, name="char2_embed")

    with tf.variable_scope("word_layers"):
        # The same cell object encodes both questions (presumably to share weights).
        word_cell = lstm_cell(unit_size, keep)
        output_w1, _ = tf.nn.dynamic_rnn(word_cell, w1_embed, dtype=tf.float64)
        output_w2, _ = tf.nn.dynamic_rnn(word_cell, w2_embed, dtype=tf.float64)

    with tf.variable_scope("char_layers"):
        char_cell = lstm_cell(unit_size, keep)
        output_c1, _ = tf.nn.dynamic_rnn(char_cell, c1_embed, dtype=tf.float64)
        output_c2, _ = tf.nn.dynamic_rnn(char_cell, c2_embed, dtype=tf.float64)

    with tf.variable_scope("combination"):
        # Use the output at the last time step of each sequence (no sequence_length is
        # passed, so this includes padded positions) and concatenate the four encodings.
        o_w1, o_w2 = output_w1[:, -1, :], output_w2[:, -1, :]
        o_c1, o_c2 = output_c1[:, -1, :], output_c2[:, -1, :]
        final_output = tf.concat((o_w1, o_w2, o_c1, o_c2), axis=1)

    with tf.variable_scope("output_layer"):
        dense_output = tf.layers.dense(final_output, dense_size, activation=tf.nn.relu,
                                       kernel_initializer=tf.truncated_normal_initializer(stddev=weight_init),
                                       bias_initializer=tf.zeros_initializer())
        # Keep the raw logits so the loss can be computed in a numerically stable way;
        # `pred` is still the sigmoid probability with shape (batch, 1).
        logits = tf.layers.dense(dense_output, 1,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=weight_init),
                                 bias_initializer=tf.zeros_initializer())
        pred = tf.nn.sigmoid(logits, name="pred")

    with tf.variable_scope("loss"):
        # Binary cross-entropy. The logits are squeezed to shape (batch,) to match `y`:
        # multiplying a (batch,) label vector with a (batch, 1) prediction broadcasts to
        # (batch, batch) and pairs every label with every prediction. Working from logits
        # also avoids log(0) when the sigmoid saturates.
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=tf.squeeze(logits, axis=1)))

    with tf.variable_scope("train"):
        global_step = tf.Variable(0, trainable=False, dtype=tf.int64, name="global_step")
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # Equivalent to optimizer.minimize(loss), but keeps the gradient tensors so they
        # can be logged below. No clipping is applied; to clip, pass the gradients through
        # tf.clip_by_global_norm before apply_gradients.
        grads_and_vars = [(g, v) for g, v in optimizer.compute_gradients(loss) if g is not None]
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    tf.summary.scalar("batch_loss", loss)
    for v in tf.trainable_variables():
        tf.summary.histogram(v.name, v)
    # Log each gradient under its variable's name (the previous loop iterated over an
    # undefined `clip_grads` and raised NameError).
    for g, v in grads_and_vars:
        tf.summary.histogram("grad_" + v.op.name, g)

    writer = tf.summary.FileWriter("./log/", graph=graph)

In [None]:
# Train until the dataset iterator is exhausted, logging summaries every step and the
# dev loss every 100 steps.
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()
    sess.run(iterator.initializer)

    summary_op = tf.summary.merge_all()

    # The dev feed never changes, so build it once instead of on every evaluation.
    # Dropout is disabled (keep probability 1.0) when scoring the dev set.
    test_feed = {
        w1: dev_word1,
        w2: dev_word2,
        c1: dev_char1,
        c2: dev_char2,
        y: dev_y,
        keep: 1.0,
    }

    try:
        while True:
            # Pull the next mini-batch from the tf.data iterator, then feed it back
            # through the placeholders together with the training keep probability.
            res_w1, res_w2, res_c1, res_c2, res_y = sess.run([word1, word2, char1, char2, label])
            train_feed = {
                w1: res_w1,
                w2: res_w2,
                c1: res_c1,
                c2: res_c2,
                y: res_y,
                keep: keep_prob,
            }

            _, total_steps, train_loss, summary = sess.run([train_op, global_step, loss, summary_op], feed_dict=train_feed)
            writer.add_summary(summary, global_step=total_steps)

            if total_steps % 100 == 0:
                dev_loss = sess.run(loss, feed_dict=test_feed)

                print(f"[step {total_steps}] train loss: {train_loss}, dev loss: {dev_loss}")
    except tf.errors.OutOfRangeError:
        # Raised by the iterator once all repeated epochs have been consumed:
        # this is the normal end of training.
        pass
    finally:
        # Make sure buffered summary events reach disk even if training is interrupted.
        # flush() (rather than close()) keeps the writer usable if this cell is re-run.
        writer.flush()