In [1]:
import tensorflow as tf
from sklearn.externals import joblib
import keras
import sys
sys.path.append('..')
from utils.word2vec_fast import *
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
question_path = '../data/chip2018/question_id.csv'
wv = Word2VecFast.load_word2vec_format(file_path='../data/chip2018/word_embedding.txt', word_shape=300)
print(wv.word_shape())

(9648, 300)


In [3]:
sequence_length = 45 # 句子最大长度
embedding_size = 300
filter_sizes = [3, 4, 5]
num_filters = 128
num_classes = 2
num_epoch = 10
batch_size = 128

In [4]:
# 加载问题
question_c_dict = {}
def load_question():
    temp1 = {}
    with open(file=question_path, encoding='utf-8') as f:
        for line_num,line in enumerate(f):
            if line_num == 0:
                continue
            ss = line.split(',')
            qid,qw,qc = ss[0],ss[1],ss[2]
            temp1[qid] = wv.words_id(qw)
    return temp1

In [5]:
question_w_dict = load_question()
question_w_dict['S100012']

[2466, 8275, 913, 2369, 9586, 9527, 8479, 8275, 805, 586]

In [7]:
def load_dataset():
    x_,y_ = [],[]
    with open(file=train_path, encoding='utf-8') as f:
        for line_num,line in enumerate(f):
            if line_num == 0:
                continue
            ss = line.split(',')
            a1 = question_w_dict[ss[0]]
            a2 = question_w_dict[ss[1]]
            a1 = sequence.pad_sequences([a1], maxlen=sequence_length, padding='post', value=wv.words_id('<UNK>'))[0]
            a2 = sequence.pad_sequences([a2], maxlen=sequence_length, padding='post', value=wv.words_id('<UNK>'))[0]
            a = np.hstack((a1, a2))
            x_.append(a)
            y_.append(ss[2])
    return x_,y_

In [8]:
train_path = '../data/chip2018/train.csv'
X1_, Y_ = load_dataset()
X1_ = np.array(X1_)
Y_ = np.array(Y_)
print(X1_.shape)
num_examples = X1_.shape[0]
total_batch = int(num_examples/batch_size)
Y_ = keras.utils.to_categorical(Y_, 2)

(20000, 90)


In [10]:
input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

In [11]:
# 嵌入层
WL_word2vec = tf.constant(wv.word_embeddings(), dtype=tf.float32)
WL_word_embedding1 = tf.nn.embedding_lookup(WL_word2vec, input_x1)
WL_word_embedding_expanded1 = tf.expand_dims(WL_word_embedding1, -1)
print(WL_word_embedding_expanded1)

# 嵌入层
WL_word_embedding2 = tf.nn.embedding_lookup(WL_word2vec, input_x2)
WL_word_embedding_expanded2 = tf.expand_dims(WL_word_embedding2, -1)
print(WL_word_embedding_expanded2)

Tensor("ExpandDims:0", shape=(?, 45, 300, 1), dtype=float32)
Tensor("ExpandDims_1:0", shape=(?, 45, 300, 1), dtype=float32)


In [12]:
W1 = tf.Variable(tf.truncated_normal([3, embedding_size, 1, num_filters], stddev=0.1), dtype=tf.float32)
b1 = tf.Variable(tf.constant(0.1, shape=[num_filters]))

conv1 = tf.nn.conv2d(input=WL_word_embedding_expanded1, filter=W1, strides=[1, 1, 1, 1], padding="VALID")
h1 = tf.nn.relu(tf.nn.bias_add(conv1, b1))
pooled1 = tf.nn.max_pool(h1, ksize=[1, sequence_length - 3 + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
print(pooled1)

conv2 = tf.nn.conv2d(input=WL_word_embedding_expanded2, filter=W1, strides=[1, 1, 1, 1], padding="VALID")
h2 = tf.nn.relu(tf.nn.bias_add(conv2, b1))
pooled2 = tf.nn.max_pool(h2, ksize=[1, sequence_length - 3 + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
print(pooled2)

Tensor("MaxPool:0", shape=(?, 1, 1, 128), dtype=float32)
Tensor("MaxPool_1:0", shape=(?, 1, 1, 128), dtype=float32)


In [13]:
h_pool_flat1 = tf.reshape(pooled1, [-1, num_filters])
h_pool_flat2 = tf.reshape(pooled2, [-1, num_filters])
print(h_pool_flat1, h_pool_flat2)

Tensor("Reshape:0", shape=(?, 128), dtype=float32) Tensor("Reshape_1:0", shape=(?, 128), dtype=float32)


In [14]:
h_pool_flat1 = h_pool_flat1 / tf.norm(h_pool_flat1)
h_pool_flat2 = h_pool_flat2 / tf.norm(h_pool_flat2)
print(h_pool_flat1, h_pool_flat2)

Tensor("truediv:0", shape=(?, 128), dtype=float32) Tensor("truediv_1:0", shape=(?, 128), dtype=float32)


In [15]:
h_drop = tf.multiply(h_pool_flat1, h_pool_flat2)


W2 = tf.Variable(tf.truncated_normal([num_filters, num_classes], stddev=0.1))
b2 = tf.Variable(tf.constant(0.1, shape=[num_classes]))
logits = tf.nn.xw_plus_b(h_drop, W2, b2)
predictions = tf.nn.softmax(logits)

correct_predict = tf.equal(tf.argmax(predictions, axis=1), tf.argmax(input_y, axis=1))
acc_op = tf.reduce_mean(tf.cast(correct_predict, tf.float32))

print(predictions)

Tensor("Softmax:0", shape=(?, 2), dtype=float32)


In [16]:
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=input_y))
train_op = tf.train.AdamOptimizer(learning_rate=0.0005).minimize(loss_op)
init_op = tf.global_variables_initializer()

In [17]:
def next_batch(i):
    a = i * batch_size
    b = (i+1) * batch_size
    return (X1_[a:b], Y_[a:b])

In [18]:
with tf.Session() as sess:
    sess.run(init_op)
    
    for epoch in range(num_epoch):
        avg_cost = 0.
        avg_acc = 0.
        for i in range(total_batch):
            batch_xs, batch_ys = next_batch(i)
            sess.run(train_op, feed_dict={input_x1: batch_xs[:,0:45], input_x2: batch_xs[:,45:], input_y: batch_ys, dropout_keep_prob: 0.5})
            c, acc = sess.run([loss_op, acc_op], feed_dict={input_x1: batch_xs[:,0:45], input_x2: batch_xs[:,45:], input_y: batch_ys, dropout_keep_prob: 0.5})
            avg_cost += c / total_batch
            avg_acc += acc / total_batch
        print("Epoch:", '%04d' % (epoch+1), "loss=", "{:.9f}".format(avg_cost), "accuracy=", "{:.9f}".format(avg_acc))
    print("Optimization Finished!")

Epoch: 0001 loss= 0.693160074 accuracy= 0.496844952
Epoch: 0002 loss= 0.693095208 accuracy= 0.502153446
Epoch: 0003 loss= 0.692754656 accuracy= 0.512419872
Epoch: 0004 loss= 0.692329844 accuracy= 0.517678285
Epoch: 0005 loss= 0.691820627 accuracy= 0.514823718
Epoch: 0006 loss= 0.691369498 accuracy= 0.513872196
Epoch: 0007 loss= 0.691059194 accuracy= 0.515825321
Epoch: 0008 loss= 0.690801490 accuracy= 0.515675080
Epoch: 0009 loss= 0.690573449 accuracy= 0.515324519


KeyboardInterrupt: 