In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import gensim



In [2]:
train = pd.read_csv('ratings_train.txt', delimiter='\t')
test = pd.read_csv('ratings_test.txt', delimiter='\t')
y_train = train.label.values
y_test = test.label.values

In [3]:
model = gensim.models.Word2Vec.load('./vectors.bin')

In [4]:
len(model.wv.vocab)

34371

In [5]:
train_sents = np.load('train_sents.npy')
train_segs = np.load('train_segs.npy')
test_sents = np.load('test_sents.npy')
test_segs = np.load('test_segs.npy')

In [6]:
def get_max_length(seq_data):
    max_len = 0
    for sent in seq_data:
        if max_len < len(sent):
            max_len = len(sent)
    return(max_len)

In [7]:
#max_len = get_max_length(train_segs)
max_len = 10

In [8]:
word = []
for sent in train_segs:
    for seg in sent:
        word.append(seg)

In [9]:
word2num = {w:(i+1) for i,w in enumerate(set(word))}
num2word = {(i+1):w for i,w in enumerate(set(word))}

In [10]:
embedding_vector = [np.zeros(shape=300)]
for i, w in enumerate(word2num.keys()):
    if w not in model.wv.vocab:
        embedding_vector += [np.random.normal(size=300)]
    else:
        embedding_vector += [model.wv[w]]

In [11]:
word2num['<UNK>'] = 0
num2word[0] = '<UNK>'

In [12]:
def sent2index(segs):
    idx = []
    for sent in segs:
        temp = []
        for seg in sent:
            if seg not in word2num.keys():
                seg = '<UNK>'
            temp.append(word2num[seg])
        idx.append(temp)
    return idx

In [13]:
train_idx = sent2index(train_segs)
test_idx = sent2index(test_segs)

In [14]:
def get_seq_length(seq_data, max_len):
    seq_len = []
    for seq in seq_data:
        if max_len <= len(seq):
            seq_len.append(max_len)
        else:
            seq_len.append(len(seq))
    return seq_len

In [15]:
seq_length_ = get_seq_length(train_segs, max_len)

In [16]:
def give_zero_padding(seq_data, max_len):
    seq_ = []
    for seq in seq_data:
        temp = seq
        if len(temp) > max_len:
            temp = seq[:max_len]
        for _ in range(len(temp), max_len):
            temp.append(0)
        seq_.append(temp)
    return seq_

In [17]:
train_idx = give_zero_padding(train_idx, max_len)
test_idx = give_zero_padding(test_idx, max_len)

In [18]:
train_idx[0]

[39396, 1389, 15842, 26982, 23790, 8438, 60577, 3211, 0, 0]

In [29]:
learning_rate = 0.001
n_hidden = 128
n_epoch = 1
n_embedding = 300
n_step = max_len
n_input = 300
n_vocab = len(word2num)
batch_size = 64
n_output = 2

In [43]:
tf.reset_default_graph()
X = tf.placeholder(dtype=tf.int32, shape=[None, n_step])
Y = tf.placeholder(dtype=tf.int32, shape=[None])
W = tf.get_variable(name='embedding', shape=[n_vocab, n_embedding], trainable=True)
#embedding_placeholder = tf.placeholder(tf.float32, [n_vocab, n_embedding])
#embedding_init = W.assign(embedding_placeholder)
seq_len = tf.placeholder(dtype=tf.int32, shape=[None])
inputs = tf.nn.embedding_lookup(W, X)

In [44]:
W_ = tf.Variable(tf.truncated_normal([n_hidden*2, n_output]))
b = tf.Variable(tf.truncated_normal([n_output]))

In [45]:
cell_fw = tf.nn.rnn_cell.GRUCell(n_hidden)
cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, output_keep_prob=0.5)
cell_bw = tf.nn.rnn_cell.GRUCell(n_hidden)
cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, output_keep_prob=0.5)

In [46]:
((output_fw, output_bw), (state_fw, state_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=seq_len, dtype=tf.float32)
#outputs: TensorShape([batch_size, time_steps, hidden_size])

In [47]:
states = tf.concat([state_fw, state_bw], 1)

In [48]:
#idx = tf.range(tf.shape(outputs)[0])*tf.shape(outputs)[1] + (seq_len - 1)
#idx = tf.cast(tf.reduce_sum(tf.one_hot(idx, tf.shape(outputs)[0]*tf.shape(outputs)[1]), 0), tf.int32)
#outputs = tf.dynamic_partition(tf.reshape(outputs, [-1, n_hidden]), idx, 2)
#last_outputs = outputs[1]

In [49]:
logits = tf.matmul(states, W_) + b
preds = tf.cast(tf.argmax(logits, 1), tf.int32)
correct = tf.equal(preds, Y)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [50]:
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y))

In [51]:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [52]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
#sess.run(embedding_init, feed_dict={embedding_placeholder: embedding_vector})
total_batch = int(len(train_idx) / batch_size)

for epoch in range(n_epoch):
    cost_sum = 0
    print('< epoch :', (epoch+1), '>')
    for i in range(total_batch):
        if i == (total_batch-1):
            batch_xs = train_idx[(i*batch_size):len(train_idx)]
            batch_ys = y_train[(i*batch_size):len(train_idx)]
            seq_length_data = get_seq_length(train_segs[(i*batch_size):len(train_idx)], max_len)
        else:
            batch_xs = train_idx[i*batch_size:(i+1)*batch_size]
            batch_ys = y_train[i*batch_size:(i+1)*batch_size]
            seq_length_data = get_seq_length(train_segs[(i*batch_size):(i+1)*batch_size], max_len)
        cost_val, _ = sess.run([cost, optimizer], feed_dict={X: batch_xs, Y: batch_ys, seq_len: seq_length_data})
        cost_sum += cost_val
        if i % 500 == 499:
            print('%04d' % (i+1), 'Cost: ', '{:.3f}'.format(cost_sum/500))
            cost_sum = 0

< epoch : 1 >
0500 Cost:  0.487
1000 Cost:  0.441
1500 Cost:  0.425
2000 Cost:  0.412


In [53]:
test_batch = int(len(test_idx) / batch_size)
test_acc = 0

for i in range(test_batch):
    if i == (test_batch-1):
        batch_xs = test_idx[(i*batch_size):len(test_idx)]
        batch_ys = y_test[(i*batch_size):len(test_idx)]
        seq_length_data = get_seq_length(test_segs[(i*batch_size):len(test_idx)], max_len)
    else:
        batch_xs = test_idx[i*batch_size:(i+1)*batch_size]
        batch_ys = y_test[i*batch_size:(i+1)*batch_size]
        seq_length_data = get_seq_length(test_segs[(i*batch_size):(i+1)*batch_size], max_len)
    acc = sess.run(accuracy, feed_dict={X: batch_xs, Y: batch_ys, seq_len: seq_length_data})
    test_acc += acc
print('Accuracy: ', '{:.3f}'.format(test_acc/test_batch))

Accuracy:  0.814
