In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [13]:
# Read the tab-separated train and test files into DataFrames.
train = pd.read_csv('ratings_train.txt', sep='\t')
test = pd.read_csv('ratings_test.txt', sep='\t')

# Pull the `document` column (model input) and the `label` column (target)
# out of each frame as NumPy arrays.
X_train, y_train = train['document'].values, train['label'].values
X_test, y_test = test['document'].values, test['label'].values

In [14]:
def get_max_length(seq_data):
    """Return the length of the longest item in ``seq_data``.

    Returns 0 when ``seq_data`` yields no items.
    """
    return max((len(sent) for sent in seq_data), default=0)

In [37]:
# Fixed sequence length: give_zero_padding truncates or zero-pads every sample
# to this many ids, and n_step (the placeholder width) is set from it.
# It is a hard-coded value; get_max_length is not called in the visible cells.
max_len = 20

In [38]:
# Flatten the training documents into one list of tokens. Iterating a document
# yields its individual elements (single characters when the document is a str).
word = [seg for sent in X_train for seg in sent]

In [39]:
# Build both lookup tables from ONE sorted pass over the unique tokens.
# Sorting makes the id assignment reproducible across kernel restarts (plain set
# iteration order depends on string hashing), and deriving num2word by inverting
# word2num guarantees the two maps agree instead of relying on two separately
# built sets iterating in the same order.
# Ids start at 1; id 0 is left unassigned here.
vocab = sorted(set(word))
word2num = {w: i for i, w in enumerate(vocab, start=1)}
num2word = {i: w for w, i in word2num.items()}

In [40]:
# Register the out-of-vocabulary token under id 0 in both lookup tables.
# give_zero_padding also pads with 0, so padded positions and unknown tokens
# end up sharing the same id.
word2num['<UNK>'] = 0
num2word[0] = '<UNK>'

In [41]:
def sent2index(segs):
    """Convert each sentence in ``segs`` into a list of vocabulary ids.

    Every element obtained by iterating a sentence is looked up in the
    module-level ``word2num``; elements that are not keys of ``word2num`` are
    replaced by the id stored under ``'<UNK>'``.

    Returns a list holding one list of ids per sentence, in input order.
    """
    def lookup(seg):
        # Fall back to the '<UNK>' entry only when the token itself is absent.
        return word2num[seg if seg in word2num else '<UNK>']

    return [[lookup(seg) for seg in sent] for sent in segs]

In [42]:
# Encode the `document` column of the train and test frames as lists of ids.
train_idx, test_idx = [sent2index(frame.document.values) for frame in (train, test)]

In [43]:
def get_seq_length(seq_data, max_len):
    """Return the length of every item in ``seq_data``, capped at ``max_len``.

    The result has one entry per item, in input order.
    """
    return [min(max_len, len(seq)) for seq in seq_data]

In [44]:
def give_zero_padding(seq_data, max_len):
    """Truncate or right-pad every sequence to exactly ``max_len`` items.

    Args:
        seq_data: iterable of sliceable sequences (e.g. lists of ids).
        max_len: target length of every returned row.

    Returns:
        A new list of new lists. Sequences longer than ``max_len`` keep only
        their first ``max_len`` items; shorter ones are extended with zeros.
        The input sequences are never modified.
    """
    padded = []
    for seq in seq_data:
        # Always build a fresh row: binding the caller's list directly and
        # appending to it would silently pad the caller's data in place.
        row = list(seq[:max_len])
        row.extend([0] * (max_len - len(row)))
        padded.append(row)
    return padded

In [45]:
# Bring every encoded train/test sample to exactly max_len ids.
train_idx, test_idx = [give_zero_padding(ids, max_len) for ids in (train_idx, test_idx)]

In [64]:
# Hyperparameters for the model and the training loop. The graph-building cells
# read these when they run, so a change made here only reaches the graph after
# those cells are executed again.

# Step size handed to AdamOptimizer.
learning_rate = 0.001
# Size of the LSTM cell.
n_hidden = 128
# Number of passes over the training data.
n_epoch = 5
# Width of each row of the embedding table.
n_embedding = 300
# Number of ids per example (the fixed padded length).
n_step = max_len
# NOTE(review): not referenced anywhere else in the visible notebook.
n_input = 300
# Number of entries in word2num, i.e. the number of rows in the embedding table.
n_vocab = len(word2num)
# Examples per training / evaluation step.
batch_size = 64
# Number of classes the logits cover.
n_output = 2

In [54]:
# Start from an empty default graph so that re-running the graph-building cells
# does not collide with variables created by an earlier run.
tf.reset_default_graph()
# Batch of id sequences: one row of n_step ids per example.
X = tf.placeholder(dtype=tf.int32, shape=[None, n_step])
# One integer class label per example.
Y = tf.placeholder(dtype=tf.int32, shape=[None])
# Embedding table with one n_embedding-wide row per vocabulary id.
embedding = tf.get_variable(name="embedding", shape=[n_vocab, n_embedding], dtype=tf.float32)
# Per-example sequence length, passed to dynamic_rnn as sequence_length.
seq_len = tf.placeholder(dtype=tf.int32, shape=[None])
# Replace every id in X by its embedding row.
inputs = tf.nn.embedding_lookup(embedding, X)

In [55]:
# Output projection parameters. The input width is 2*n_hidden because the
# classifier is fed the concatenation of the final LSTM cell state and hidden
# state; the output width is the number of classes.
W = tf.get_variable(name="weights", shape=[2*n_hidden, n_output])
b = tf.get_variable(name="bias", shape=[n_output])

In [56]:
# Single LSTM cell with n_hidden units, wrapped with dropout on its outputs.
# NOTE(review): keep_prob is a constant, so this dropout is applied during
# evaluation as well as training. It is configured on the cell *outputs* only,
# while the classifier is built from the final `states`, so it presumably does
# not influence the logits at all - confirm the intent against the
# DropoutWrapper documentation.
cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.5)

In [57]:
# Unroll the cell over the embedded inputs. seq_len gives the length of each
# example in the batch. `outputs` is not used by the later cells; only `states` is.
outputs, states = tf.nn.dynamic_rnn(cell, inputs, sequence_length=seq_len, dtype=tf.float32)

In [58]:
# Join the final cell state (c) and hidden state (h) along axis 1 to form the
# feature vector fed to the output projection.
# NOTE(review): this rebinds `states` to the concatenated tensor, so executing
# this cell a second time without re-running the dynamic_rnn cell will fail on
# `.c` / `.h`.
states = tf.concat([states.c, states.h], 1)

In [59]:
# Class scores from the concatenated final state.
logits = tf.matmul(states, W) + b
# Predicted class = index of the largest logit, cast to int32 to match Y.
preds = tf.cast(tf.argmax(logits, 1), tf.int32)
correct = tf.equal(preds, Y)
# Fraction of correct predictions within the fed batch.
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Mean softmax cross-entropy over the batch; Y holds integer class ids.
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y))
# Training op: one Adam step that minimises `cost`.
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [65]:
# Open a session with GPU allow_growth enabled and initialise all variables.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

# Number of optimisation steps per epoch. The final step also takes the leftover
# examples that do not fill a whole batch.
total_batch = len(train_idx) // batch_size

for epoch in range(n_epoch):
    cost_avg = 0
    print('< epoch :', (epoch+1), '>')
    for i in range(total_batch):
        # Slice bounds of this step's batch; the last step runs to the end of the data.
        lo = i * batch_size
        hi = len(train_idx) if i == total_batch - 1 else lo + batch_size
        batch_xs = train_idx[lo:hi]
        batch_ys = y_train[lo:hi]
        seq_length_data = get_seq_length(X_train[lo:hi], max_len)

        feed = {X: batch_xs, Y: batch_ys, seq_len: seq_length_data}
        cost_val, _ = sess.run([cost, optimizer], feed_dict=feed)
        cost_avg += cost_val

        # After every 500th step print the mean cost of those 500 steps and reset.
        if (i + 1) % 500 == 0:
            print('%04d' % (i+1), 'Cost: ', '{:.3f}'.format(cost_avg/500))
            cost_avg = 0

< epoch : 1 >
0500 Cost:  0.554
1000 Cost:  0.514
1500 Cost:  0.490
2000 Cost:  0.478
< epoch : 2 >
0500 Cost:  0.455
1000 Cost:  0.449
1500 Cost:  0.441
2000 Cost:  0.434
< epoch : 3 >
0500 Cost:  0.419
1000 Cost:  0.416
1500 Cost:  0.413
2000 Cost:  0.404
< epoch : 4 >
0500 Cost:  0.392
1000 Cost:  0.389
1500 Cost:  0.389
2000 Cost:  0.378
< epoch : 5 >
0500 Cost:  0.366
1000 Cost:  0.364
1500 Cost:  0.364
2000 Cost:  0.351


In [66]:
# Evaluate on the test set in mini-batches and report the overall accuracy.
# Each batch accuracy is weighted by the batch size, so the result is the true
# fraction of correctly classified examples. A plain average of per-batch
# accuracies is skewed whenever one batch has a different size from the rest.
n_test = len(test_idx)
# Number of batches evaluated; the last one may hold fewer than batch_size examples.
test_batch = (n_test + batch_size - 1) // batch_size
n_correct = 0.0

for start in range(0, n_test, batch_size):
    stop = min(start + batch_size, n_test)
    batch_xs = test_idx[start:stop]
    batch_ys = y_test[start:stop]
    seq_length_data = get_seq_length(X_test[start:stop], max_len)
    acc = sess.run(accuracy, feed_dict={X: batch_xs, Y: batch_ys, seq_len: seq_length_data})
    # Convert the batch mean back into a count of correct predictions.
    n_correct += float(acc) * len(batch_xs)

# An empty test set yields NaN instead of a ZeroDivisionError.
test_acc = n_correct / n_test if n_test else float('nan')
print('Accuracy: ', '{:.3f}'.format(test_acc))

Accuracy:  0.793


In [23]:
# Release the session opened in the training cell. After this, the training and
# evaluation cells cannot use `sess` until a new session is created.
sess.close()