In [22]:
import tensorflow as tf
import numpy as np
import pandas as pd
import gensim

In [6]:
# Read the tab-separated training and test rating tables into DataFrames.
train = pd.read_csv('ratings_train.txt', sep='\t')
test = pd.read_csv('ratings_test.txt', sep='\t')

In [10]:
# Load a previously saved gensim Word2Vec model from the working directory.
model = gensim.models.Word2Vec.load('./vectors.bin')

In [11]:
# Number of entries in the loaded model's vocabulary.
len(model.wv.vocab)

61589

In [89]:
# Load the saved sentence and segment arrays for the train and test splits.
# NOTE(review): if these .npy files store Python objects (e.g. ragged token
# lists), recent NumPy versions require allow_pickle=True — confirm.
train_sents = np.load('train_sents.npy')
train_segs = np.load('train_segs.npy')
test_sents = np.load('test_sents.npy')
test_segs = np.load('test_segs.npy')

In [90]:
def get_max_length(seq_data):
    """Return the length of the longest sequence in ``seq_data``.

    Args:
        seq_data: Iterable of sized items (anything that supports ``len``).

    Returns:
        The largest ``len(item)`` encountered, or 0 when ``seq_data`` is empty.
    """
    return max((len(sent) for sent in seq_data), default=0)

In [91]:
# Longest segmented training sentence; used later as the padding length.
max_len = get_max_length(train_segs)

In [None]:
# Flatten all segmented training sentences into a single token list
# (duplicates are kept at this stage).
word = [seg for sent in train_segs for seg in sent]

In [None]:
# Build the vocabulary exactly once so that both lookup tables are derived
# from the same ordering and are guaranteed to be inverses of each other.
# Tokens are kept in first-occurrence order (dicts preserve insertion order on
# Python 3.7+), which keeps the word <-> index assignment stable across kernel
# restarts instead of depending on set iteration order.
# A literal '<UNK>' token in the data is dropped here so that it cannot end up
# with two different indices; index 0 is reserved for the unknown marker.
vocab = [w for w in dict.fromkeys(word) if w != '<UNK>']
word2num = {w: i for i, w in enumerate(vocab, start=1)}
num2word = {i: w for w, i in word2num.items()}
word2num['<UNK>'] = 0
num2word[0] = '<UNK>'

In [81]:
# Size of the token -> index table, including the '<UNK>' entry.
len(word2num)

61590

In [84]:
def sent2index(segs):
    """Convert segmented sentences into lists of vocabulary indices.

    Every token is looked up in the module-level ``word2num`` table; a token
    that is not a key of that table is looked up as ``'<UNK>'`` instead.

    Args:
        segs: Iterable of sentences, each an iterable of tokens.

    Returns:
        A list containing one list of ``word2num`` values per input sentence.
    """
    return [
        [word2num[seg if seg in word2num else '<UNK>'] for seg in sent]
        for sent in segs
    ]

In [92]:
# Map both splits to vocabulary indices; tokens missing from word2num are
# resolved through the '<UNK>' entry inside sent2index.
# NOTE(review): the KeyError recorded in this cell's saved output does not look
# reachable with the sent2index definition shown in this notebook (it falls
# back to '<UNK>') — the output is probably stale; re-run to confirm.
train_idx = sent2index(train_segs)
test_idx = sent2index(test_segs)

KeyError: 'GDNTOPCLASSINTHECLUB'

In [87]:
# Number of indexed training sentences.
len(train_idx)

150000

In [76]:
def give_zero_padding(seq_data, max_len):
    """Force every sequence in ``seq_data`` to exactly ``max_len`` items.

    Sequences shorter than ``max_len`` are right-padded with 0; sequences
    longer than ``max_len`` are cut down to their first ``max_len`` items.
    The inner lists are modified in place.

    Args:
        seq_data: Iterable of mutable lists (e.g. lists of token indices).
        max_len: Target length for every sequence; must be non-negative.

    Returns:
        The same ``seq_data`` object that was passed in.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError('max_len must be non-negative, got %r' % (max_len,))

    for seq in seq_data:
        missing = max_len - len(seq)
        if missing < 0:
            # Delete the tail on the list object itself. Rebinding the loop
            # variable to a slice would leave the caller's data untouched.
            del seq[max_len:]
        else:
            seq.extend([0] * missing)
    return seq_data

In [77]:
# Bring the training index lists to max_len; the call modifies the inner lists
# in place and returns the same outer list.
# NOTE(review): test_idx is not padded anywhere in the visible cells.
train_idx = give_zero_padding(train_idx, max_len)

In [243]:
def make_batch(seq_data):
    """Turn sequences into one-hot inputs and integer targets.

    For each sequence, every symbol except the last becomes a one-hot row of
    width ``dic_len`` and the last symbol becomes the target index. Symbols
    are mapped to indices through the module-level ``num_dic`` table.

    Args:
        seq_data: Iterable of non-empty sequences whose items are keys of
            ``num_dic``.

    Returns:
        A tuple ``(input_batch, target_batch)``: a list of arrays of shape
        ``(len(seq) - 1, dic_len)`` and a list of target indices.

    Raises:
        KeyError: If a symbol is not present in ``num_dic``.
        IndexError: If a sequence is empty.
    """
    # One identity matrix serves every sequence; fancy-indexing its rows
    # returns a fresh one-hot array each time, so batch entries stay independent.
    one_hot = np.eye(dic_len)
    input_batch = []
    target_batch = []

    for seq in seq_data:
        input_ids = [num_dic[n] for n in seq[:-1]]
        target = num_dic[seq[-1]]

        input_batch.append(one_hot[input_ids])
        target_batch.append(target)

    return input_batch, target_batch

In [244]:
# Encode the sequences into one-hot inputs and integer targets.
# NOTE(review): seq_data is not defined in any visible cell — confirm where it
# comes from before relying on a fresh-kernel run.
input_batch, target_batch = make_batch(seq_data)

In [245]:
# Show the target indices returned by make_batch.
target_batch

[3, 3, 19, 11, 11, 4, 0, 18, 6]

In [246]:
# Hyperparameters and input dimensions used to build and train the graph.
# Step size handed to the Adam optimizer.
learning_rate = 0.001
# Number of units in the GRU cell (also the row count of the output weights).
n_hidden = 128
# Number of training iterations; every iteration feeds the whole batch.
total_epoch = 200
# Number of time steps per example, taken from the longest training sentence.
n_step = max_len
# NOTE(review): presumably the per-step feature width (Word2Vec vector size) —
# confirm against the loaded model.
n_input = 300
# NOTE(review): seq_data is not defined in the visible cells — confirm.
batch_size = len(seq_data)

In [247]:
# Start from an empty default graph so that re-running the graph-building
# cells does not pile up duplicate ops and variables.
tf.reset_default_graph()
# tf.nn.dynamic_rnn requires rank-3 inputs laid out as [batch, time, features],
# so X carries n_step time steps of n_input features per example. Anything
# fed for X must have that shape.
X = tf.placeholder(dtype=tf.float32, shape=[None, n_step, n_input])
# One integer class id per example.
Y = tf.placeholder(dtype=tf.int32, shape=[None])
# True (unpadded) length of each example, consumed by dynamic_rnn.
seq_len = tf.placeholder(dtype=tf.int32, shape=[None])

In [248]:
# Randomly initialised output projection: maps an n_hidden-wide vector to
# n_class scores, plus a per-class bias.
W = tf.Variable(tf.random_normal(shape=[n_hidden, n_class]))
b = tf.Variable(tf.random_normal(shape=[n_class]))

In [249]:
# GRU cell with dropout applied to its outputs. The keep probability is a
# scalar tensor that defaults to 0.5, so sess.run calls that do not mention it
# behave exactly as with a hard-coded 0.5, while evaluation code can switch
# dropout off by adding {keep_prob: 1.0} to its feed_dict.
keep_prob = tf.placeholder_with_default(0.5, shape=[])
cell = tf.nn.rnn_cell.GRUCell(n_hidden)
cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)

In [250]:
# Run the recurrent cell along the time axis; seq_len supplies each example's
# own length.
# outputs: TensorShape([batch_size, time_steps, hidden_size])
outputs, states = tf.nn.dynamic_rnn(
    cell=cell,
    inputs=X,
    sequence_length=seq_len,
    dtype=tf.float32,
)

In [251]:
# Flat row index of each example's last valid time step once `outputs` is
# viewed as [batch * time, hidden]: example_number * time_steps + (length - 1).
# The batch dimension is read from the tensor at run time, so the graph works
# for any fed batch size rather than only for the Python-level batch_size.
idx = tf.range(tf.shape(outputs)[0]) * tf.shape(outputs)[1] + (seq_len - 1)

In [252]:
# Merge the batch and time axes, then pick the rows selected by idx.
# flat_outputs: TensorShape([batch_size*time_steps, hidden_size])
flat_outputs = tf.reshape(outputs, [-1, n_hidden])
last_outputs = tf.gather(flat_outputs, idx)

In [253]:
# Linear read-out of the selected RNN outputs into per-class scores.
logits = tf.add(tf.matmul(last_outputs, W), b)
# Predicted class is the highest-scoring column, cast to the label dtype.
preds = tf.cast(tf.argmax(logits, axis=1), dtype=tf.int32)
# Accuracy is the fraction of predictions that equal the fed labels.
correct = tf.equal(preds, Y)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

In [254]:
# Cross-entropy between the logits and the integer labels in Y, averaged over
# the batch.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y, logits=logits)
cost = tf.reduce_mean(per_example_loss)

In [255]:
# `optimizer` is the training op: running it applies one Adam update on cost.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(cost)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [256]:
# Open a session and initialise every graph variable before training.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Full-batch training: the same feed is reused for every epoch.
input_batch, target_batch = make_batch(seq_data)
train_feed = {X: input_batch, Y: target_batch, seq_len: seq_length_data}
for epoch in range(total_epoch):
    _, loss = sess.run([optimizer, cost], feed_dict=train_feed)
    # Report the cost once every 20 epochs.
    if (epoch + 1) % 20 == 0:
        print('Epoch:', '%04d' % (epoch+1), 'cost =', '{:.6f}'.format(loss))

print('optimization finished!')

Epoch: 0020 cost = 1.856622
Epoch: 0040 cost = 0.382763
Epoch: 0060 cost = 0.097220
Epoch: 0080 cost = 0.021560
Epoch: 0100 cost = 0.011558
Epoch: 0120 cost = 0.030519
Epoch: 0140 cost = 0.014260
Epoch: 0160 cost = 0.005036
Epoch: 0180 cost = 0.012159
Epoch: 0200 cost = 0.009908
optimization finished!


In [257]:
# Re-encode the sequences and fetch predictions together with accuracy.
# NOTE(review): the RNN cell is wrapped in a DropoutWrapper, so dropout is
# still applied during this run — confirm whether that is intended.
input_batch, target_batch = make_batch(seq_data)
eval_feed = {X: input_batch, Y: target_batch, seq_len: seq_length_data}
predict, accuracy_val = sess.run([preds, accuracy], feed_dict=eval_feed)
for i, seq in enumerate(seq_data):
    # The part of the sequence the model saw, followed by the predicted symbol.
    prefix = seq[:seq_length_data[i]]
    predict_words = prefix + char_arr[predict[i]]
    print('Input: ', prefix)
    print('Prediction: ', predict_words, '\n')

Input:  wor
Prediction:  word 

Input:  worl
Prediction:  world 

Input:  treatmen
Prediction:  treatment 

Input:  powerfu
Prediction:  powerful 

Input:  potentia
Prediction:  potential 

Input:  googl
Prediction:  google 

Input:  south kore
Prediction:  south korea 

Input:  the united state
Prediction:  the united states 

Input:  natural language processin
Prediction:  natural language processing 

