In [1]:
# -*- coding: utf-8 -*-

import vocab as vc
import data_helpers as dh
import data_helpers_yk as dh_yk
import custom_loss as cl

from collections import Counter
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np
from tensorflow.contrib import learn

In [2]:
vocab_filename = 'corpusToLines_vocab.txt'
vocab = vc.load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
print("# [{}] is loaded as [vocab].".format(vocab_filename))

# [corpusToLines_vocab.txt] is loaded as [vocab].


## Benchmark data pre-processing

In [3]:
# MR path flags
MR_pos = "../data/rt-polaritydata/rt-polarity.pos"
MR_neg = "../data/rt-polaritydata/rt-polarity.neg"

In [4]:
# benchmark 데이터를 x_data로 변환하기
x_data, y_data = dh_yk.load_data_and_labels(MR_pos, MR_neg)
y_data = np.argmax(y_data, axis=1)

In [5]:
vocab_path = "vocab_processor"
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_data)))

In [6]:
len(x_test[0])

62

## word vector loading

In [7]:
def load_word2vec(filename):
    vocab = []
    embd = []
    file = open(filename,'r')
    lines = file.readlines()[1:]
    for line in lines:
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
    print('# [{}] is successfully loaded!'.format(filename))
    file.close()
    return vocab,embd

filename = './fantasy_embedding_word2vec.txt'
vocab,embd = load_word2vec(filename)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
print("# embedding vocabulary size: {}".format(len(embedding)))

# [./fantasy_embedding_word2vec.txt] is successfully loaded!
# embedding vocabulary size: 10065


## Benchmark data forwarding

In [8]:
sequence_length = 62
wv_sz = 100

input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
tf_embedding = tf.constant(embedding, dtype=tf.float32)

embedded_chars = tf.nn.embedding_lookup(tf_embedding, input_x) 
embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) 

filter_sizes = [3, 4, 5]
num_filters = 128

pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution Layer
        filter_shape = [filter_size, wv_sz, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="conv_W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="conv_b")
        conv = tf.nn.conv2d(
            embedded_chars_expanded,
            W,
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv")
        # Apply nonlinearity
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # Max-pooling over the outputs
        pooled = tf.nn.max_pool(
            h,
            ksize=[1, sequence_length - filter_size + 1, 1, 1],
            strides=[1, 1, 1, 1],
            padding='VALID',
            name="pool")
        pooled_outputs.append(pooled)

num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat( pooled_outputs, 3)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

cnn_output = h_pool_flat

In [9]:
cnn_output

<tf.Tensor 'Reshape:0' shape=(?, 384) dtype=float32>

In [12]:
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess, "./logs/model3.ckpt")
# sess.run(tf.global_variables_initializer())
print("모델 불러오기 성공")

batch_size = 32

for i in range(0, len(x_test), batch_size):
    x_batch = x_test[i:i+batch_size]

    cnn_output_val = sess.run([cnn_output], 
                            feed_dict={input_x: x_test})

print("cnn_output_val 생성 완료")

INFO:tensorflow:Restoring parameters from ./logs/model3.ckpt
모델 불러오기 성공
cnn_output_val 생성 완료


In [13]:
print(cnn_output_val)

[array([[2.2277083e+00, 0.0000000e+00, 7.1619436e-02, ..., 3.6095488e+00,
        1.5604650e+00, 0.0000000e+00],
       [2.2295709e+00, 0.0000000e+00, 6.4765237e-02, ..., 3.6444957e+00,
        1.5776744e+00, 0.0000000e+00],
       [2.2188148e+00, 0.0000000e+00, 0.0000000e+00, ..., 3.5315635e+00,
        1.4621205e+00, 0.0000000e+00],
       ...,
       [2.2246110e+00, 4.3256015e-02, 7.5296961e-02, ..., 3.6486440e+00,
        1.5819114e+00, 0.0000000e+00],
       [2.2278743e+00, 2.9837713e-03, 3.0418903e-02, ..., 3.5315635e+00,
        1.4621205e+00, 0.0000000e+00],
       [2.2269096e+00, 0.0000000e+00, 7.0705257e-02, ..., 3.6415527e+00,
        1.5770420e+00, 0.0000000e+00]], dtype=float32)]


In [14]:
import _pickle as cPickle
with open('cnn_output_val.pkl', 'wb') as f:
    cPickle.dump(cnn_output_val, f)

In [33]:
with open('cnn_output_val.pkl', 'rb') as f:
    cnn_output_val = cPickle.load(f)

In [34]:
cnn_output_val = np.asarray(cnn_output_val)

In [35]:
cnn_output_val.shape

(1, 10662, 384)

In [36]:
cnn_output_val[0].shape

(10662, 384)

In [37]:
cnn_output_val = cnn_output_val[0]

In [38]:
cnn_output_val.shape

(10662, 384)

In [39]:
# model_inputs = h_pool_flat
model_inputs = tf.placeholder(dtype=tf.float32, shape=[None, 384])
labels = tf.placeholder(dtype=tf.float32, shape=[None, 2])

W = tf.Variable(tf.random_normal([384, 2], stddev=0.01))
b = tf.Variable(tf.random_normal([2], stddev=0.01))
                                                       
logits = tf.matmul(model_inputs, W) + b
predictions = tf.nn.softmax(logits)

In [46]:
labels

<tf.Tensor 'Placeholder_5:0' shape=(?, 2) dtype=float32>

In [40]:
loss = tf.losses.softmax_cross_entropy(
    onehot_labels=labels,
    logits=predictions)

dense_predictions = tf.argmax(predictions, axis=1)
dense_labels = tf.argmax(labels, axis=1)
equals = tf.cast(tf.equal(dense_predictions, dense_labels), tf.float32)
acc = tf.reduce_mean(equals)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss)

In [66]:
y_data2 = list(y_data)

In [64]:
y_data2[0] = [0, 1]

In [69]:
y_data2[:5]

[1, 1, 1, 1, 1]

In [75]:
y_test = []
for idx, i in enumerate(y_data2):
    if i == 1:
        y_test.append([0, 1])
    else:
        y_test.append([1, 0])

In [89]:
saver = tf.train.Saver()
with tf.Session() as sess:
    final_acc = 0.0
    sess.run(tf.global_variables_initializer())
    
    for step in range(100):
        feed = {model_inputs: cnn_output_val, labels: y_test}
        _, acc_val = sess.run([train_op, acc], feed_dict=feed)
        final_acc += acc_val
        
    final_acc /= 100
    print ("Full Evaluation Accuracy : {}".format(final_acc))

Full Evaluation Accuracy : 0.5050318920612336


In [44]:
cnn_output_val.shape

(10662, 384)