In [1]:
import zipfile
import numpy as np
import tensorflow as tf

  return f(*args, **kwds)
  return f(*args, **kwds)


# 입력 데이터 정의

In [2]:
LOG_DIR='logs'

path_to_glove = "glove.840B.300d.zip"
PRE_TRAINED = True
GLOVE_SIZE = 300
batch_size = 128
embedding_dimension = 64
num_classes = 2
hidden_layer_size = 32
times_steps = 6

digit_to_word_map = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five",
                     6: "Six", 7: "Seven", 8: "Eight", 9: "Nine"}
digit_to_word_map[0] = "PAD_TOKEN"
even_sentences = []
odd_sentences = []
seqlens = []

In [3]:
for i in range(10000):
    rand_seq_len = np.random.choice(range(3, 7))
    seqlens.append(rand_seq_len)
    rand_odd_ints = np.random.choice(range(1, 10, 2),
                                     rand_seq_len)
    rand_even_ints = np.random.choice(range(2, 10, 2),
                                      rand_seq_len)

    if rand_seq_len < 6:
        rand_odd_ints = np.append(rand_odd_ints,
                                  [0]*(6-rand_seq_len))
        rand_even_ints = np.append(rand_even_ints,
                                   [0]*(6-rand_seq_len))

    even_sentences.append(" ".join([digit_to_word_map[r]
                          for r in rand_odd_ints]))
    odd_sentences.append(" ".join([digit_to_word_map[r]
                         for r in rand_even_ints]))

In [4]:
even_sentences[:5]

['Nine Five Nine One PAD_TOKEN PAD_TOKEN',
 'Three Seven One One PAD_TOKEN PAD_TOKEN',
 'Five One Nine Five PAD_TOKEN PAD_TOKEN',
 'One Three Three PAD_TOKEN PAD_TOKEN PAD_TOKEN',
 'Five Seven Three Seven Three Three']

In [5]:
odd_sentences[:5]

['Six Four Six Four PAD_TOKEN PAD_TOKEN',
 'Two Two Four Four PAD_TOKEN PAD_TOKEN',
 'Six Four Eight Four PAD_TOKEN PAD_TOKEN',
 'Four Six Four PAD_TOKEN PAD_TOKEN PAD_TOKEN',
 'Eight Four Six Four Two Eight']

In [6]:
data = even_sentences+odd_sentences
# same seq lengths for even, odd sentences
seqlens *= 2
labels = [1]*10000 + [0]*10000
for i in range(len(labels)):
    label = labels[i]
    one_hot_encoding = [0]*2
    one_hot_encoding[label] = 1
    labels[i] = one_hot_encoding

In [7]:
labels[:5]

[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]

In [8]:
word2index_map = {}
index = 0
for sent in data:
    for word in sent.split():
        if word not in word2index_map:
            word2index_map[word] = index
            index += 1

In [9]:
index2word_map = {index: word for word, index in word2index_map.items()}

vocabulary_size = len(index2word_map)

In [10]:
index2word_map

{0: 'Nine',
 1: 'Five',
 2: 'One',
 3: 'PAD_TOKEN',
 4: 'Three',
 5: 'Seven',
 6: 'Six',
 7: 'Four',
 8: 'Two',
 9: 'Eight'}

# 모델 학습 전 함수 및 변수 정의

In [11]:
def get_glove(path_to_glove, word2index_map):

    embedding_weights = {}
    count_all_words = 0
    with zipfile.ZipFile(path_to_glove) as z:
        with z.open("glove.840B.300d.txt") as f:
            for line in f:
                vals = line.split()
                word = str(vals[0].decode("utf-8"))
                if word in word2index_map:
                    print(word)
                    count_all_words += 1
                    coefs = np.asarray(vals[1:], dtype='float32')
                    coefs /= np.linalg.norm(coefs)
                    embedding_weights[word] = coefs
                if count_all_words == len(word2index_map) - 1:
                    break
    return embedding_weights

In [12]:
word2embedding_dict = get_glove(path_to_glove, word2index_map)
embedding_matrix = np.zeros((vocabulary_size, GLOVE_SIZE))

for word, index in word2index_map.items():
    if not word == "PAD_TOKEN":
        word_embedding = word2embedding_dict[word]
        embedding_matrix[index, :] = word_embedding

One
Two
Three
Four
Five
Six
Seven
Nine
Eight


In [13]:
data_indices = list(range(len(data)))
np.random.shuffle(data_indices)
data = np.array(data)[data_indices]
labels = np.array(labels)[data_indices]
seqlens = np.array(seqlens)[data_indices]
train_x = data[:10000]
train_y = labels[:10000]
train_seqlens = seqlens[:10000]

test_x = data[10000:]
test_y = labels[10000:]
test_seqlens = seqlens[10000:]

In [14]:
def get_sentence_batch(batch_size, data_x,
                       data_y, data_seqlens):
    instance_indices = list(range(len(data_x)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [[word2index_map[word] for word in data_x[i].split()]
         for i in batch]
    y = [data_y[i] for i in batch]
    seqlens = [data_seqlens[i] for i in batch]
    return x, y, seqlens


In [15]:
_inputs = tf.placeholder(tf.int32, shape=[batch_size, times_steps])
embedding_placeholder = tf.placeholder(tf.float32, [vocabulary_size,
                                                    GLOVE_SIZE])

_labels = tf.placeholder(tf.float32, shape=[batch_size, num_classes])
_seqlens = tf.placeholder(tf.int32, shape=[batch_size])

In [16]:
if PRE_TRAINED:
        embeddings = tf.Variable(tf.constant(0.0, shape=[vocabulary_size, GLOVE_SIZE]),
                                 trainable=True)
        # if using pre-trained embeddings, assign them to the embeddings variable
        embedding_init = embeddings.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings, _inputs)

else:
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size,
                               embedding_dimension],
                              -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, _inputs)

In [17]:
with tf.name_scope("biGRU"):
    with tf.variable_scope('forward'):
        gru_fw_cell = tf.contrib.rnn.GRUCell(hidden_layer_size)
        gru_fw_cell = tf.contrib.rnn.DropoutWrapper(gru_fw_cell)

    with tf.variable_scope('backward'):
        gru_bw_cell = tf.contrib.rnn.GRUCell(hidden_layer_size)
        gru_bw_cell = tf.contrib.rnn.DropoutWrapper(gru_bw_cell)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell,
                                                          cell_bw=gru_bw_cell,
                                                          inputs=embed,
                                                          sequence_length=_seqlens,
                                                          dtype=tf.float32,
                                                          scope="biGRU")
states = tf.concat(values=states, axis=1)

In [18]:
# This helper function taken from official TensorFlow documentation,
# simply add some ops that take care of logging summaries
# for tensorboard graph
def variable_summaries(scope_name,var):
    with tf.name_scope(scope_name):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)


In [19]:
weights = {
    'linear_layer': tf.Variable(tf.truncated_normal([2*hidden_layer_size,
                                                    num_classes],
                                                    mean=0, stddev=.01))
}
variable_summaries('W_linear',weights.get('linear_layer'))
biases = {
    'linear_layer': tf.Variable(tf.truncated_normal([num_classes],
                                                    mean=0, stddev=.01))
}
variable_summaries('B_linear',biases.get('linear_layer'))

In [20]:
print(weights.get('linear_layer'))

<tf.Variable 'Variable_1:0' shape=(64, 2) dtype=float32_ref>


In [21]:
# extract the final state and use in a linear layer
final_output = tf.matmul(states,
                         weights["linear_layer"]) + biases["linear_layer"]

softmax = tf.nn.softmax_cross_entropy_with_logits(logits=final_output,
                                                  labels=_labels)
cross_entropy = tf.reduce_mean(softmax)
tf.summary.scalar('cross_entropy', cross_entropy)

train_step = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(_labels, 1),
                              tf.argmax(final_output, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                   tf.float32))
tf.summary.scalar('accuracy', accuracy)

merged = tf.summary.merge_all()

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



# 모델 학습

In [22]:
step=1000

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(LOG_DIR + '/train',
                                         graph=tf.get_default_graph())
    test_writer = tf.summary.FileWriter(LOG_DIR + '/test',
                                        graph=tf.get_default_graph())
    
    sess.run(tf.global_variables_initializer())
    sess.run(embedding_init,
             feed_dict={embedding_placeholder: embedding_matrix})
    for step in range(step):
        x_batch, y_batch, seqlen_batch = get_sentence_batch(batch_size,
                                                            train_x, train_y,
                                                            train_seqlens)
        summary,_ = sess.run([merged,train_step], feed_dict={_inputs: x_batch, _labels: y_batch,
                                        _seqlens: seqlen_batch})
        train_writer.add_summary(summary, step)

        if step % 100 == 0:
            summary, acc = sess.run([merged, accuracy], feed_dict={_inputs: x_batch, _labels: y_batch, _seqlens: seqlen_batch})

            train_writer.add_summary(summary, step)
            print("Accuracy at %d: %.5f" % (step, acc))

    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),
                                 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)

    for test_batch in range(5):
        x_test, y_test, seqlen_test = get_sentence_batch(batch_size,
                                                         test_x, test_y,
                                                         test_seqlens)
        batch_pred, batch_acc = sess.run([tf.argmax(final_output, 1), accuracy],
                                         feed_dict={_inputs: x_test,
                                                    _labels: y_test,
                                                    _seqlens: seqlen_test})
        print("Test batch accuracy %d: %.5f" % (test_batch, batch_acc))

ref_word = normalized_embeddings_matrix[word2index_map["Three"]]

cosine_dists = np.dot(normalized_embeddings_matrix, ref_word)
ff = np.argsort(cosine_dists)[::-1][1:10]
for f in ff:
    print(index2word_map[f])
    print(cosine_dists[f])


Accuracy at 0: 0.50781
Accuracy at 100: 1.00000
Accuracy at 200: 1.00000
Accuracy at 300: 1.00000
Accuracy at 400: 1.00000
Accuracy at 500: 1.00000
Accuracy at 600: 1.00000
Accuracy at 700: 1.00000
Accuracy at 800: 1.00000
Accuracy at 900: 1.00000
Test batch accuracy 0: 1.00000
Test batch accuracy 1: 1.00000
Test batch accuracy 2: 1.00000
Test batch accuracy 3: 1.00000
Test batch accuracy 4: 1.00000
Three
1.0
Five
0.95007575
Seven
0.9058734
Nine
0.8764322
One
0.8663273
Four
-0.08899117
Two
-0.0921309
Six
-0.12420682
Eight
-0.13784575
