In [1]:
import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from datetime import datetime, date, time

# Training hyper-parameters.
batch_size = 64            # skip-gram pairs fed per training step
embedding_dimension = 5    # length of each word vector
negative_samples = 8       # noise words sampled per batch by the NCE loss

# Directory where TensorBoard summaries, checkpoints and projector metadata are written.
# A previous, user-specific timestamped path was assigned here and then immediately
# overwritten; only the effective value is kept.
LOG_DIR = "/tmp/logs/word2vec_intro_2"

# Lookup from a digit to its English name; these nine words form the whole vocabulary.
digit_to_word_map = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five",
                     6: "Six", 7: "Seven", 8: "Eight", 9: "Nine"}
sentences = []

# Build the toy corpus: each round appends one three-word sentence drawn from the
# odd digits followed by one three-word sentence drawn from the even digits.
for _ in range(10000):
    for digit_pool in (range(1, 10, 2), range(2, 10, 2)):
        drawn_digits = np.random.choice(digit_pool, 3)
        sentences.append(" ".join(digit_to_word_map[d] for d in drawn_digits))

# To experiment with a single hand-written sentence instead, replace the corpus:
# sentences = ["Eight Four Six"]
print('({}) ({}) ({}) ({})'.format(*sentences[:4]))

(Nine Five One) (Two Two Four) (Five Five Seven) (Six Six Two)


In [2]:
# Map each distinct lower-cased word to an integer id, numbered in order of first appearance.
word2index_map = {}
for sent in sentences:
    for word in sent.lower().split():
        # setdefault leaves known words untouched; a new word receives the next free id.
        word2index_map.setdefault(word, len(word2index_map))

# Reverse mapping: integer id -> word.
index2word_map = dict((idx, word) for word, idx in word2index_map.items())
vocabulary_size = len(index2word_map)

print('word2index_map :{}'.format(word2index_map))
print('index2word_map :{}'.format(index2word_map))
print('vocabulary_size :{}'.format(vocabulary_size))


word2index_map :{'nine': 0, 'five': 1, 'one': 2, 'two': 3, 'four': 4, 'seven': 5, 'six': 6, 'eight': 7, 'three': 8}
index2word_map :{0: 'nine', 1: 'five', 2: 'one', 3: 'two', 4: 'four', 5: 'seven', 6: 'six', 7: 'eight', 8: 'three'}
vocabulary_size :9


In [3]:
# Build skip-gram pairs using a context window of one word on each side.
# Every stored pair is [target word id, context word id].
skip_gram_pairs = []
for sample_seq, sent in enumerate(sentences, start=1):
    tokens = sent.lower().split()
    if sample_seq == 1:
        # Debug: show the tokens of the first sentence.
        print(tokens)
    # Only interior positions have both a left and a right neighbour.
    for pos in range(1, len(tokens) - 1):
        left_id, target_id, right_id = (word2index_map[tok]
                                        for tok in tokens[pos - 1:pos + 2])
        skip_gram_pairs.extend([[target_id, left_id],
                                [target_id, right_id]])
        if sample_seq == 10:
            # Debug: show the context/target triple of the tenth sentence and
            # the two pairs derived from it.
            word_context_pair = [[left_id, right_id], target_id]
            print(word_context_pair)
            print([target_id, left_id])
            print([target_id, right_id])

['nine', 'five', 'one']
[[4, 3], 7]
[7, 4]
[7, 3]


In [4]:
# Peek at the total number of pairs and at the four pairs stored at positions 10-13.
(len(skip_gram_pairs),) + tuple(skip_gram_pairs[pos] for pos in range(10, 14))

(40000, [7, 4], [7, 6], [5, 8], [5, 5])

In [5]:
def get_skipgram_batch(batch_size):
    """Draw a random mini-batch of skip-gram pairs without replacement.

    Reads the module-level ``skip_gram_pairs`` list, whose entries are
    ``[target_index, context_index]`` pairs.

    Args:
        batch_size: Number of pairs to draw. If it exceeds the number of
            available pairs, every pair is returned once, in random order.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a list of target-word indices and
        ``y`` is a list of one-element lists, each holding the context-word
        index paired with the target at the same position in ``x``.
    """
    # Permute an integer index array instead of building and shuffling a
    # Python list of every index on each call; the slice keeps the original
    # "truncate if too large" behaviour.
    batch = np.random.permutation(len(skip_gram_pairs))[:batch_size]
    x = [skip_gram_pairs[i][0] for i in batch]      # Target word
    y = [[skip_gram_pairs[i][1]] for i in batch]    # Context word
    return x, y

In [6]:
# Sanity-check the batch generator: draw one batch and decode its first three pairs.
x, y = get_skipgram_batch(batch_size)
print('x:{}, y:{}'.format(x[:3], y[:3]))
preview_targets = [index2word_map[target_id] for target_id in x[:3]]
# Each label is a one-element list, so unpack it to reach the word id.
preview_contexts = [index2word_map[context_id] for (context_id,) in y[:3]]
print('x(target word) : ', preview_targets)
print('y(context word) : ', preview_contexts)


x:[8, 6, 6], y:[[8], [6], [4]]
x(target word) :  ['three', 'six', 'six']
y(context word) :  ['three', 'six', 'four']


In [7]:
# Graph inputs: one target-word id per example, and the matching context-word id
# as a [batch_size, 1] column of labels.
train_inputs = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
train_labels = tf.placeholder(dtype=tf.int32, shape=(batch_size, 1))

In [8]:
# Trainable embedding matrix: one row of length embedding_dimension per vocabulary word.
# NOTE(review): the original comment stated the lookup is only implemented on CPU — unverified here.
with tf.name_scope("embeddings"):
    initial_embeddings = tf.random_uniform(shape=[vocabulary_size, embedding_dimension],
                                           minval=-1.0, maxval=1.0)
    embeddings = tf.Variable(initial_embeddings, name='embedding')
    # Essentially a lookup table: picks the embedding row of every word id in the batch.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
(embeddings, train_inputs, embed)

(<tf.Variable 'embeddings/embedding:0' shape=(9, 5) dtype=float32_ref>,
 <tf.Tensor 'Placeholder:0' shape=(64,) dtype=int32>,
 <tf.Tensor 'embeddings/embedding_lookup:0' shape=(64, 5) dtype=float32>)

In [9]:
# Parameters of the NCE loss: one weight row and one bias per vocabulary word.
nce_weight_init = tf.truncated_normal(shape=[vocabulary_size, embedding_dimension],
                                      stddev=1.0 / math.sqrt(embedding_dimension))
nce_weights = tf.Variable(nce_weight_init)
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Per-example NCE loss, averaged over the batch into a single scalar.
per_example_loss = tf.nn.nce_loss(weights=nce_weights,
                                  biases=nce_biases,
                                  inputs=embed,
                                  labels=train_labels,
                                  num_sampled=negative_samples,
                                  num_classes=vocabulary_size)
loss = tf.reduce_mean(per_example_loss)
# Register the scalar so it is picked up by the merged summary.
tf.summary.scalar("NCE_loss", loss)


<tf.Tensor 'NCE_loss_1:0' shape=() dtype=string>

In [14]:
# Learning rate decay: start at 0.01 and multiply by 0.95 every 1000 steps (staircase).
global_step = tf.Variable(0, trainable=False)
learningRate = tf.train.exponential_decay(learning_rate=0.01,
                                          global_step=global_step,
                                          decay_steps=1000,
                                          decay_rate=0.95,
                                          staircase=True)
# Pass global_step to minimize() so the optimizer increments it on every update;
# otherwise it stays at 0 and the decay schedule above never takes effect.
train_step = tf.train.GradientDescentOptimizer(learningRate).minimize(
    loss, global_step=global_step)

# Single op that evaluates every summary registered so far.
merged = tf.summary.merge_all()

In [30]:
# Train the skip-gram model, writing summaries, checkpoints and projector
# metadata under LOG_DIR, then extract the row-normalized embedding matrix.
with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(LOG_DIR,
                                         graph=tf.get_default_graph())
    saver = tf.train.Saver()

    # Projector labels: a header row, then one "word<TAB>id" row per vocabulary entry.
    # NOTE(review): assumes LOG_DIR exists by this point (presumably created by
    # the FileWriter above) — confirm.
    with open(os.path.join(LOG_DIR, 'metadata.tsv'), "w") as metadata:
        metadata.write('Name\tClass\n')
        for k, v in index2word_map.items():
            metadata.write('%s\t%d\n' % (v, k))

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embeddings.name
    # Link this tensor to its metadata file (e.g. labels).
    embedding.metadata_path = os.path.join(LOG_DIR, 'metadata.tsv')
    projector.visualize_embeddings(train_writer, config)

    tf.global_variables_initializer().run()

    for step in range(10000):
        x_batch, y_batch = get_skipgram_batch(batch_size)
        if step == 0:
            # Show the very first batch once as a sanity check.
            print(x_batch, y_batch)
        # out_nce_weights is fetched so the NCE weights can be inspected after training.
        summary, out_nce_weights, _ = sess.run([merged, nce_weights, train_step],
                              feed_dict={train_inputs: x_batch,
                                         train_labels: y_batch})
        train_writer.add_summary(summary, step)

        if step % 1000 == 0:
            saver.save(sess, os.path.join(LOG_DIR, "w2v_model.ckpt"), step)
            # Loss is re-evaluated on the same batch, after the update step above.
            loss_value = sess.run(loss,
                                  feed_dict={train_inputs: x_batch,
                                             train_labels: y_batch})
            print("Loss at %d: %.5f" % (step, loss_value))

    # Flush buffered summary events to disk and release the event file;
    # without this the last events may never be written.
    train_writer.close()

    # Normalize embeddings before using: divide every row by its L2 norm.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)
print("completed !!")

[2, 7, 7, 4, 4, 4, 6, 5, 6, 3, 6, 4, 6, 2, 0, 7, 4, 3, 1, 2, 8, 0, 3, 6, 7, 5, 2, 2, 8, 4, 2, 7, 7, 4, 6, 6, 7, 6, 7, 3, 6, 8, 1, 3, 2, 0, 7, 0, 6, 5, 5, 4, 1, 1, 4, 6, 3, 7, 4, 0, 4, 6, 1, 8] [[2], [4], [3], [7], [6], [7], [7], [2], [4], [3], [6], [7], [6], [0], [1], [3], [6], [7], [1], [2], [0], [8], [6], [7], [7], [0], [2], [1], [2], [3], [0], [6], [4], [6], [3], [7], [6], [6], [6], [3], [7], [1], [1], [6], [2], [1], [4], [2], [3], [0], [8], [6], [8], [1], [4], [6], [7], [4], [4], [8], [6], [3], [2], [5]]
Loss at 0: 7.68430
Loss at 1000: 3.01388
Loss at 2000: 2.83595
Loss at 3000: 2.64399
Loss at 4000: 2.58955
Loss at 5000: 2.52692
Loss at 6000: 2.49235
Loss at 7000: 2.52326
Loss at 8000: 2.55969
Loss at 9000: 2.46295
completed !!


In [20]:
# Inspect the graph handles (not their values): the normalized-embedding tensor,
# the embedding variable, and the per-row norm tensor built in the training cell.
normalized_embeddings, embeddings, norm

(<tf.Tensor 'truediv_4:0' shape=(9, 5) dtype=float32>,
 <tf.Variable 'embeddings/embedding:0' shape=(9, 5) dtype=float32_ref>,
 <tf.Tensor 'Sqrt_4:0' shape=(9, 1) dtype=float32>)

In [31]:
# Display the row-normalized embedding matrix evaluated in the training session.
normalized_embeddings_matrix

array([[ 0.01851027, -0.38256797, -0.69862163, -0.48257422, -0.3637981 ],
       [ 0.15151925, -0.45563003, -0.63779   , -0.5848783 , -0.14347272],
       [-0.02436541, -0.39945036, -0.6633278 , -0.5176547 , -0.36314663],
       [-0.35064706,  0.64548475, -0.21336372, -0.37061203,  0.52680045],
       [-0.52213836,  0.50760204, -0.4818405 ,  0.02821813,  0.48656473],
       [ 0.1887003 , -0.14939962, -0.50887835, -0.714728  , -0.41506463],
       [-0.46791896,  0.6075562 , -0.31650034, -0.23186843,  0.507929  ],
       [-0.6220929 ,  0.58995175, -0.22646055, -0.28524277,  0.36374375],
       [-0.11415682, -0.7390903 , -0.55131435, -0.3517108 ,  0.114306  ]],
      dtype=float32)

In [32]:
# Reference vector for the similarity query: the embedding of the word "one".
# Look the row up by word rather than by a fixed position, because word ids are
# assigned in order of first appearance in the randomly generated corpus and
# therefore change from run to run.
ref_word = normalized_embeddings_matrix[word2index_map["one"]]

In [33]:
# Rows were normalized to unit length, so the dot product with ref_word is the
# cosine similarity of every word to the reference word.
cosine_dists = normalized_embeddings_matrix.dot(ref_word)
# Rank word ids from most to least similar, drop the top hit (expected to be
# the reference word itself) and keep up to nine neighbours.
descending_ids = np.argsort(cosine_dists)[::-1]
ff = descending_ids[1:10]
for word_id in ff:
    print(index2word_map[word_id])
    print(cosine_dists[word_id])

nine
0.9977
five
0.95624024
seven
0.91334486
three
0.8042685
eight
-0.054716364
four
-0.0617231
six
-0.08576897
two
-0.107222125


In [26]:
# Display the dot product of every normalized embedding with ref_word.
# NOTE(review): this cell's execution count (26) is lower than that of the cell
# computing cosine_dists (33), so any saved output may come from an earlier run.
cosine_dists

array([ 0.9731061 ,  0.8959499 ,  0.9999998 , -0.03284888, -0.03454876,
        0.77516216, -0.03165343,  0.02024998,  0.79811215], dtype=float32)