# 04.04 Eager execution (Word2Vec Visualize)

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import numpy as np

In [3]:
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf

In [29]:
def batch_gen(download_url, expected_byte, vocab_size, batch_size, 
                skip_window, visual_fld):
    local_dest = 'data/text8.zip'
    utils.download_one_file(download_url, local_dest, expected_byte)
    words = read_data(local_dest)
    dictionary, _ = build_vocab(words, vocab_size, visual_fld)
    index_words = convert_words_to_index(words, dictionary)
    del words           # to save memory
    single_gen = generate_sample(index_words, skip_window)
    
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(single_gen)
        yield center_batch, target_batch

In [7]:
def most_common_words(visual_fld, num_visualize):
    """ create a list of num_visualize most frequent words to visualize on TensorBoard.
    saved to visualization/vocab_[num_visualize].tsv
    """
    words = open(os.path.join(visual_fld, 'vocab.tsv'), 'r').readlines()[:num_visualize]
    words = [word for word in words]
    file = open(os.path.join(visual_fld, 'vocab_' + str(num_visualize) + '.tsv'), 'w')
    for word in words:
        file.write(word)
    file.close()

In [6]:
def safe_mkdir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass

### Model 파마메터 설정

In [4]:
VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128 # 단어의 임베딩 벡터의 차원을 정한다.
SKIP_WINDOW = 1 # 문맥 window
NUM_SAMPLED = 64 # 숫자 부정적인 예의 샘플
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 100000
VISUAL_FLD = 'visualization'
SKIP_STEP = 5000

### 다운로드 데이터

In [5]:
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016
NUM_VISUALIZE = 3000 # 각화 할 토큰 수

In [25]:
class SkipGramModel:
    """word2vec 모델로 부터 graph를 구성한다."""
    def __init__(self, dataset, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.get_variable('global_step', initializer=tf.constant(0), trainable=False) # 여기서 문제 발생 나중에 풀자
        self.skip_step = SKIP_STEP
        self.dataset = dataset

    
    def _import_data(self):
        # Step 1: 데이터 가져오기
        with tf.name_scope('data'):
            self.iterator = self.dataset.make_initializable_iterator()
            self.center_words, self.target_words = self.iterator.get_next()
    
    def _create_embedding(self):
        # Step 2 + 3: 가중치 선언 그리고 embedding 색인 처리
        with tf.name_scope('embed'):
            self.embed_matrix = tf.get_variable('embed_matrix',
                                               shape=[self.vocab_size, self.embed_size],
                                               initializer=tf.random_uniform_initializer())
            self.embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embedding')
    
    def _create_loss(self):
        # Step 4: 손실 함수 정의
        with tf.name_scope('loss'):
            # NCE loss 변수 만든다.
            nce_weight = tf.get_variable('nce_weight', 
                                         shape=[self.vocab_size, self.embed_size],
                                         initializer=tf.truncated_normal_initializer(stddev=1.0 / (self.embed_size ** 0.5)))
            nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))
            
            #  NCE 손실 함수를 손실 함수로 정의 한다.
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                     biases=nce_bias,
                                                     labels=self.target_words,
                                                     inputs=self.embed,
                                                     num_sampled=self.num_sampled,
                                                     num_classes=self.vocab_size),
                                                     name='loss')
    
    def _create_optimizer(self):
        # Step 5: 옵티마이져 정의
        self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)
    
    def _create_summaries(self):
        with tf.name_scope('summaries'):
            tf.summary.scalar('loss', self.loss)
            tf.summary.histogram('histogram loss', self.loss)
            # 한번에 합쳐 놓으면 관리하기가 편하다.
            self.summary_op = tf.summary.merge_all()
    
    def build_graph(self):
        """ 모델 graph 작성"""
        self._import_data()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()
        
    def train(self, num_train_steps):
        # 기본값은 모든 변수에 저장 - embed_matrix, nce_weight, nce_bias
        saver = tf.train.Saver()
        
        initial_step = 0
        safe_mkdir('checkpoints')
        with tf.Session() as sess:
            sess.run(self.iterator.initializer)
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
            
            # 만약 체크포인트가 존재 하면 복구 시킨다.
            if ckpt and ckpt.modle_checkpoint_path:
                saver.restore(sess, ckpt.modle_checkpoint_path)
                
            total_loss = 0.0 # 마지막 SKIP_STEP 단계의 평균 손실
            writer = tf.summary.FileWriter('graphs/word2vec/lr' + str(self.lr), sess.graph)
            initial_step = self.global_step.eval()
            
            for index in range(initial_step, initial_step + num_train_steps):
                try:
                    loss_batch, _, summary = sess.run([self.loss, self.optimizer, self.summary_op])
                    writer.add_summary(summary, global_step=index)
                    total_loss += loss_batch
                    if (index + 1) % self.skip_step == 0:
                        print('Average loss at step {}: {:5.1f}'.format(index, total/loss / self.skip_step))
                        total_loss = 0.0
                        saver.save(sess, 'checkpoints/skip-gram', index)
                except tf.errors.OutOfRangeError:
                    sess.run(self.iterator.initializer)
            writer.close()
    
    def visualize(self, visual_fid, num_visualize):
        """ tensorboard --logdir='visualization를 통해 임베딩 된것을 볼 수 있다. """
        most_common_words(visual_fld, num_visualize)
        
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
            
             # 만약 체크포인트가 존재 하면 복구 시킨다.
            if ckpt and ckpt.modle_checkpoint_path:
                saver.restore(sess, ckpt.modle_checkpoint_path)   
                
            final_embed_matrix = sess.run(self.embed_matrix)
            
            # 새로운 변수에 가지고 있는 embedding 을 저장한다.
            embedding_var = tf.Variable(final_embed_matrix[:num_visualize], name='embedding')
            sess.run(embedding_var.initializer)
            
            cofig = projector.ProjectorConfig()
            summary_writer = tf.summary.FileWriter(visual_fld)
            
            # 설정 파일에 embedding을 추가한다.
            embedding = config.embeddings.add()
            embedding.tensor_name = embedding_var.name
            
            # 사전의 단어를 텐서의 메타 데이터 파일에 링크합니다.
            embedding.metadata_path = 'vocab_' + str(num_visualize) + '.tsv'
            
            # TensorBoad가 시작될때 사용할 설정 파일을 저장한다.
            projector.visualize_embeddings(summary_writer, config)
            saver_embed = tf.train.Saver([embedding_var])
            saver_embed.save(sess, os.path.join(visual_fld, 'model.ckpt'), 1)

In [30]:
def gen():
    yield from batch_gen(DOWNLOAD_URL, EXPECTED_BYTES, VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW, VISUAL_FLD)

In [33]:
def main():
    dataset = tf.data.Dataset.from_generator(gen, 
                                (tf.int32, tf.int32), 
                                (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE, 1])))
    model = SkipGramModel(dataset, VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    model.train(NUM_TRAIN_STEPS)
    model.visualize(VISUAL_FLD, NUM_VISUALIZE)

In [34]:
if __name__ == '__main__':
    main()

ValueError: Variable global_step already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-16-772611d1bd85>", line 9, in __init__
    self.global_step = tf.get_variable('global_step', initializer=tf.constant(0), trainable=False)
  File "<ipython-input-17-4bc92ee3d28c>", line 5, in main
    model = SkipGramModel(dataset, VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
  File "<ipython-input-18-c7bc734e5e35>", line 2, in <module>
    main()
