# 04.03 Eager execution (Word2Vec)

In [48]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import gzip
import numpy as np
import urllib
import zipfile
from collections import Counter
import random

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe

  from ._conv import register_converters as _register_converters


In [39]:
def safe_mkdir(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and an already existing
    directory is left untouched. Unlike a blanket ``except OSError: pass``,
    genuine failures (for example insufficient permissions, or ``path``
    naming an existing regular file) are raised to the caller instead of
    being silently ignored.

    Args:
        path: Directory to create.

    Raises:
        OSError: If the directory cannot be created.
    """
    os.makedirs(path, exist_ok=True)

In [30]:
def download_one_file(download_url, 
                    local_dest, 
                    expected_byte=None, 
                    unzip_and_remove=False):
    """Download ``download_url`` to ``local_dest`` unless it is already present.

    Args:
        download_url: URL of the file to fetch.
        local_dest: Path the downloaded file is written to. Its parent
            directory is created when missing.
        expected_byte: Optional expected size in bytes. When given and the
            downloaded file has a different size, a warning is printed and
            no extraction is attempted.
        unzip_and_remove: When True, the downloaded file is treated as a
            gzip archive: it is decompressed to ``local_dest`` without its
            last three characters (i.e. without a ``.gz`` suffix) and the
            archive is then deleted.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Imported locally: the module only does `import urllib`, which does not
    # guarantee that the `urllib.request` submodule is loaded, and `shutil`
    # is not imported at module level at all.
    import shutil
    import urllib.request

    # Name of the decompressed file; the slice assumes a ".gz" suffix.
    extracted_dest = local_dest[:-3]

    if os.path.exists(local_dest) or os.path.exists(extracted_dest):
        print('%s already exists' %local_dest)
        return

    parent_dir = os.path.dirname(local_dest)
    if parent_dir:
        # urlretrieve cannot create the destination directory by itself.
        os.makedirs(parent_dir, exist_ok=True)

    print('Downloading %s' %download_url)
    urllib.request.urlretrieve(download_url, local_dest)

    if expected_byte:
        if os.stat(local_dest).st_size != expected_byte:
            print('The downloaded file has unexpected number of bytes')
            return
        print('Successfully downloaded %s' %local_dest)

    # Extraction no longer depends on expected_byte having been supplied.
    if unzip_and_remove:
        with gzip.open(local_dest, 'rb') as f_in, open(extracted_dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_dest)

In [46]:
def generate_sample(index_words, context_window_size):
    """Yield (center, target) training pairs following the skip-gram model.

    For every position a window radius is drawn at random from
    ``1..context_window_size``; each word within that radius before the
    center word, then each word within it after the center word, is paired
    with the center word.

    Args:
        index_words: Sliceable sequence of word indices.
        context_window_size: Largest window radius that may be drawn.

    Yields:
        ``(center, target)`` tuples.
    """
    for position, center_word in enumerate(index_words):
        radius = random.randint(1, context_window_size)
        # Neighbours that precede the center word (clamped at the start).
        first_before = max(0, position - radius)
        yield from ((center_word, neighbour)
                    for neighbour in index_words[first_before:position])
        # Neighbours that follow the center word.
        last_after = position + radius + 1
        yield from ((center_word, neighbour)
                    for neighbour in index_words[position + 1:last_after])

In [44]:
def convert_words_to_index(words, dictionary):
    """Replace each word in the dataset with its index in ``dictionary``.

    Words that are not present in ``dictionary`` are mapped to index 0.
    """
    unknown_index = 0
    lookup = dictionary.get
    return [lookup(token, unknown_index) for token in words]

In [40]:
def build_vocab(words, vocab_size, visual_fld):
    """Build the vocabulary of the most frequent words and write it to disk.

    The vocabulary consists of the placeholder ``'UNK'`` at index 0 followed
    by the ``vocab_size - 1`` most common words, in decreasing frequency.
    One word per line is written to ``<visual_fld>/vocab.tsv``.

    Args:
        words: Iterable of word tokens.
        vocab_size: Total vocabulary size, including the ``'UNK'`` entry.
        visual_fld: Directory (created if needed) that receives ``vocab.tsv``.

    Returns:
        A ``(dictionary, index_dictionary)`` tuple: ``dictionary`` maps a
        word to its index and ``index_dictionary`` maps an index back to
        its word.
    """
    safe_mkdir(visual_fld)

    # Index 0 is reserved for unknown words; -1 is only a placeholder count.
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))

    dictionary = dict()
    # The context manager closes the file even if a write fails part-way.
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'w') as vocab_file:
        for index, (word, _) in enumerate(count):
            dictionary[word] = index
            vocab_file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

In [35]:
def read_data(file_path):
    """Read the first member of the zip archive and split it into tokens.

    The archive member is converted to ``str`` and split on whitespace.
    According to the original note, the text8 data should yield
    17,005,207 tokens.

    Args:
        file_path: Path to the zip archive.

    Returns:
        List of word tokens.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw_text = tf.compat.as_str(archive.read(first_member))
        tokens = raw_text.split()
    return tokens

In [31]:
def batch_gen(download_url, expected_byte, vocab_size, batch_size, 
                skip_window, visual_fld):
    """Yield skip-gram training batches built from the downloaded corpus.

    The corpus is downloaded to ``data/text8.zip`` (if not already there),
    tokenized, converted to vocabulary indices and turned into
    (center, target) pairs by ``generate_sample``.

    Args:
        download_url: URL of the zipped corpus.
        expected_byte: Expected size of the download in bytes.
        vocab_size: Vocabulary size passed to ``build_vocab``.
        batch_size: Number of (center, target) pairs per batch.
        skip_window: Maximum context window radius.
        visual_fld: Directory that receives ``vocab.tsv``.

    Yields:
        ``(center_batch, target_batch)`` where ``center_batch`` is an int32
        array of shape ``(batch_size,)`` and ``target_batch`` is an int32
        array of shape ``(batch_size, 1)``. The generator ends once the
        corpus is exhausted; a trailing incomplete batch is discarded.
    """
    local_dest = 'data/text8.zip'
    # Make sure the download directory exists before fetching into it.
    safe_mkdir(os.path.dirname(local_dest))
    download_one_file(download_url, local_dest, expected_byte)
    words = read_data(local_dest)
    dictionary, _ = build_vocab(words, vocab_size, visual_fld)
    index_words = convert_words_to_index(words, dictionary)
    del words           # to save memory
    single_gen = generate_sample(index_words, skip_window)
    
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        # Targets are word indices, so use an integer dtype like the centers.
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            try:
                center, target = next(single_gen)
            except StopIteration:
                # Letting StopIteration escape a generator is turned into a
                # RuntimeError (PEP 479); finish cleanly instead.
                return
            center_batch[index] = center
            target_batch[index] = target
        yield center_batch, target_batch

### eager mode 실행

In [3]:
# Enable TensorFlow eager execution for the rest of the notebook.
tfe.enable_eager_execution()

### Model 파라미터 설정

In [4]:
# Hyper-parameters and settings used by the model and the training loop.
VOCAB_SIZE = 50000  # vocabulary size handed to build_vocab (includes 'UNK')
BATCH_SIZE = 128  # number of (center, target) pairs per batch
EMBED_SIZE = 128 # dimensionality of each word's embedding vector
SKIP_WINDOW = 1 # context window
NUM_SAMPLED = 64 # number of negative examples to sample
LEARNING_RATE = 1.0  # learning rate for the gradient descent optimizer
NUM_TRAIN_STEPS = 100000  # total number of training steps run by main()
VISUAL_FLD = 'visualization'  # directory that receives vocab.tsv
SKIP_STEP = 5000  # the average loss is printed every SKIP_STEP steps

### 다운로드 데이터

In [5]:
# Location of the zipped corpus and the size in bytes the download is
# checked against; both are passed to batch_gen by gen().
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016

In [22]:
class Word2Vec:
    """Skip-gram word2vec model whose loss is computed with tf.nn.nce_loss."""

    def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
        """Create the embedding matrix and the NCE weight and bias variables.

        Args:
            vocab_size: Number of words in the vocabulary.
            embed_size: Dimensionality of each embedding vector.
            num_sampled: Number of negative examples sampled per batch.
        """
        self.vocab_size = vocab_size
        self.num_sampled = num_sampled

        initial_embeddings = tf.random_uniform([vocab_size, embed_size])
        self.embed_matrix = tfe.Variable(initial_embeddings)

        # Standard deviation shrinks with the square root of the embed size.
        weight_stddev = 1.0 / (embed_size ** 0.5)
        initial_weights = tf.truncated_normal([vocab_size, embed_size],
                                              stddev=weight_stddev)
        self.nce_weight = tfe.Variable(initial_weights)

        self.nce_bias = tfe.Variable(tf.zeros([vocab_size]))

    def compute_loss(self, center_words, target_words):
        """Return the mean NCE loss for a batch of (center, target) pairs.

        The objective favours assigning high probability to the real target
        words and low probability to sampled noise words. Because the loss
        is computed only over the chosen number of noise words rather than
        over every word in the vocabulary, it is cheap to evaluate, which
        makes training faster. TensorFlow's helper ``tf.nn.nce_loss``
        (noise-contrastive estimation) is used for this.

        Args:
            center_words: Indices of the center words, looked up in the
                embedding matrix.
            target_words: Indices of the target words, used as labels.

        Returns:
            The NCE loss averaged over the batch.
        """
        center_embeddings = tf.nn.embedding_lookup(self.embed_matrix,
                                                   center_words)
        per_example_loss = tf.nn.nce_loss(weights=self.nce_weight,
                                          biases=self.nce_bias,
                                          labels=target_words,
                                          inputs=center_embeddings,
                                          num_sampled=self.num_sampled,
                                          num_classes=self.vocab_size)
        return tf.reduce_mean(per_example_loss)

In [27]:
def gen():
    """Yield training batches from batch_gen using the module-level settings."""
    yield from batch_gen(download_url=DOWNLOAD_URL,
                         expected_byte=EXPECTED_BYTES,
                         vocab_size=VOCAB_SIZE,
                         batch_size=BATCH_SIZE,
                         skip_window=SKIP_WINDOW,
                         visual_fld=VISUAL_FLD)

In [50]:
def main():
    """Train the Word2Vec model and print the average loss periodically.

    Batches come from ``gen`` through a ``tf.data`` dataset. Training runs
    for ``NUM_TRAIN_STEPS`` steps, iterating over the dataset again whenever
    it is exhausted, and every ``SKIP_STEP`` steps the loss averaged over
    those steps is printed.
    """
    output_types = (tf.int32, tf.int32)
    output_shapes = (tf.TensorShape([BATCH_SIZE]),
                     tf.TensorShape([BATCH_SIZE, 1]))
    dataset = tf.data.Dataset.from_generator(gen, output_types, output_shapes)
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
    model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)
    grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)

    running_loss = 0.0  # loss accumulated over the last SKIP_STEP steps
    steps_done = 0
    while steps_done < NUM_TRAIN_STEPS:
        for center_words, target_words in tfe.Iterator(dataset):
            if steps_done >= NUM_TRAIN_STEPS:
                break
            loss_batch, grads = grad_fn(center_words, target_words)
            running_loss += loss_batch
            optimizer.apply_gradients(grads)
            steps_done += 1
            if steps_done % SKIP_STEP == 0:
                # Report the zero-based index of the step just completed.
                print('Average loss at step {}: {:5.1f}'.format(steps_done - 1, running_loss / SKIP_STEP))
                running_loss = 0.0

In [51]:
# Run the training loop only when this file is executed as a script.
if __name__ == '__main__':
    main()

data/text8.zip already exists
Average loss at step 4999:  65.4
Average loss at step 9999:  18.3
Average loss at step 14999:   9.6
Average loss at step 19999:   6.7
Average loss at step 24999:   5.7
Average loss at step 29999:   5.2
Average loss at step 34999:   5.0
Average loss at step 39999:   4.9
Average loss at step 44999:   4.8
Average loss at step 49999:   4.8
Average loss at step 54999:   4.7
Average loss at step 59999:   4.7
Average loss at step 64999:   4.7
Average loss at step 69999:   4.7
Average loss at step 74999:   4.6
Average loss at step 79999:   4.7
Average loss at step 84999:   4.7
Average loss at step 89999:   4.7
Average loss at step 94999:   4.6
Average loss at step 99999:   4.6
