In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf

import utils
import word2vec_utils

  from ._conv import register_converters as _register_converters


In [2]:
# Model hyperparameters
VOCAB_SIZE = 50000          # rows of the embedding and NCE weight matrices; also num_classes for the NCE loss
BATCH_SIZE = 128            # examples per batch (leading dimension of the dataset element shapes)
EMBED_SIZE = 128            # dimension of the word embedding vectors
SKIP_WINDOW = 1             # the context window
NUM_SAMPLED = 64            # number of negative examples to sample
LEARNING_RATE = 1.0         # learning rate handed to the gradient-descent optimizer
NUM_TRAIN_STEPS = 100000    # number of iterations of the training loop
VISUAL_FLD = 'visualization'  # passed to word2vec_utils.batch_gen; presumably an output folder for visualization files -- TODO confirm
SKIP_STEP = 1000            # the average loss is printed once every SKIP_STEP training steps

In [3]:
# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'   # passed to word2vec_utils.batch_gen as the location of the corpus archive
EXPECTED_BYTES = 31344016   # passed to batch_gen with the URL; presumably the expected archive size used to validate the download -- TODO confirm
NUM_VISUALIZE = 3000        # number of tokens to visualize

In [6]:
def word2vec(dataset):
    """Build the graph for the word2vec model and train it.

    The graph looks up an embedding for every center word, scores it against
    the target words with a noise-contrastive estimation (NCE) loss, and
    minimizes that loss with plain gradient descent for NUM_TRAIN_STEPS steps.

    Args:
        dataset: a tf.data.Dataset whose elements unpack into
            (center_words, target_words). main() in this file supplies int32
            batches of shape [BATCH_SIZE] and [BATCH_SIZE, 1] respectively.

    Returns:
        None. The trained variables live only inside the session opened here.

    Side effects:
        * Writes the graph definition for TensorBoard to
          './graphs/word2vec_simple'.
        * Prints the average loss once every SKIP_STEP steps.
    """
    # Step 1: get input, output from the dataset
    with tf.name_scope('data'):
        iterator = dataset.make_initializable_iterator()
        center_words, target_words = iterator.get_next()

    # Step 2 + 3: define weights and embedding lookup.
    # In word2vec, it's actually the weights that we care about.
    with tf.name_scope('embed'):
        embedding_weights = tf.get_variable(name='embedding_weights',
                                            shape=(VOCAB_SIZE, EMBED_SIZE),
                                            initializer=tf.random_uniform_initializer())
        embed = tf.nn.embedding_lookup(embedding_weights, center_words, name='embeddings')

    # Step 4: construct variables for NCE loss and define loss function
    with tf.name_scope('loss'):
        nce_weight = tf.get_variable('nce_weight', shape=[VOCAB_SIZE, EMBED_SIZE],
                            initializer=tf.truncated_normal_initializer(stddev=1.0 / (EMBED_SIZE ** 0.5)))
        nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))

        # define loss function to be NCE loss function, averaged over the batch
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                            biases=nce_bias,
                                            labels=target_words,
                                            inputs=embed,
                                            num_sampled=NUM_SAMPLED,
                                            num_classes=VOCAB_SIZE), name='loss')

    # Step 5: define optimizer
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        sess.run(iterator.initializer)
        sess.run(tf.global_variables_initializer())

        # running sum used to compute the average loss over the last SKIP_STEP steps
        total_loss = 0.0
        writer = tf.summary.FileWriter('./graphs/word2vec_simple', sess.graph)
        try:
            for index in range(NUM_TRAIN_STEPS):
                try:
                    loss_batch, _ = sess.run([loss, optimizer])
                    total_loss += loss_batch
                    if (index + 1) % SKIP_STEP == 0:
                        # `index` is zero-based, so the first report is labelled step 999.
                        print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                        total_loss = 0.0
                except tf.errors.OutOfRangeError:
                    # The dataset ran dry: rewind the iterator and keep going.
                    # This iteration performs no training step.
                    sess.run(iterator.initializer)
        finally:
            # Close the writer even when training is interrupted (e.g. a kernel
            # interrupt during the long loop) so the event file is flushed and
            # its handle released.
            writer.close()

In [7]:

def gen():
    """Yield every batch produced by word2vec_utils.batch_gen.

    This zero-argument wrapper binds the module-level download and model
    settings so the generator can be handed to
    tf.data.Dataset.from_generator, as main() does.
    """
    batches = word2vec_utils.batch_gen(
        DOWNLOAD_URL,
        EXPECTED_BYTES,
        VOCAB_SIZE,
        BATCH_SIZE,
        SKIP_WINDOW,
        VISUAL_FLD,
    )
    yield from batches

def main():
    """Reset the default graph, wrap gen() in a Dataset and train word2vec on it."""
    tf.reset_default_graph()

    # Each element is a pair of int32 tensors, consumed by word2vec() as
    # (center_words, target_words).
    element_types = (tf.int32, tf.int32)
    element_shapes = (
        tf.TensorShape([BATCH_SIZE]),
        tf.TensorShape([BATCH_SIZE, 1]),
    )

    dataset = tf.data.Dataset.from_generator(gen, element_types, element_shapes)
    word2vec(dataset)

# Start training only when this code runs as the top-level program (a script
# or a notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

data/text8.zip already exists
Average loss at step 999: 132.0
Average loss at step 1999:  73.4
Average loss at step 2999:  51.8
Average loss at step 3999:  39.2
Average loss at step 4999:  30.7
Average loss at step 5999:  24.9
Average loss at step 6999:  20.7
Average loss at step 7999:  17.3
Average loss at step 8999:  15.8
Average loss at step 9999:  13.4
Average loss at step 10999:  11.3
Average loss at step 11999:  10.4
Average loss at step 12999:   9.5
Average loss at step 13999:   8.4
Average loss at step 14999:   7.8
Average loss at step 15999:   7.5
Average loss at step 16999:   6.8
Average loss at step 17999:   6.7
Average loss at step 18999:   6.4
Average loss at step 19999:   6.1
Average loss at step 20999:   5.8
Average loss at step 21999:   5.9
Average loss at step 22999:   5.6
Average loss at step 23999:   5.5
Average loss at step 24999:   5.5
Average loss at step 25999:   5.4
Average loss at step 26999:   5.4
Average loss at step 27999:   5.2
Average loss at step 28999:  