In [1]:
import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# --- Hyper-parameters and paths -------------------------------------------
batch_size = 64            # number of (word, context) pairs per training step
embedding_dimension = 5    # width of each word vector
negative_samples = 8       # passed to tf.nn.nce_loss as num_sampled
# Build the path with os.path.join: the old literal "logs\word2vec_intro"
# contained the invalid escape sequence "\w" and hard-coded a Windows
# separator, so on POSIX systems it named a single oddly-titled directory.
LOG_DIR = os.path.join("logs", "word2vec_intro")
digit_to_word_map = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five",
                     6: "Six", 7: "Seven", 8: "Eight", 9: "Nine"}

# --- Synthetic corpus -----------------------------------------------------
# Create two kinds of sentences - sequences of odd and even digits.
# Each iteration appends one three-word "odd" sentence followed by one
# three-word "even" sentence, so the list alternates odd/even.
sentences = []
for i in range(10000):
    rand_odd_ints = np.random.choice(range(1, 10, 2), 3)
    sentences.append(" ".join([digit_to_word_map[r] for r in rand_odd_ints]))
    rand_even_ints = np.random.choice(range(2, 10, 2), 3)
    sentences.append(" ".join([digit_to_word_map[r] for r in rand_even_ints]))

# Notebook-style preview of the first sentences; as a bare expression in the
# middle of a cell/script it has no effect.
sentences[0:10]
# Build the vocabulary: every distinct lower-cased word gets the next free
# integer id, in order of first appearance in the corpus.
word2index_map = {}
for sentence in sentences:
    for token in sentence.lower().split():
        # setdefault only inserts when the token is new, so the current dict
        # size is always the next unused id.
        word2index_map.setdefault(token, len(word2index_map))

# Reverse lookup (id -> word) and the resulting vocabulary size.
index2word_map = {word_id: token for token, word_id in word2index_map.items()}
vocabulary_size = len(index2word_map)

## Create skip-grams
# For every interior word of a sentence, emit two [word, context] id pairs:
# one with the word before it and one with the word after it.
skip_gram_pairs = []
for sentence in sentences:
    token_ids = [word2index_map[token] for token in sentence.lower().split()]
    # Slide a three-wide window over the sentence: (previous, centre, next).
    for previous_id, centre_id, next_id in zip(token_ids, token_ids[1:], token_ids[2:]):
        skip_gram_pairs.append([centre_id, previous_id])  # word with previous word
        skip_gram_pairs.append([centre_id, next_id])      # word with next word
        
def get_skipgram_batch(batch_size):
    """Draw a random mini-batch from the module-level ``skip_gram_pairs``.

    All pair positions are shuffled with ``np.random.shuffle`` and the first
    ``batch_size`` of them are used, so a batch never repeats a position.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a list holding element 0 (the
        origin word) of each chosen pair and ``y`` is a list of one-element
        lists holding element 1 (the previous/next, i.e. correlated, word).
    """
    shuffled_positions = list(range(len(skip_gram_pairs)))
    np.random.shuffle(shuffled_positions)

    origin_words = []
    context_words = []
    for position in shuffled_positions[:batch_size]:
        pair = skip_gram_pairs[position]
        origin_words.append(pair[0])
        # Each label is wrapped in its own list, giving one column per row.
        context_words.append([pair[1]])
    return origin_words, context_words

# Input data, labels
# One mini-batch of origin-word ids and, for each, a single context-word id.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Embeddings
with tf.name_scope("embeddings"):
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_dimension], -1.0, 1.0),
        name='embedding')
    # This is essentially a lookup table
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# NOTE: a stray `tf.summary.embedding('loss', loss)` call used to sit here.
# It read `loss` before the assignment below, so the script stopped with a
# NameError at graph-construction time. It has been removed; the loss is
# logged by the scalar summary at the end of this section.

# The Noise-Contrastive Estimation (NCE) Loss Function
# tf.nn.nce_loss() automatically draws negative ("noise") samples when we evaluate the loss
# Create variables for the NCE loss
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_dimension],
                        stddev=1.0 / math.sqrt(embedding_dimension)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Average the per-example NCE loss over the mini-batch.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, inputs=embed,
                   labels=train_labels, num_sampled=negative_samples,
                   num_classes=vocabulary_size))
tf.summary.scalar('loss', loss)
# Learning rate decay
# global_step counts optimizer updates; it is excluded from training itself.
global_step = tf.Variable(0, trainable=False)
# With staircase=True the rate is multiplied by decay_rate once per full
# decay_steps updates, rather than decaying continuously.
learningRate = tf.train.exponential_decay(learning_rate=0.1,
                                          global_step=global_step,
                                          decay_steps=1000,
                                          decay_rate=0.95,
                                          staircase=True)
# global_step must be handed to minimize() so that it is incremented on every
# update; previously it was omitted, the counter stayed at 0 and the learning
# rate never decayed.
train_step = tf.train.GradientDescentOptimizer(learningRate).minimize(
    loss, global_step=global_step)

# Merge all summary ops
merged = tf.summary.merge_all()
# Train the embeddings, writing summaries, checkpoints and projector
# metadata under LOG_DIR, then extract the L2-normalized embedding matrix.
with tf.Session() as sess:

    train_writer = tf.summary.FileWriter(LOG_DIR, graph=tf.get_default_graph())
    saver = tf.train.Saver()

    # Projector metadata: word name and id, one line per vocabulary entry.
    # Rows are written in ascending id order so they line up with the rows of
    # the embedding matrix even where dict iteration order is not guaranteed.
    # NOTE(review): assumes LOG_DIR already exists by this point (presumably
    # created by the FileWriter above) - confirm.
    with open(os.path.join(LOG_DIR, 'metadata.tsv'), "w") as metadata:
        metadata.write('Name\tClass\n')
        for k, v in sorted(index2word_map.items()):
            metadata.write('%s\t%d\n' % (v, k))

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embeddings.name
    # Link embedding to its metadata file
    embedding.metadata_path = os.path.join(LOG_DIR, 'metadata.tsv')
    projector.visualize_embeddings(train_writer, config)

    tf.global_variables_initializer().run()
    # (A batch used to be drawn here and thrown away before the loop drew its
    # own; that redundant call has been removed.)
    for step in range(1000):
        x_batch, y_batch = get_skipgram_batch(batch_size)
        summary, _ = sess.run([merged, train_step],
                              feed_dict={train_inputs: x_batch, train_labels: y_batch})
        train_writer.add_summary(summary, step)
        if step % 100 == 0:
            # Checkpoint and report the loss on the current batch.
            saver.save(sess, os.path.join(LOG_DIR, "w2v_model.ckpt"), global_step=step)
            loss_value = sess.run(loss, feed_dict={train_inputs: x_batch, train_labels: y_batch})
            print("Loss at %d: %.5f" % (step, loss_value))

    # Close the writer explicitly so it is not left open after training.
    train_writer.close()

    # Normalize embeddings before using: divide each row by its L2 norm.
    # `keepdims` replaces the deprecated `keep_dims` spelling.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)

# Rank the vocabulary by similarity to the word "one". The rows of the matrix
# were normalized above, so a plain dot product is the cosine similarity.
ref_word = normalized_embeddings_matrix[word2index_map["one"]]
cosine_dists = np.dot(normalized_embeddings_matrix, ref_word)

# Sort ids from most to least similar, then skip the first entry (the
# reference word itself) and keep at most the next nine.
descending_ids = np.argsort(cosine_dists)[::-1]
ff = descending_ids[1:10]
for word_id in ff:
    # Word on one line, its similarity score on the next.
    print(index2word_map[word_id], cosine_dists[word_id], sep="\n")
    

Loss at 0: 5.74344
Loss at 100: 3.01105
Loss at 200: 2.82462
Loss at 300: 2.65077
Loss at 400: 2.64569
Loss at 500: 2.55482
Loss at 600: 2.40361
Loss at 700: 2.53572
Loss at 800: 2.49653
Loss at 900: 2.53345
Instructions for updating:
keep_dims is deprecated, use keepdims instead
three
0.93005675
nine
0.91762745
seven
0.90931445
five
0.89189136
four
0.10658033
six
0.10599244
eight
0.07293857
two
0.06234684
