In [1]:
#Word2Vec SkipGram  implementation

In [2]:
import collections
import math
import os                          #For environment lookups and directory creation
import zipfile                     #To load data

import numpy as np                 #For Mathematical Operations
import tensorflow as tf            #For Machine Learning

In [3]:
#Function to read the text8 dataset from a zip archive. It reads the first file in the
#archive, splits its text on whitespace and returns the list of all words in order.
#No cleaning is done here: text8 is distributed already stripped of punctuation and symbols.

def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    filename: path to the zip archive; only its first member is read.
    The member's bytes are converted to str and split on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

In [4]:
#Load data into a variable. The archive location can be overridden with the TEXT8_ZIP
#environment variable; when it is unset the original hard-coded path is used.
text8_path = os.environ.get('TEXT8_ZIP', '/home/aeros/Documents/Datasets/text8.zip')
words = read_data(text8_path)

In [5]:
#Total number of tokens in the dataset (length of the list returned by read_data)
n_words = len(words)

In [6]:
#Size of vocabulary we'll be using: build_dataset keeps the (vocabulary_size - 1) most
#frequent words plus one 'UNK' entry, and the embedding and NCE weight matrices defined
#later have this many rows.
vocabulary_size = 50000

In [7]:
def build_dataset(words, vocab_size=None):
    """Encode a list of tokens as integer ids over a fixed-size vocabulary.

    Parameters
    ----------
    words : list of str
        Corpus tokens, in order.
    vocab_size : int, optional
        Total vocabulary size including the 'UNK' entry. When omitted, the
        notebook-level ``vocabulary_size`` is used.

    Returns
    -------
    data : list of int
        ``words`` with every token replaced by its id; tokens outside the
        vocabulary become 0.
    count : list
        ``['UNK', number_of_unknown_tokens]`` followed by ``(word, frequency)``
        tuples for the kept words, most frequent first.
    dictionary : dict
        Maps word -> id. 'UNK' is always id 0.
    reversed_dictionary : dict
        Maps id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    frequencies = collections.Counter(words)
    #'UNK' is reserved for id 0. If the corpus itself contains that token, it must not
    #take a second slot in the table, otherwise two words would end up with the same id
    #and id 0 would be missing from the reverse mapping.
    frequencies.pop('UNK', None)

    #Counting each token in the data; the -1 is a placeholder filled in below
    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocab_size - 1))

    #Creating a word to int dictionary (ids follow frequency rank)
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    #Creating a list of data words represented by their integer value in dictionary
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)   #words outside the vocabulary map to 'UNK'
        if index == 0:
            unk_count += 1                #Increment counter for unknown words
        data.append(index)
    count[0][1] = unk_count               #replace count of unknown words

    #Creating a int to word dictionary
    reversed_dictionary = {index: word for word, index in dictionary.items()}

    return data, count, dictionary, reversed_dictionary
    

In [8]:
#Position in `data` where the next call to generate_batch starts reading. It is kept at
#module level so that consecutive calls walk through the data instead of restarting.
data_index = 0


def generate_batch(data, batch_size, num_skips, skip_window):
    """Build one batch of (center word, context word) id pairs for skip-gram training.

    Parameters
    ----------
    data : sequence of int
        Word ids in corpus order.
    batch_size : int
        Number of pairs to produce; must be a multiple of ``num_skips``.
    num_skips : int
        Number of pairs drawn for each center word; at most ``2*skip_window``.
    skip_window : int
        Number of words considered on each side of the center word.

    Returns
    -------
    batch : np.ndarray of int32, shape [batch_size]
        Center word ids, each repeated ``num_skips`` times.
    labels : np.ndarray of int32, shape [batch_size, 1]
        A distinct, randomly chosen context word id for each repetition.

    Reads and updates the module-level ``data_index`` (wrapping around at the end of
    ``data``) so that the next call continues with the center word that follows the
    last one used here.

    Raises AssertionError when the batch_size/num_skips/skip_window constraints above
    are violated.
    """
    global data_index
    #Keeping parameters consistent
    assert batch_size%num_skips == 0
    assert num_skips <= 2*skip_window

    #Both arrays are allocated uninitialized and completely filled by the loop below
    batch = np.ndarray(shape = [batch_size], dtype = np.int32)
    labels = np.ndarray(shape = [batch_size,1], dtype = np.int32)

    #Window width: skip_window words on the left, the center word, skip_window on the right
    span = 2*skip_window + 1

    #Sliding window over the data; appending to a full deque drops its oldest element
    buffer = collections.deque(maxlen = span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    #Batch and label assignment: one window position per center word
    for i in range(batch_size//num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]   #never pair the center word with itself

        for j in range(num_skips):
            #Redraw until a window position is found that has not been used yet
            while target in targets_to_avoid:
                target = np.random.randint(0, span)
            targets_to_avoid.append(target)
            batch[i*num_skips + j] = buffer[skip_window]
            labels[i*num_skips + j,0] = buffer[target]

        #Slide the window one word to the right
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    #The window has been read `span` words past the position the next call must start
    #from. Step back so that refilling the buffer does not skip `span` center words
    #between consecutive batches.
    data_index = (data_index - span) % len(data)

    return batch, labels
            

In [9]:
#Get dataset of integers for words: `data` is the list of word ids, `count` the frequency
#table, `dictionary` maps word -> id and `reversed_dictionary` maps id -> word.
data, count, dictionary, reversed_dictionary = build_dataset(words)

In [10]:
#Setting up parameters for training
embed_size = 300       #dimensionality of each word embedding (number of columns of embed_matrix)
num_sampled = 64       #number of negative classes sampled by nce_loss
learning_rate = 1.0    #step size passed to GradientDescentOptimizer
batch_size = 128       #number of (center, context) pairs fed per training step
skip_window = 2        #number of words considered to the left and to the right of the center word
num_skips = 4          #pairs generated per center word; generate_batch requires num_skips <= 2*skip_window and batch_size % num_skips == 0

In [11]:
#Setting up variables for similarity comparison
valid_size = 16      #number of words whose nearest neighbours are printed during training
valid_window = 100   #ids are drawn from 0..valid_window-1, i.e. from the most frequent words
#Draw without replacement: np.random.choice samples with replacement by default, which
#could pick the same word more than once and waste validation slots on duplicates.
valid_examples = np.random.choice(valid_window, valid_size, replace = False)
valid_dataset = tf.constant(valid_examples, dtype = tf.int32)  #the same ids as a graph constant

In [12]:
#Number of training steps. The training loop processes one batch per iteration, so
#despite the name this counts batches, not full passes over the dataset.
n_epochs = 100001

In [13]:
#Build the skip-gram graph: inputs, embedding matrix, NCE loss, optimizer and the
#similarity op used for inspection. All ops are pinned to the CPU.
with tf.device('/cpu:0'):
    #Name scope for the input placeholders
    with tf.name_scope('inputs'):
        #center word ids and the context word id to predict for each of them
        center_words = tf.placeholder(dtype = tf.int32, shape = [batch_size],name = 'center_words')
        target_words = tf.placeholder(dtype = tf.int32, shape = [batch_size,1], name = 'target_words')
    #Name scope for the embedding weights
    with tf.name_scope('embed'):
        #Embedding matrix for the model: one row per vocabulary word, initialized uniformly in [-1, 1)
        embed_matrix = tf.Variable(tf.random_uniform([vocabulary_size, embed_size], -1, 1), name = 'embed_matrix')
    #Name scope for the loss
    with tf.name_scope('loss'):
        #Rows of embed_matrix selected by the center word ids of the batch
        embed = tf.nn.embedding_lookup(embed_matrix, center_words, name = 'embed')
        #weights for the final layer before nce_loss
        nce_weight = tf.Variable(tf.truncated_normal(shape = [vocabulary_size, embed_size], 
                                                     stddev = 1.0/math.sqrt(embed_size)), name = 'nce_weight')
        #biases for the final layer before nce_loss
        nce_bias = tf.Variable(tf.zeros(shape = [vocabulary_size]), name = 'nce_bias')
        #loss function for the model: NCE loss averaged over the batch
        loss = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weight, biases = nce_bias, labels = target_words,
                                             inputs = embed, num_sampled = num_sampled, num_classes = vocabulary_size)
                             ,name = 'loss')
    #Optimizer for the model: plain gradient descent on the NCE loss
    optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(loss)


    #Normalizing embeddings: divide every row by its L2 norm
    #NOTE(review): `keep_dims` looks like the older spelling of `keepdims` - confirm it is
    #still accepted by the installed TensorFlow version.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embed_matrix), 1, keep_dims = True))
    normalized_embeddings = embed_matrix/norm
    #Normalized embeddings of the validation words only
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    #Dot products of unit-length rows, i.e. cosine similarity between each validation
    #word and every vocabulary word
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b = True)


In [14]:
#Op that initializes all graph variables; it is run once at the start of the training session
init = tf.global_variables_initializer()

In [15]:
#Train the model, periodically reporting the average loss and the nearest neighbours of
#the validation words, then save the normalized embeddings and a checkpoint.

#Create the checkpoint directory up front so that a missing './models' folder cannot
#make the save fail after the whole training run has already completed.
checkpoint_path = './models/word2vec.ckpt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok = True)

loss_report_every = 2000     #steps between average-loss reports
sim_report_every = 10000     #steps between nearest-neighbour reports
top_k = 8                    #number of nearest neighbours printed per validation word

with tf.Session() as sess:
    #Writes the graph definition to ./graphs for TensorBoard
    writer = tf.summary.FileWriter('./graphs', sess.graph)
    try:
        #initialize variables
        sess.run(init)
        print('initialized')
        saver = tf.train.Saver()

        average_loss = 0
        for i in range(n_epochs):
            #Generate batch of data for input
            batch_inputs, batch_labels = generate_batch(data, batch_size = batch_size, num_skips = num_skips,
                                                        skip_window = skip_window)
            feed_dict = {center_words: batch_inputs, target_words: batch_labels}
            #Run one optimization step; only the loss value is needed afterwards
            _, loss_value = sess.run([optimizer, loss], feed_dict)
            average_loss += loss_value

            #Print loss at steps of 2000
            if i % loss_report_every == 0:
                if i > 0:
                    average_loss /= loss_report_every
                # the average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", i, ": ", average_loss)
                average_loss = 0

            #Print similarities at steps of 10000
            if i % sim_report_every == 0:
                sim = similarity.eval()
                for j in range(valid_size):
                    valid_word = reversed_dictionary[valid_examples[j]]
                    #Sort by descending similarity and skip position 0, which is the
                    #validation word itself
                    nearest = (-sim[j, :]).argsort()[1:top_k + 1]
                    log_str = "nearest to %s:" % valid_word
                    for k in range(top_k):
                        close_word = reversed_dictionary[nearest[k]]
                        log_str = "%s %s," % (log_str, close_word)
                    print(log_str)

        final_embeddings = normalized_embeddings.eval()
        saver.save(sess, checkpoint_path)
    finally:
        #Close the event file even if training is interrupted or raises
        writer.close()

initialized
Average loss at step  0 :  255.583633423
nearest to their: eliminating, wanton, roy, borough, peninsula, switch, abyss, weird,
nearest to often: reasonable, auk, emanate, favours, burner, shiite, consequentialist, recantation,
nearest to to: nicolau, multiple, alkaloid, humans, automation, baseball, mystique, installment,
nearest to th: gospels, calvi, silvanus, exceeds, arendt, oklahoma, journals, xj,
nearest to i: indicated, wins, gluons, chihuahua, ros, venerated, mundi, resold,
nearest to been: corvettes, fall, kokomo, synchronic, farkas, praetorians, impotence, lecoq,
nearest to from: dhole, hemoglobin, nlds, ashikaga, employee, rttemberg, conveyed, jacopo,
nearest to was: dizzy, automotive, wisely, nine, syntax, polyatomic, spiders, nyu,
nearest to during: albright, mention, inlaid, ramp, rosalynn, rockin, dantzig, centric,
nearest to use: cistercian, consultant, hobsbawm, carrel, gearbox, extraterrestrials, piacenza, embargo,
nearest to the: elway, insecticide, carol

Average loss at step  52000 :  5.88783019555
Average loss at step  54000 :  6.01559590054
Average loss at step  56000 :  5.76475006127
Average loss at step  58000 :  5.58141971827
Average loss at step  60000 :  5.74091485333
nearest to their: its, his, the, a, mathbf, callisto, and, this,
nearest to often: asbestos, mathbf, to, also, be, that, circ, aristophanes,
nearest to to: and, for, cc, mathbf, with, that, gland, or,
nearest to th: nine, eight, zero, five, seven, three, six, mathbf,
nearest to i: four, mathbf, s, and, UNK, vma, three, two,
nearest to been: be, was, were, are, is, he, not, mctaggart,
nearest to from: and, in, with, by, for, on, of, mathbf,
nearest to was: is, were, be, has, had, are, mathbf, UNK,
nearest to during: in, mathbf, of, and, after, callisto, five, circ,
nearest to use: akita, gland, agouti, mathbf, altenberg, or, and, aristophanes,
nearest to the: a, this, an, his, their, UNK, its, gland,
nearest to six: four, five, three, eight, seven, zero, two, nine,


![](word2vec_graph.png)

TensorBoard now shows a much more linear graph that is easier to interpret. This is because we used tf.name_scope() while defining our variables. tf.name_scope() groups its contents under a single user-defined name, as is visible from the inputs, embed, loss, etc. nodes.  

We can also see our learned embeddings plotted by clicking on the Embeddings tab, where we can choose between different projections such as t-SNE and PCA.