This is a slight modification of the word2vec TensorFlow tutorial. It is heavily commented so that the significance of every line is properly 
comprehended by the user.

In [46]:
import collections
import math
import random
import urllib

import nltk
import numpy as np
import tensorflow as tf
from six.moves import xrange

Defining a function to open the text file and create a list of words. The total number of unique words is also printed.

In [3]:
def read_data(filename = 'text8'):
    """Read a whitespace-delimited corpus from the ``data/`` directory.

    Args:
        filename: Name of the file inside ``data/``; it is passed through
            ``str`` before being appended to the directory prefix.

    Returns:
        list: The whitespace-separated tokens of the file, in file order.
    """
    # The context manager closes the handle even if reading raises; the
    # previous version left the file open after returning.
    with open('data/' + str(filename), 'r') as text_file:
        return text_file.read().split()

# Load the whole corpus as one flat list of tokens.
words = read_data()

# Report how many distinct tokens the corpus contains.
num_unique_words = len(set(words))
print(num_unique_words)

# Tally every token, then display the ten most frequent (word, occurrences) pairs.
count = collections.Counter(words)
count.most_common(10)

253854


[('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430),
 ('two', 192644)]

Defining a function to create data, count, word_dictionary and reversed_dictionary. Rare words (words 
outside our defined vocabulary) are replaced with the 'UNK' token. 'word_dictionary' is a dictionary which maps each word to its integer representation. 'count' is a list in which each element holds a word and its number of occurrences. (The first element of count is 'UNK', together with the count of all words not present in the vocabulary that we define.) 'data' is a list obtained by substituting each 
word in 'words' (the input generated by read_data) with its integer representation. So basically, we convert words to their integer representation. 'reversed_dictionary' is the reverse mapping of 'word_dictionary' (from integer labels to words).

In [5]:
vocabulary_size = 50000


def create_train_data(words, vocab_size):
    """Encode a token list as integer ids over a fixed-size vocabulary.

    The ``vocab_size - 1`` most frequent tokens receive ids ``1, 2, ...`` in
    descending order of frequency. Every other token is mapped to id ``0``,
    which stands for the ``'UNK'`` placeholder.

    Args:
        words: Sequence of tokens (iterated more than once).
        vocab_size: Total vocabulary size, including the ``'UNK'`` slot.

    Returns:
        tuple: ``(data, count, word_dictionary, reversed_dictionary)`` where
            ``data`` is the list of ids, one per entry of ``words``;
            ``count`` is ``[['UNK', n_unknown], (word, occurrences), ...]``;
            ``word_dictionary`` maps word -> id; and
            ``reversed_dictionary`` maps id -> word.
    """
    frequencies = collections.Counter(words)
    # A literal 'UNK' token in the corpus must not claim a slot of its own:
    # re-assigning the existing 'UNK' key would leave the dictionary size
    # unchanged, so the next word would receive the same id and id 0 would
    # vanish from the reverse mapping. Such tokens are folded into id 0.
    frequencies.pop('UNK', None)

    # The first entry is the 'UNK' placeholder; its real count is filled in
    # below. It stays a list (not a tuple) so it can be updated in place.
    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocab_size - 1))  # 1 less: slot 0 is 'UNK'

    # Ids follow the order of `count`, i.e. 0 for 'UNK', then by frequency.
    word_dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Out-of-vocabulary words fall back to id 0 ('UNK').
    data = [word_dictionary.get(word, 0) for word in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {index: word for word, index in word_dictionary.items()}
    return data, count, word_dictionary, reversed_dictionary

# Build the objects so that they can be printed and inspected
data, count, word_dictionary, reversed_dictionary = create_train_data(words,vocabulary_size)

Now we have our data prepared. Checking the first 10 elements of data and count.

In [6]:
# Peek at the first ten encoded tokens and the first ten entries of `count`.
data[:10], count[:10]

([5244, 3081, 12, 6, 195, 2, 3137, 46, 59, 156],
 [['UNK', 418391],
  ('the', 1061396),
  ('of', 593677),
  ('and', 416629),
  ('one', 411764),
  ('in', 372201),
  ('a', 325873),
  ('to', 316376),
  ('zero', 264975),
  ('nine', 250430)])

In [28]:
data_index = 0
# Module-level cursor into `data`. generate_batch declares it global and
# advances it so that each call continues where the previous batch stopped.


def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level `data` list.

    A window of ``2 * skip_window + 1`` ids slides over `data`. For each
    window position the centre id is written ``num_skips`` times into
    ``batch``, each time paired with a different, randomly chosen id from
    the rest of the window in ``labels``.

    Args:
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Number of context ids sampled per centre id; at most
            ``2 * skip_window``.
        skip_window: Number of ids considered on each side of the centre.

    Returns:
        tuple: ``(batch, labels)``, int32 arrays of shape ``(batch_size,)``
        and ``(batch_size, 1)``.

    Raises:
        AssertionError: If the constraints on ``batch_size`` / ``num_skips``
            described above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    window = collections.deque(maxlen=span)
    if data_index + span > len(data):
        # Not enough ids left for a full window: start over from the beginning.
        data_index = 0
    window.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        # One iteration per centre word, e.g. batch_size=8 and num_skips=2
        # give 8 // 2 = 4 centre words.
        context_positions = [w for w in range(span) if w != skip_window]
        random.shuffle(context_positions)
        for j in range(num_skips):
            row = i * num_skips + j  # flat index of this pair in the batch
            batch[row] = window[skip_window]  # centre (input) word
            # Take an unused window position; the id found there is the
            # label, i.e. the context word predicted from the centre word.
            labels[row, 0] = window[context_positions.pop()]
        if data_index == len(data):
            # Wrap around. A deque does not support slice assignment
            # (`window[:] = ...` raises TypeError); extending a deque with
            # maxlen=span by `span` items replaces its whole content instead.
            window.extend(data[:span])
            data_index = span
        else:
            # Slide the window one position to the right.
            window.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

Visualizing a batch of 8

In [29]:
# Draw one small batch and print every (centre -> context) pair, showing
# both the integer ids and the words they stand for.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for row in range(8):
    centre_id = batch[row]
    context_id = labels[row, 0]
    print(centre_id, reversed_dictionary[centre_id],
          '->', context_id, reversed_dictionary[context_id])

3081 originated -> 12 as
3081 originated -> 5244 anarchism
12 as -> 6 a
12 as -> 3081 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a


Building the skip-gram model and the TensorFlow graph

In [32]:
# Hyper-parameters shared by the graph definition and the training loop.
batch_size = 128      # Number of (centre, context) pairs fed per training step.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label. With skip_window = 1 there are exactly 2 context words, so 2 pairs are generated per centre word.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Number of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False) # draws valid_size distinct ids from the range [0, valid_window)
num_sampled = 64    # Number of negative examples to sample.

In [40]:
# Must match the vocab_size that was used to build word_dictionary, because it
# fixes the number of rows of the embedding and NCE weight matrices below.
vocabulary_size = 50000

# Everything inside this block is added to `graph`; nothing is executed until
# a session runs it.
graph = tf.Graph()
with graph.as_default():
    # Placeholders for one batch of centre-word ids and context-word ids, and
    # a constant holding the ids of the validation words.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
     
    # One embedding row per vocabulary entry, initialised uniformly in [-1, 1).
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size],-1.0,1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)  
    # `embed` holds the embedding rows selected by the ids in train_inputs
    # (the integer representations of the words in `data`).
    # Conceptually this is the output of a first hidden layer: the same result
    # as multiplying a one-hot input by the embedding matrix.
    # Per the original author's note, the lookup is much faster than an
    # explicit dense layer because during backprop gradients are sent only to
    # the embedding rows that were used in the batch.
     
    # Weights and biases of the output layer used by the NCE loss: one weight
    # row and one bias per vocabulary entry.
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev = 1.0/math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels
    # each time we evaluate the loss.
    # Its `inputs` argument is the input to the loss, not the input of the
    # network: here it is the output of the previous (embedding lookup) layer.
    loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))
    
    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Compute the cosine similarity between the validation words and all
    # embeddings: rows are scaled to unit L2 norm, so the matrix product of
    # normalised rows yields cosine similarities.
    # NOTE(review): `keep_dims` was renamed `keepdims` in later TensorFlow 1.x
    # releases -- confirm against the installed version.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
          normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
          valid_embeddings, normalized_embeddings, transpose_b=True)
    
    # Add variable initializer.
    init = tf.global_variables_initializer()

A high level explanation for the optimization of nce loss obtained from stackoverflow (given by curator23) : 
"The embeddings Tensor is your final output matrix. It maps words to vectors. Use this in your word prediction graph.
The input matrix is a batch of centre-word : context-word pairs (train_input and train_label respectively) generated from the training text.
While the exact workings of the nce_loss op are not yet know to me, the basic idea is that it uses a single layer network (parameters nce_weights and nce_biases) to map an input vector (selected from embeddings using the embed op) to an output word, and then compares the output to the training label (an adjacent word in the training text) and also to a random sub-sample (num_sampled) of all other words in the vocab, and then modifies the input vector (stored in embeddings) and the network parameters to minimise the error."

Training the skip-gram model

In [48]:
# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # Every variable has to be initialised before an op may read it.
    init.run()
    print('Initialized')

    top_k = 8  # number of nearest neighbors
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Evaluating the optimizer op performs one update step; the loss is
        # fetched in the same run so it can be accumulated.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            # Estimate of the loss over the last 2000 batches (at step 0
            # only a single batch has been accumulated).
            if step > 0:
                average_loss /= 2000
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # The neighbour report is expensive (~20% slowdown if computed every
        # 500 steps), so it is produced only every 10000 steps.
        if step % 10000 != 0:
            continue
        sim = similarity.eval()
        for i in xrange(valid_size):
            valid_word = reversed_dictionary[valid_examples[i]]
            # Sort by descending similarity; position 0 is skipped because
            # it is the word itself.
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            neighbours = ''.join(
                ' %s,' % reversed_dictionary[nearest[k]] for k in xrange(top_k))
            print(('Nearest to %s:' % valid_word) + neighbours)
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  282.584777832
Nearest to nine: subsist, anisotropic, fellows, findable, freeciv, rumors, seventeenth, asn,
Nearest to UNK: acrimonious, oxen, qing, mkultra, literal, mutinied, meddle, mercenary,
Nearest to four: lifespan, advantages, dhimmi, eudoxia, tert, suzy, straightforwardly, ramesses,
Nearest to one: iucn, gamble, rf, elves, barton, anz, vitry, gilt,
Nearest to he: registration, local, paraphrased, rabinowitz, supers, gourmet, anatoly, endocrinology,
Nearest to by: prom, ehud, southern, assaulted, sectarianism, indochina, plunder, pledges,
Nearest to his: boo, radiosity, opus, jungles, unit, franconia, oceanography, color,
Nearest to no: uncontrolled, specially, sn, mouthpieces, plugs, foix, miles, lineage,
Nearest to its: crashed, sophistication, appearances, unicellular, analysing, gurion, humorous, chloride,
Nearest to often: crevasses, prosthetic, pompidou, outstripped, reconquest, tapes, lifespan, divided,


Nearest to th: weld, brunel, martins, point, dioxide, cezanne, observing, inwardly,
Nearest to their: anacreon, af, appendices, clip, marshalling, scenery, sharper, minarchists,
Nearest to been: adopts, immune, stark, christensen, peyton, datum, heterocyclic, soul,
Nearest to it: falls, drucker, runs, furor, sciences, louisiana, drivers, grotesque,
Nearest to there: nowell, guernica, withered, adolphus, triggering, sara, warns, lutheranism,
Nearest to when: several, disparagingly, sandro, serial, communism, calendars, warned, lacquer,


Average loss at step  2000 :  113.735357479


Average loss at step  4000 :  52.778852736


Average loss at step  6000 :  33.5482197268


Average loss at step  8000 :  23.7463875093


Average loss at step  10000 :  17.3574590499
Nearest to nine: eight, zero, vs, var, seven, one, phi, six,
Nearest to UNK: mosque, yeast, one, vs, bang, and, the, gb,
Nearest to four: eight, zero, var, nine, one, three, rudolph, two,
Nearest to one: two, var, nine, gb, zero, UNK, seven, mosque,
Nearest to he: it, they, anatoly, pseudopods, vocals, local, vs, measure,
Nearest to by: and, in, of, is, wire, confident, var, as,
Nearest to his: the, circumcision, its, del, damage, theological, settings, edmonton,
Nearest to no: to, specially, deism, revolutionaries, servant, socialism, proposed, miles,
Nearest to its: the, appearances, his, circumcision, training, vienna, charge, typical,
Nearest to often: divided, lifespan, authors, stylized, manifestations, tapes, their, biography,
Nearest to th: point, observing, six, zero, three, var, rotate, born,
Nearest to their: the, clip, style, concentrated, remain, often, survey, vs,
Nearest to been: immune, adopts, soul, paradigms, vs, rays, hono

Average loss at step  12000 :  14.0689575607


Average loss at step  14000 :  11.9180101365


Average loss at step  16000 :  10.004420069


Average loss at step  18000 :  8.4035275017


Average loss at step  20000 :  8.02782380855
Nearest to nine: eight, six, seven, zero, five, four, vs, operatorname,
Nearest to UNK: dasyprocta, circ, operatorname, agouti, yeast, mosque, two, venus,
Nearest to four: eight, zero, nine, three, two, five, six, seven,
Nearest to one: two, dasyprocta, four, agouti, three, eight, seven, operatorname,
Nearest to he: it, they, and, she, argues, who, dasyprocta, vs,
Nearest to by: was, in, and, is, with, as, for, from,
Nearest to his: the, its, her, their, s, del, circumcision, opus,
Nearest to no: specially, unequal, to, servant, revolutionaries, socialism, and, soprano,
Nearest to its: his, the, their, crashed, appearances, training, circumcision, typical,
Nearest to often: divided, tapes, manifestations, stylized, lifespan, authors, some, audible,
Nearest to th: three, six, zero, four, eight, one, observing, point,
Nearest to their: the, his, its, anacreon, operatorname, remain, her, clip,
Nearest to been: adopts, immune, by, was, who, have

Average loss at step  22000 :  7.03642044675


Average loss at step  24000 :  6.85834533679


Average loss at step  26000 :  6.72017207646


Average loss at step  28000 :  6.34312397707


Average loss at step  30000 :  5.97918035579
Nearest to nine: eight, seven, six, five, four, zero, three, operatorname,
Nearest to UNK: dasyprocta, circ, operatorname, mosque, agouti, three, aediles, arin,
Nearest to four: five, eight, seven, six, three, nine, two, zero,
Nearest to one: two, four, three, seven, eight, dasyprocta, agouti, operatorname,
Nearest to he: it, they, she, who, there, argues, zero, vs,
Nearest to by: in, with, was, from, and, as, is, be,
Nearest to his: her, their, the, its, s, del, circumcision, opus,
Nearest to no: sponsors, specially, unequal, to, revolutionaries, servant, a, nine,
Nearest to its: the, his, their, crashed, a, analysing, appearances, training,
Nearest to often: divided, tapes, also, audible, lifespan, manifestations, authors, stylized,
Nearest to th: six, three, eight, one, two, four, seven, point,
Nearest to their: the, his, its, her, remain, operatorname, lithuanian, a,
Nearest to been: by, adopts, sponsors, was, be, immune, who, have,
Near

Average loss at step  32000 :  5.9756525476


Average loss at step  34000 :  5.69859705305


Average loss at step  36000 :  5.75849120533


Average loss at step  38000 :  5.55741627693


Average loss at step  40000 :  5.24219259167
Nearest to nine: eight, seven, zero, six, five, four, three, operatorname,
Nearest to UNK: dasyprocta, operatorname, circ, agouti, recitative, arin, four, aediles,
Nearest to four: five, three, six, eight, seven, zero, two, one,
Nearest to one: two, four, three, eight, six, five, seven, dasyprocta,
Nearest to he: it, they, she, who, there, zero, we, vs,
Nearest to by: was, with, in, be, is, as, been, from,
Nearest to his: their, her, the, its, s, circumcision, del, conspired,
Nearest to no: sponsors, unequal, a, specially, it, revolutionaries, still, nine,
Nearest to its: their, the, his, a, crashed, analysing, training, appearances,
Nearest to often: divided, also, tapes, authors, chaotic, lifespan, audible, ras,
Nearest to th: six, eight, seven, three, accumulator, zero, four, one,
Nearest to their: the, its, his, her, recitative, beadwork, operatorname, agouti,
Nearest to been: be, was, by, sponsors, adopts, reliance, were, who,
Nearest t

Average loss at step  42000 :  5.34693013203


Average loss at step  44000 :  5.24501333344


Average loss at step  46000 :  5.23082812393


Average loss at step  48000 :  5.22957700038


Average loss at step  50000 :  4.98618395913
Nearest to nine: eight, six, seven, zero, five, three, four, operatorname,
Nearest to UNK: dasyprocta, kapoor, agouti, circ, four, recitative, marek, operatorname,
Nearest to four: three, six, five, eight, two, seven, one, kapoor,
Nearest to one: two, four, three, six, eight, seven, five, kapoor,
Nearest to he: it, they, she, who, there, we, truetype, this,
Nearest to by: was, be, with, from, in, as, is, and,
Nearest to his: their, her, the, its, s, recitative, agouti, circumcision,
Nearest to no: sponsors, unequal, it, a, still, or, only, and,
Nearest to its: their, his, the, analysing, crashed, a, her, homomorphism,
Nearest to often: also, divided, now, sometimes, tapes, chaotic, ras, audible,
Nearest to th: six, three, eight, seven, four, accumulator, one, five,
Nearest to their: his, its, the, her, kapoor, some, marek, recitative,
Nearest to been: be, was, were, sponsors, by, adopts, reliance, are,
Nearest to it: he, this, there, they, w

Average loss at step  52000 :  5.05709727466


Average loss at step  54000 :  5.17568934417


Average loss at step  56000 :  5.07141618419


Average loss at step  58000 :  5.04957467067


Average loss at step  60000 :  4.94871200526
Nearest to nine: eight, seven, six, four, five, zero, ursus, operatorname,
Nearest to UNK: dasyprocta, kapoor, ursus, circ, pulau, agouti, operatorname, thibetanus,
Nearest to four: five, six, three, eight, seven, two, kapoor, nine,
Nearest to one: two, six, four, three, five, dasyprocta, eight, kapoor,
Nearest to he: it, they, she, who, there, we, but, this,
Nearest to by: was, be, with, as, from, four, five, been,
Nearest to his: their, her, its, the, s, my, pulau, recitative,
Nearest to no: unequal, sponsors, a, it, only, westphalia, franc, positions,
Nearest to its: their, his, the, her, analysing, recitative, a, homomorphism,
Nearest to often: pulau, also, divided, sometimes, now, it, ras, tapes,
Nearest to th: six, four, seven, eight, three, accumulator, five, dioxide,
Nearest to their: its, his, the, her, recitative, kapoor, pulau, some,
Nearest to been: be, was, were, by, reliance, sponsors, adopts, are,
Nearest to it: he, this, ther

Average loss at step  62000 :  5.0174506073


Average loss at step  64000 :  4.82829598403


Average loss at step  66000 :  4.59480603361


Average loss at step  68000 :  4.96854473853


Average loss at step  70000 :  4.86998276556
Nearest to nine: eight, seven, six, five, zero, four, ursus, operatorname,
Nearest to UNK: dasyprocta, kapoor, ursus, mico, pulau, circ, callithrix, operatorname,
Nearest to four: six, five, three, eight, seven, two, nine, kapoor,
Nearest to one: six, two, four, three, five, dasyprocta, seven, kapoor,
Nearest to he: it, she, they, who, there, we, never, remnant,
Nearest to by: was, be, with, as, seizures, from, kapoor, in,
Nearest to his: their, her, its, the, my, thaler, s, pulau,
Nearest to no: unequal, sponsors, it, only, a, westphalia, positions, franc,
Nearest to its: their, his, the, her, thaler, analysing, homomorphism, recitative,
Nearest to often: pulau, also, now, sometimes, divided, commonly, farewell, simply,
Nearest to th: six, three, seven, four, eight, accumulator, one, dioxide,
Nearest to their: its, his, the, her, some, recitative, kapoor, pulau,
Nearest to been: be, was, were, reliance, by, are, sponsors, adopts,
Nearest to

Average loss at step  72000 :  4.74126870692


Average loss at step  74000 :  4.7994087069


Average loss at step  76000 :  4.71473241282


Average loss at step  78000 :  4.80887523252


Average loss at step  80000 :  4.81248695421
Nearest to nine: eight, six, seven, five, four, zero, ursus, operatorname,
Nearest to UNK: dasyprocta, ursus, kapoor, agouti, callithrix, mico, circ, operatorname,
Nearest to four: five, six, three, seven, eight, two, nine, zero,
Nearest to one: six, seven, two, five, kapoor, dasyprocta, three, michelob,
Nearest to he: it, they, she, who, there, we, never, you,
Nearest to by: was, be, as, with, from, operatorname, in, kapoor,
Nearest to his: their, her, its, the, s, my, thaler, polyn,
Nearest to no: unequal, sponsors, it, only, still, westphalia, franc, positions,
Nearest to its: their, his, the, her, thaler, analysing, jethro, a,
Nearest to often: pulau, also, sometimes, now, commonly, divided, still, usually,
Nearest to th: six, seven, eight, four, accumulator, three, dioxide, five,
Nearest to their: its, his, her, the, thaler, pulau, kapoor, recitative,
Nearest to been: be, was, were, reliance, by, sponsors, had, are,
Nearest to it: he, t

Average loss at step  82000 :  4.77211881125


Average loss at step  84000 :  4.76178852105


Average loss at step  86000 :  4.78316018534


Average loss at step  88000 :  4.74671554017


Average loss at step  90000 :  4.73551342165
Nearest to nine: eight, seven, six, five, zero, four, ursus, three,
Nearest to UNK: dasyprocta, ursus, mico, kapoor, circ, pulau, busan, operatorname,
Nearest to four: three, five, seven, six, eight, two, one, kapoor,
Nearest to one: four, two, three, seven, six, five, eight, kapoor,
Nearest to he: it, she, they, who, there, we, never, but,
Nearest to by: was, be, as, with, in, from, busan, kapoor,
Nearest to his: their, her, its, the, my, s, thaler, transistor,
Nearest to no: unequal, sponsors, it, only, a, still, any, westphalia,
Nearest to its: their, his, the, her, thaler, analysing, recitative, jethro,
Nearest to often: sometimes, also, pulau, now, commonly, usually, still, generally,
Nearest to th: six, seven, eight, nine, accumulator, four, three, five,
Nearest to their: its, his, her, the, recitative, some, agouti, pulau,
Nearest to been: be, was, were, reliance, by, are, had, sponsors,
Nearest to it: he, this, there, she, they, whic

Average loss at step  92000 :  4.67600551403


Average loss at step  94000 :  4.71477692652


Average loss at step  96000 :  4.69089664531


Average loss at step  98000 :  4.60148849851


Average loss at step  100000 :  4.69934381998
Nearest to nine: eight, seven, six, five, zero, four, three, ursus,
Nearest to UNK: kapoor, dasyprocta, ursus, mico, callithrix, operatorname, circ, peacocks,
Nearest to four: five, three, seven, eight, six, two, zero, kapoor,
Nearest to one: four, seven, two, five, six, three, kapoor, eight,
Nearest to he: it, she, they, who, there, we, never, but,
Nearest to by: be, was, as, operatorname, seizures, kapoor, busan, seven,
Nearest to his: their, her, its, the, my, s, thaler, our,
Nearest to no: it, unequal, sponsors, only, any, nine, franc, positions,
Nearest to its: their, his, the, her, thaler, jethro, analysing, some,
Nearest to often: sometimes, also, pulau, commonly, now, usually, generally, still,
Nearest to th: six, seven, accumulator, nine, eight, dioxide, three, brunel,
Nearest to their: its, his, her, the, some, pulau, agouti, thaler,
Nearest to been: be, was, were, reliance, become, by, had, sponsors,
Nearest to it: he, there, thi

In [50]:
# Inspect the first ten rows of the trained, unit-normalised embedding matrix.
final_embeddings[:10]

array([[ 0.00751682, -0.05371699, -0.07132055, ..., -0.13933519,
        -0.02045279, -0.04116968],
       [ 0.08291626,  0.11531038, -0.08103251, ...,  0.12472363,
         0.05915395, -0.06485777],
       [ 0.12645645,  0.12861994, -0.15893146, ...,  0.02102033,
        -0.08008344, -0.17346135],
       ..., 
       [ 0.07011682,  0.00313191, -0.05450524, ..., -0.07874514,
        -0.05461983, -0.1422379 ],
       [ 0.03490714, -0.02096882,  0.01477128, ..., -0.02078016,
        -0.04375534, -0.05463719],
       [ 0.06420515, -0.06192603, -0.09500519, ..., -0.0512049 ,
        -0.07853603,  0.0226666 ]], dtype=float32)