In [1]:
import urllib.request
import collections
import math
import os
import random
import zipfile
import datetime as dt

import numpy as np
import tensorflow as tf

In [2]:
def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    
    return filename


filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    
    return data

vocabulary = read_data(filename)
print('Data size', len(vocabulary))

Data size 17005207


In [4]:
from sklearn.datasets import fetch_20newsgroups
# the .data attribute will access the raw data, where
# each element is a document
newsgroups_train = fetch_20newsgroups(subset = 'train')

# gensim’s Word2vec expects a sequence of sentences as its input,
# where each sentence a list of words. We'll be lazy for now
# and not perform any sort of text preprocessing
vocabulary = [word for doc in newsgroups_train.data for word in doc]
print('Data size', len(vocabulary))

Data size 22054494


In [5]:
documents = [doc.strip().split() for doc in newsgroups_train.data]
vocabulary = [word.lower() for doc in documents for word in doc]
#          if word not in ENGLISH_STOP_WORDS]
print('Data size', len(vocabulary))

Data size 3252437


In [9]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
# vocabulary_size = None

def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
#     count = [['UNK', 0]]
#     if n_words is None:
#         count.extend(collections.Counter(words).most_common(n_words))
#     else:
#         count.extend(collections.Counter(words).most_common(n_words - 1))
    count = collections.Counter(words).most_common(n_words)

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
            data.append(index)
        # else:
        #     index = 0  # dictionary['UNK']
        #     unk_count += 1
        # data.append(index)

    # count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Most common words (+UNK) [('subject', 9520), ('use', 8750), ("'s", 7893), ('write', 6884), ('know', 6105)]
Sample data [15, 0, 56, 39, 23079, 46, 16410, 1074, 697, 246] ['thing', 'subject', 'car', 'nntp_posting', 'host_rac3.wam.umd.edu', 'organization_university', 'maryland_college', 'park', 'lines', 'wonder']


In [10]:
# data_index = 0
# batch = np.zeros(batch_size, dtype = np.uint32)
# labels = np.zeros((batch_size, 1), dtype = np.uint32)
# span = 2 * skip_window + 1  # [skip_window target skip_window]
# print(span)
# # the buffer holds a maximum of `span` elements and will
# # be a moving window of words that samples are drawn from
# buffer = collections.deque(maxlen = span)
# for _ in range(span):
#     buffer.append(data[data_index])
#     data_index = (data_index + 1) % len(data)


In [11]:
data_index = 0


# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

0 subject -> 15 thing
0 subject -> 56 car
56 car -> 39 nntp_posting
56 car -> 0 subject
39 nntp_posting -> 23079 host_rac3.wam.umd.edu
39 nntp_posting -> 56 car
23079 host_rac3.wam.umd.edu -> 39 nntp_posting
23079 host_rac3.wam.umd.edu -> 46 organization_university


In [12]:
# Step 4: Build and train a skip-gram model.

vocabulary_size = len(dictionary)

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

In [13]:
# Step 5: Begin training.
from tqdm import trange

num_steps = 100001
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in trange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
#         if step % 10000 == 0:
#             sim = similarity.eval()
#             for i in range(valid_size):
#                 valid_word = reverse_dictionary[valid_examples[i]]
#                 top_k = 8  # number of nearest neighbors
#                 nearest = (-sim[i, :]).argsort()[1:top_k + 1]
#                 log_str = 'Nearest to %s:' % valid_word
#                 for k in range(top_k):
#                     close_word = reverse_dictionary[nearest[k]]
#                     log_str = '%s %s,' % (log_str, close_word)
#                 print(log_str)

    final_embeddings = normalized_embeddings.eval()

  0%|          | 53/100001 [00:00<03:08, 529.00it/s]

Initialized
Average loss at step  0 :  299.814086914


  2%|▏         | 2092/100001 [00:03<02:28, 660.22it/s]

Average loss at step  2000 :  132.999240431


  4%|▍         | 4130/100001 [00:06<02:22, 672.86it/s]

Average loss at step  4000 :  64.3089953823


  6%|▌         | 6081/100001 [00:09<02:33, 612.03it/s]

Average loss at step  6000 :  39.7369674871


  8%|▊         | 8089/100001 [00:12<02:19, 658.67it/s]

Average loss at step  8000 :  27.7269375744


 10%|█         | 10103/100001 [00:15<02:15, 664.08it/s]

Average loss at step  10000 :  20.5734169469


 12%|█▏        | 12096/100001 [00:18<02:30, 584.39it/s]

Average loss at step  12000 :  16.0279527538


 14%|█▍        | 14114/100001 [00:21<02:17, 623.50it/s]

Average loss at step  14000 :  12.7265606122


 16%|█▌        | 16089/100001 [00:25<02:07, 656.15it/s]

Average loss at step  16000 :  11.2583232989


 18%|█▊        | 18067/100001 [00:28<02:06, 648.14it/s]

Average loss at step  18000 :  9.17188687849


 20%|██        | 20108/100001 [00:31<02:07, 626.93it/s]

Average loss at step  20000 :  8.22088277483


 22%|██▏       | 22070/100001 [00:34<02:00, 648.20it/s]

Average loss at step  22000 :  6.93328272533


 24%|██▍       | 24103/100001 [00:37<01:58, 639.92it/s]

Average loss at step  24000 :  6.23523056483


 26%|██▌       | 26079/100001 [00:40<01:55, 642.47it/s]

Average loss at step  26000 :  5.63266658413


 28%|██▊       | 28127/100001 [00:43<01:51, 644.38it/s]

Average loss at step  28000 :  5.53037808871


 30%|███       | 30113/100001 [00:46<01:45, 659.47it/s]

Average loss at step  30000 :  5.41085318971


 32%|███▏      | 32104/100001 [00:50<01:54, 592.25it/s]

Average loss at step  32000 :  5.22951853299


 34%|███▍      | 34093/100001 [00:53<01:41, 649.86it/s]

Average loss at step  34000 :  5.10790141428


 36%|███▌      | 36102/100001 [00:56<01:36, 662.90it/s]

Average loss at step  36000 :  5.10305028784


 38%|███▊      | 38116/100001 [00:59<01:33, 664.02it/s]

Average loss at step  38000 :  4.96739842135


 40%|████      | 40068/100001 [01:02<01:29, 671.06it/s]

Average loss at step  40000 :  4.94769360447


 42%|████▏     | 42073/100001 [01:05<01:29, 646.63it/s]

Average loss at step  42000 :  4.90534448075


 44%|████▍     | 44097/100001 [01:08<01:23, 672.28it/s]

Average loss at step  44000 :  4.80577723062


 46%|████▌     | 46104/100001 [01:11<01:20, 669.12it/s]

Average loss at step  46000 :  4.75936888337


 48%|████▊     | 48135/100001 [01:14<01:16, 673.79it/s]

Average loss at step  48000 :  4.73230387473


 50%|█████     | 50111/100001 [01:17<01:15, 663.59it/s]

Average loss at step  50000 :  4.84547416949


 52%|█████▏    | 52074/100001 [01:20<01:12, 663.93it/s]

Average loss at step  52000 :  4.7594352026


 54%|█████▍    | 54107/100001 [01:23<01:08, 672.24it/s]

Average loss at step  54000 :  4.6888678987


 56%|█████▌    | 56092/100001 [01:26<01:04, 677.54it/s]

Average loss at step  56000 :  4.666522982


 58%|█████▊    | 58110/100001 [01:29<01:04, 650.92it/s]

Average loss at step  58000 :  4.64724033868


 60%|██████    | 60071/100001 [01:32<00:59, 668.17it/s]

Average loss at step  60000 :  4.62005870533


 62%|██████▏   | 62085/100001 [01:35<00:58, 645.77it/s]

Average loss at step  62000 :  4.60748245668


 64%|██████▍   | 64059/100001 [01:38<00:57, 622.32it/s]

Average loss at step  64000 :  4.56079784679


 66%|██████▌   | 66113/100001 [01:42<00:53, 633.07it/s]

Average loss at step  66000 :  4.57100452197


 68%|██████▊   | 68121/100001 [01:45<00:50, 625.12it/s]

Average loss at step  68000 :  4.51377039814


 70%|███████   | 70092/100001 [01:48<00:44, 667.22it/s]

Average loss at step  70000 :  4.49332784742


 72%|███████▏  | 72115/100001 [01:51<00:46, 598.50it/s]

Average loss at step  72000 :  4.48448897171


 74%|███████▍  | 74107/100001 [01:54<00:43, 597.80it/s]

Average loss at step  74000 :  4.48985939038


 76%|███████▌  | 76129/100001 [01:57<00:35, 675.03it/s]

Average loss at step  76000 :  4.47161449313


 78%|███████▊  | 78109/100001 [02:00<00:32, 677.88it/s]

Average loss at step  78000 :  4.41033694029


 80%|████████  | 80092/100001 [02:03<00:29, 673.25it/s]

Average loss at step  80000 :  4.44358569908


 82%|████████▏ | 82071/100001 [02:06<00:26, 677.20it/s]

Average loss at step  82000 :  4.44948875225


 84%|████████▍ | 84085/100001 [02:10<00:25, 612.43it/s]

Average loss at step  84000 :  4.39938140035


 86%|████████▌ | 86105/100001 [02:13<00:20, 681.76it/s]

Average loss at step  86000 :  4.37770083308


 88%|████████▊ | 88088/100001 [02:16<00:17, 672.40it/s]

Average loss at step  88000 :  4.36674010289


 90%|█████████ | 90105/100001 [02:19<00:15, 649.25it/s]

Average loss at step  90000 :  4.37630050865


 92%|█████████▏| 92093/100001 [02:22<00:11, 663.18it/s]

Average loss at step  92000 :  4.33248949313


 94%|█████████▍| 94085/100001 [02:25<00:09, 646.89it/s]

Average loss at step  94000 :  4.34902845705


 96%|█████████▌| 96122/100001 [02:28<00:05, 680.43it/s]

Average loss at step  96000 :  4.33037853301


 98%|█████████▊| 98110/100001 [02:31<00:02, 678.67it/s]

Average loss at step  98000 :  4.31298154807


100%|██████████| 100001/100001 [02:34<00:00, 648.01it/s]


Average loss at step  100000 :  4.27476632017


In [14]:
from scipy.spatial.distance import cdist


top_k = 10

# idx = word_index['computer']
idx = dictionary['computer']

# eval_word = word_index_rev[idx]
eval_word = reverse_dictionary[idx]
print(eval_word, '\n')

# remember the cdist returns a the cosine distance,
# so when doing the argsort, which is sorting by ascending
# order, the top k most similar word will be the first k one;
# and since the most similar word will always be itself, we
# exclude that from the returned result
vector = final_embeddings[idx].reshape(1, -1)
sim = cdist(final_embeddings, vector, metric = 'cosine').ravel()
nearest_indices = np.argsort(sim)[1:(top_k + 1)]
print(1 - sim[nearest_indices])

for nearest_idx in nearest_indices:
    sim_word = reverse_dictionary[nearest_idx]
    print(sim_word)

computer 

[ 0.45620858  0.44938023  0.44900897  0.4468715   0.44654727  0.44154607
  0.4351962   0.4349767   0.42985046  0.42923429]
inverse
reducing
ay
universitet
pc
maryland_college
huge_rock
formatter
contort
public_education


# Gensim

In [None]:
from collections import defaultdict

sentence_no = -1
total_words = 0
min_reduce = 1
vocab = defaultdict(int)
checked_string_types = 0

for idx, sentence in enumerate(sentences, 1):
    for word in sentence:
        vocab[word] += 1

raw_vocab = vocab
corpus_count = sentence_no
print('corpus count:', corpus_count)

In [None]:
from gensim.models.keyedvectors import KeyedVectors, Vocab

wv = KeyedVectors()
wv.vocab = {}
wv.index2word = []

min_count = word2vec.min_count
sample = word2vec.sample


def keep_vocab_item(word, count, min_count, trim_rule = None):
    """filter the min word count"""
    default_rule = count >= min_count
    if trim_rule is None:
        return default_rule

In [None]:
drop_total = drop_unique = 0

# keep track of the total number of retained
# words to perform subsampling later
retain_total, retain_words = 0, []

for word, count in raw_vocab.items():
    if keep_vocab_item(word, count, min_count):
        retain_words.append(word)
        retain_total += count
        wv.vocab[word] = Vocab(count = count, index = len(wv.index2word))
        wv.index2word.append(word)
    else:
        drop_unique += 1
        drop_total += count

In [None]:
# used for some logging information
original_unique_total = len(retain_words) + drop_unique
retain_unique_pct = len(retain_words) * 100 / original_unique_total
original_total = retain_total + drop_total
retain_pct = retain_total * 100 / original_total
retain_total

In [None]:
# Precalculate each vocabulary item's threshold for sub-sampling
threshold_count = sample * retain_total

downsample_total, downsample_unique = 0, 0
for word in retain_words:
    count = raw_vocab[word]
    prob = np.sqrt(count / threshold_count + 1) * threshold_count / count
    if prob < 1.0:
        downsample_unique += 1
        downsample_total += prob * count
    else:
        prob = 1
        downsample_total += count
    
    # ?? * 2 ** 32
    wv.vocab[word].sample_int = int(prob * 2 ** 32)
    print(word, count, prob)

In [None]:
# sort vocabulary's index by count
wv.index2word.sort(key = lambda word: wv.vocab[word].count, reverse = True)
for idx, word in enumerate(wv.index2word):
    wv.vocab[word].index = idx

In [None]:
from temp import build_vocab

sentences = [doc.strip().split() for doc in newsgroups_train.data[:30]]
vocab, index2word = build_vocab(sentences)
vocab

In [None]:
#for sentence in sentences:
sentence = sentences[0]
word_vocabs = [vocab[w] for w in sentence if w in vocab and
               vocab[w]['prob'] > np.random.rand()]

## Tensorflow

In [None]:
from subprocess import call
from zipfile import ZipFile


def read_data():
    """Read data into a list of tokens/words"""
    filename = 'text8.zip'
    base_url = 'http://mattmahoney.net/dc/'

    if not os.path.isfile(filename):
        call('wget ' + base_url + filename, shell = True)

    with ZipFile(filename) as f:
        file = f.namelist()[0]

        # ensure compatibility each python2 and python3's str type
        # https://stackoverflow.com/questions/37689802/what-is-tensorflow-compat-as-str
        data = tf.compat.as_str(f.read(file)).split()

    return data


words = read_data()
print('data:', words[:4])
print('data size {}'.format(len(words)))

In [None]:
with open(BIGRAM_PATH) as f:
    words = f.read().split()

print('data:', words[:4])
print('data size {}'.format(len(words)))

In [None]:
def build_dataset(words, vocab_size = None):
    
    # word_count = [['UNK', -1]]
    # word_count.extend(Counter(words).most_common(vocab_size))
    word_count = Counter(words).most_common(vocab_size)
    word_index = {word: idx for idx, (word, _) in enumerate(word_count)}

    # build up word index and replaced the words by its assigned indices
    data = []
    unknown_count = 0
    for word in words:
        if word in word_index:
            idx = word_index[word]
        # else:
        #     idx = 0
        #    unknown_count += 1

            data.append(idx)

    # 'UNK' flag for out of vocabulary word
    # unknown = 'UNK', unknown_count
    # word_count.append(unknown)
    # word_count[0][1] = unknown_count
    word_index_rev = {idx: word for word, idx in word_index.items()}
    return data, word_count, word_index, word_index_rev

In [None]:
# TODO : ??? do we need to return a 4 element tuple
data, word_count, word_index, word_index_rev = build_dataset(words, vocab_size = 20000)

print('Most common words', word_count[:5])
print('Sample data', data[:10])

In [None]:
def generate_sample(indexed_words, window):
    """
    Form training pairs according to the skip-gram model
    
    Parameters
    ----------
    indexed_words : list
        List of index that represents the words, e.g. [5243, 3083, 11],
        and 5243 might represent the word "Today"
        
    window : int
        Window size of the skip-gram model, where word is sampled before
        and after the center word according to this window size
    """
    for index, center in enumerate(indexed_words):
        # random integers from `low` (inclusive) to `high` (exclusive)
        context = np.random.randint(1, window + 1)

        # get a random target before the center word
        for target in indexed_words[max(0, index - context):index]:
            yield center, target

        # get a random target after the center word
        for target in indexed_words[(index + 1):(index + 1 + context)]:
            yield center, target

In [None]:
iterator = generate_sample(indexed_words = data, window = 3)

print('original data:', data[:6])
print('skip gram sample:')

# we start off by using the first word as the center word,
# and since there's no word before it, we will not have any
# sampled word before it; after that we keep sliding the center
# word and generate word pairs
print(next(iterator))
print(next(iterator))
print(next(iterator))
print(next(iterator))
print(next(iterator))
print(next(iterator))
print(next(iterator))
print(next(iterator))

In [None]:
def get_batch(iterator, batch_size):
    """
    Group a numerical stream of centered and targeted
    word into batches and yield them as numpy arrays
    """
    while True:
        center_batch = np.zeros(batch_size, dtype = np.int32)
        target_batch = np.zeros((batch_size, 1), dtype = np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)

        yield center_batch, target_batch

In [None]:
window = 3
batch_size = 5
iterator = generate_sample(indexed_words = data, window = window)
batches = get_batch(iterator, batch_size)

# e.g. generate a batch
center_batch, target_batch = next(batches)
print(center_batch)
print(target_batch)

**Step 1:** Define placeholders for input and output. Input is the center word and output is the target (context) word. Instead of using one-hot vectors, we input the index of those words directly.

```python
# explicitly naming our operations will make it easier to track them later
center_words = tf.placeholder(
    tf.int32, shape = [BATCH_SIZE], name = 'center_words')

# for target_words:
# we will use this with tensorflow's loss function later, and the function
# requires rank 2 input, that's why there's an extra dimension in the shape
target_words = tf.placeholder(
    tf.int32, shape = [BATCH_SIZE, 1], name = 'target_words')
```

**Step 2:** Define the weight/variable. In this case, the embedding matrix. Each row corresponds to the representation vector of one word. If one word is represented with a vector of size EMBED_SIZE, then the embedding matrix will have shape [VOCAB_SIZE, EMBED_SIZE]. We initialize the embedding matrix to value from a random distribution. In this case, let’s choose uniform distribution.

```python
# word vectors
embed_matrix = tf.Variable(
    tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0), name = 'embed_matrix')
```

**Step 3:** Inference (compute the forward path of the graph). Recall that the hidden layer serves as a lookup table and its purpose is to get the vector representations of words in our dictionary.

<img src="img/hidden_layer.png" width="70%" height="70%">

i.e. The output of the hidden layer is just the "word vector" for the input word. Our embed_matrix has dimension [VOCAB_SIZE x EMBED_SIZE], with each row of the embedding matrix corresponds to the vector representation of the word at that index. So to get the representation of all the center words in the batch, we get the slice of all corresponding rows in the embedding matrix. TensorFlow provides a convenient method to do so called `tf.nn.embedding_lookup()`. This method is really useful when it comes to matrix multiplication with one-hot vectors because it saves us from doing a bunch of unnecessary computation that will return 0 anyway.

```python
# input -> hidden layer
embed = tf.nn.embedding_lookup(embed_matrix, center_words)
```

**Step 4: Define the loss function and optimizer** For nce_loss, we need weights and biases for the hidden layer to calculate negative sampling loss.

```python
# hidden layer -> output layer's weights
output_weight = tf.Variable(
    tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev = 1.0 / EMBED_SIZE ** 0.5))

output_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))

# hidden layer -> output layer + negative sampling loss
loss = tf.nn.sampled_softmax_loss(
    weights = output_weight, biases = output_bias,
    labels = target_words, inputs = embed,
    num_sampled = NUM_SAMPLED, num_classes = VOCAB_SIZE)

avg_loss = tf.reduce_mean(loss, name = 'loss')

# choose an optimizer to perform the heavy lifting
optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
optimize = optimizer.minimize(avg_loss)
```

After defining the operations we will create the session to execute the computation, including feeding in the inputs, running the optimizer to minimize the objective function we just defined and fetch the loss value so we can check convergence. The following code chunk pretty much dumped everything into one giant function

In [None]:
from tf_word2vec_no_frill import tf_word2vec, build_vocab


VOCAB_SIZE = 20000  # 50000
BATCH_SIZE = 128
EMBED_SIZE = 128  # dimension of the word embedding vectors
WINDOW_SIZE = 5  # the context window 
NUM_SAMPLED = 64  # Number of negative examples to sample
LEARNING_RATE = 0.1
EPOCHS = 10000
TENSORBOARD = './graphs/no_frills/'
word2vec_param = {'vocab_size': VOCAB_SIZE, 'batch_size': BATCH_SIZE,
                  'embed_size': EMBED_SIZE, 'num_sampled': NUM_SAMPLED,
                  'learning_rate': LEARNING_RATE, 'epochs': EPOCHS,
                  'tensorboard': TENSORBOARD, 'window_size': WINDOW_SIZE}

In [None]:
# words = read_data()
# words = [word.lower() 
#          for doc in documents
#          for word in doc
#          if word not in ENGLISH_STOP_WORDS]
# with open(BIGRAM_PATH) as f:
#     words = f.read().split()

# data, word_count, word_index, word_index_rev = build_dataset(words, VOCAB_SIZE)
# iterator = generate_sample(indexed_words = data, window = WINDOW_SIZE)
# batch_gen = get_batch(iterator, BATCH_SIZE)

# actual model training
# word_vectors, history = tf_word2vec(batch_gen, **word2vec_param)
# word_vectors, history = tf_word2vec(data, **word2vec_param)

In [None]:
from tf_word2vec_no_frill import tf_word2vec, build_vocab


# VOCAB_SIZE = 20000  # 50000
BATCH_SIZE = 128
EMBED_SIZE = 128  # dimension of the word embedding vectors
WINDOW_SIZE = 5  # the context window 
NUM_SAMPLED = 64  # Number of negative examples to sample
LEARNING_RATE = 0.01
EPOCHS = 100000
TENSORBOARD = './graphs/no_frills/'
word2vec_param = {'batch_size': BATCH_SIZE,
                  'embed_size': EMBED_SIZE, 'num_sampled': NUM_SAMPLED,
                  'learning_rate': LEARNING_RATE, 'epochs': EPOCHS,
                  'tensorboard': TENSORBOARD, 'window_size': WINDOW_SIZE}

In [None]:
sentences = []
with open(BIGRAM_PATH) as f:
    for line in f:
        sentences.append(line.split())


vocab, index2word = build_vocab(sentences)
word_vectors, history = tf_word2vec(sentences, vocab, **word2vec_param)

In [None]:
# visualize the convergence or course
# we can also do this within tensorboard
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

plt.plot(history)
plt.title('Convergence Plot')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.show()

In [None]:
top = 500

embedding = word_vectors[:top]

metadata_file = 'top_vocab.tsv'
top_words = pd.DataFrame([word_index_rev[i] for i in range(top)])
top_words.to_csv(metadata_file, sep = '\t', index = False, header = False)
top_words.head()

To terminate the tensorboard visualization in jupyter notebook we can go to the dropdown menu at the top: `Kernel -> Interrupt`