This notebook demonstrates skip-gram word2vec using TensorFlow.

It is motivated by an awesome blog post, http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/ , with some variations in the Python code.

In [13]:
import six
import os
import six
import urllib
import zipfile
import datetime
import collections
import nltk
import numpy as np
import tensorflow as tf

In [14]:
def maybeDownload(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path of the file. It is also appended to `url` to
            form the address that is fetched when the file is missing.
        url: Base URL; `url + filename` is downloaded when `filename` does
            not exist locally.
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size of the local file differs from
            `expected_bytes` (the actual size is printed first).
    """
    # `import urllib` on its own does not guarantee that the `urllib.request`
    # submodule is loaded, so import it explicitly where it is needed.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


In [15]:
def readData(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to a zip archive.

    Returns:
        A list of whitespace-separated tokens from the first archive member,
        decoded as UTF-8.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain bytes.decode is enough here; no TensorFlow helper is needed
        # just to turn the raw bytes into text.
        text = archive.read(first_member).decode('utf-8')
    return text.split()


In [16]:
def buildDataset(words, topK=10000):
    """Encode a token sequence as integer indices over a capped vocabulary.

    The vocabulary holds the placeholder 'UNK' at index 0 followed by the
    `topK - 1` most frequent tokens in descending frequency order. Every token
    outside the vocabulary is encoded as 0.

    Args:
        words: Sequence of tokens (iterated twice, so pass a list rather than
            a one-shot iterator).
        topK: Vocabulary size including the 'UNK' placeholder.

    Returns:
        A tuple `(data, word_counts, word2idx, idx2word)`:
            data: list with the index of every token in `words`.
            word_counts: `[['UNK', n_unknown], (word, count), ...]`.
            word2idx: dict mapping token -> index.
            idx2word: dict mapping index -> token.
    """
    counter = collections.Counter(words)
    # A literal 'UNK' token in the corpus would otherwise overwrite the
    # reserved index 0 and make two words share one index. Fold it into the
    # placeholder instead.
    counter.pop('UNK', None)

    word_counts = [['UNK', -1]]
    word_counts.extend(counter.most_common(topK - 1))

    word2idx = {word: idx for idx, (word, _) in enumerate(word_counts)}
    data = [word2idx.get(word, 0) for word in words]

    # Index 0 is used only for unknown tokens, so its frequency in `data`
    # is the unknown count.
    word_counts[0][1] = data.count(0)

    idx2word = {idx: word for word, idx in word2idx.items()}
    return data, word_counts, word2idx, idx2word


In [17]:
def collectData(vocabulary_size=10000, remove_stop_words=True):
    """Fetch the text8 archive and turn it into an indexed dataset.

    Args:
        vocabulary_size: Vocabulary size handed to `buildDataset`.
        remove_stop_words: When true, tokens found in NLTK's English
            stop-word list are dropped before the dataset is built.

    Returns:
        The `(data, word_counts, word2idx, idx2word)` tuple produced by
        `buildDataset`.
    """
    archive_path = maybeDownload('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    tokens = readData(archive_path)

    if remove_stop_words:
        english_stops = set(nltk.corpus.stopwords.words('english'))
        tokens = list(filter(lambda tok: tok not in english_stops, tokens))

    encoded, counts, to_index, to_word = buildDataset(tokens, vocabulary_size)
    return encoded, counts, to_index, to_word


In [18]:
# Read cursor into the corpus; it persists between generateBatch calls.
data_idx = 0
def generateBatch(data, batch_size, num_skips, window_size):
    """Produce one skip-gram training batch and advance the global cursor.

    A window of `2 * window_size + 1` consecutive entries slides over `data`
    (wrapping around at the end). For each window position, the entry in the
    middle is paired with `num_skips` distinct, randomly chosen other entries
    of the same window.

    Args:
        data: Indexable sequence of word indices.
        batch_size: Number of pairs to emit; must be a multiple of `num_skips`.
        num_skips: Pairs generated per window position; at most
            `2 * window_size`.
        window_size: Number of neighbours considered on each side.

    Returns:
        `(input_words, output_words)`: two int32 arrays of length
        `batch_size`. `input_words[k]` is the middle entry of a window and
        `output_words[k]` is one of its sampled neighbours.
    """
    global data_idx
    assert batch_size % num_skips == 0
    assert num_skips <= 2*window_size

    input_words = np.empty(batch_size, dtype=np.int32)
    output_words = np.empty(batch_size, dtype=np.int32)

    span = 2 * window_size + 1
    centre = window_size
    window = collections.deque(maxlen=span)

    # Fill the first window.
    for _ in range(span):
        window.append(data[data_idx])
        data_idx = (data_idx + 1) % len(data)

    row = 0
    for _ in range(batch_size // num_skips):
        taken = [centre]
        pick = centre
        for _ in range(num_skips):
            # Rejection-sample a window slot that has not been used yet.
            while pick in taken:
                pick = np.random.randint(0, span)
            taken.append(pick)
            input_words[row] = window[centre]
            output_words[row] = window[pick]
            row += 1
        # Slide the window one entry forward (the deque drops the oldest).
        window.append(data[data_idx])
        data_idx = (data_idx + 1) % len(data)

    # Step the cursor back by one window length before returning.
    data_idx = (data_idx + len(data) - span) % len(data)
    return input_words, output_words
        

In [19]:
# Build the indexed corpus once. `vocabulary_size`, `data` and `idx2word`
# are read by the graph-construction and training cells further down.
vocabulary_size = 10000
data, word_counts, word2idx, idx2word = collectData(vocabulary_size, remove_stop_words=True)

Found and verified text8.zip


In [27]:
# Batch hyper-parameters consumed by generateBatch, which asserts that
# batch_size is a multiple of num_skips and that num_skips <= 2 * window_size.
batch_size = 256
num_skips = 4
window_size = 5
# Uncomment to smoke-test batch generation (note: this advances the global data_idx cursor):
# input_words, output_words = generateBatch(data, batch_size, num_skips, window_size)

In [21]:
# Validation words: val_size distinct indices drawn from range(val_window).
# buildDataset assigns indices in descending frequency order (0 is 'UNK'),
# so these are sampled from the most frequent entries of the vocabulary.
# NOTE(review): no random seed is set, so the sample differs between runs.
val_size = 16
val_window = 100
val_examples = np.random.choice(val_window, val_size, replace=False)

In [22]:
embedding_size = 300
learning_rate = 0.01
graph = tf.Graph()
with graph.as_default():

    # Placeholders for one mini-batch of (input word, output word) index
    # pairs, plus the fixed validation word indices.
    training_inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size])
    training_outputs = tf.placeholder(dtype=tf.int32, shape=[batch_size])
    val_inputs = tf.constant(val_examples, dtype=tf.int32)

    # Embedding matrix: one row of size embedding_size per vocabulary entry.
    embedding_matrix = tf.get_variable(name='embedding_matrix', dtype=tf.float64,
                                       shape=[vocabulary_size, embedding_size],
                                       initializer=tf.contrib.layers.xavier_initializer())
    embeddings = tf.nn.embedding_lookup(embedding_matrix, training_inputs)  # shape: [batch_size, embedding_size]

    # Weight and bias of the softmax layer over the full vocabulary.
    W = tf.get_variable(name='W', dtype=tf.float64,
                        shape=[vocabulary_size, embedding_size],
                        initializer=tf.contrib.layers.xavier_initializer())
    b = tf.get_variable(name='b', dtype=tf.float64,
                        shape=[1, vocabulary_size],
                        initializer=tf.zeros_initializer())

    # Logits, shape: [batch_size, vocabulary_size]
    Z = tf.add(tf.matmul(embeddings, W, transpose_b=True), b)

    # Cross-entropy objective and Adam optimizer. The sparse variant takes the
    # integer labels directly, so no [batch_size, vocabulary_size] one-hot
    # tensor has to be materialised.
    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=training_outputs, logits=Z))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # Cosine similarity between each validation word and every vocabulary
    # entry. Rows are scaled to unit L2 norm (square root of the summed
    # squares), so the dot product of two rows is their cosine.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_matrix), 1, keep_dims=True))
    normalized_embeddings = embedding_matrix / norm
    val_embeddings = tf.nn.embedding_lookup(normalized_embeddings, val_inputs)
    # shape: [val_size, vocabulary_size] -- column j is vocabulary index j.
    similarity = tf.matmul(val_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()

In [34]:
# Number of nearest neighbours printed per validation word.
top_k = 8
def run(graph, num_iters, interval_size=100, print_valida_similarity=True):
    """Train the model and return the history of averaged losses.

    Relies on the tensors/ops and settings defined at notebook level (`init`,
    `training_inputs`, `training_outputs`, `optimizer`, `cost`, `similarity`,
    `data`, `batch_size`, `num_skips`, `window_size`, `val_size`,
    `val_examples`, `idx2word`).

    Args:
        graph: The tf.Graph the session is created for.
        num_iters: Number of training steps (one mini-batch each).
        interval_size: An averaged loss is recorded every `interval_size`
            steps (skipping step 0).
        print_valida_similarity: When true, every `10 * interval_size` steps
            the `top_k` highest-similarity entries for each validation word
            are printed; column indices of `similarity` are looked up in
            `idx2word`.

    Returns:
        A list with the mean batch loss of each completed interval.
    """
    training_history = []
    with tf.Session(graph=graph) as session:
        session.run(init)

        loss_sum = 0.0
        batches_in_interval = 0
        for step in range(num_iters):
            batch_inputs, batch_outputs = generateBatch(data, batch_size, num_skips, window_size)

            feed_dict = {training_inputs: batch_inputs, training_outputs: batch_outputs}
            _, batch_cost = session.run([optimizer, cost], feed_dict=feed_dict)
            loss_sum += batch_cost
            batches_in_interval += 1

            if step > 0 and step % interval_size == 0:
                # Divide by the number of batches actually accumulated: the
                # first interval covers steps 0..interval_size inclusive.
                training_history.append(loss_sum / batches_in_interval)
                loss_sum = 0.0
                batches_in_interval = 0

            # Parentheses are required: `%` and `*` share precedence, so
            # `step % 10*interval_size` would be `(step % 10) * interval_size`.
            if print_valida_similarity and step > 0 and step % (10 * interval_size) == 0:
                sim = session.run(similarity)
                for i in range(val_size):
                    val_word = idx2word[val_examples[i]]
                    # Descending similarity; position 0 is skipped as the word itself.
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    print('top {0:} words closest to {1:}'.format(top_k, val_word))
                    for neighbour_idx in nearest:
                        print('  ' + idx2word[neighbour_idx])
                print('------------------')

    return training_history

In [36]:
# Train for at most 50000 steps (fewer only if the corpus is tiny) and
# measure the wall-clock time of the whole run. Validation printing is off.
num_iters = min(10 * len(data), 50000)
start_time = datetime.datetime.now()
training_history = run(graph, num_iters=num_iters, print_valida_similarity=False)
elapsed_time = datetime.datetime.now() - start_time

print('elapsed time: {0:}'.format(elapsed_time))

elapsed time: 3:09:47.107237
