In [1]:
import numpy as np
import tensorflow as tf
import os

# 0.1 get_data.py

In [2]:
# import tensorflow as tf
import collections
import os
import random
import pickle
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
from tqdm import tqdm

# Step 1: Download the data.
# Base address; the archive name is appended to it to form the download URL.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` unless it already exists locally, then verify its size.

  Args:
    filename: local path of the file; it is also appended to the module-level
      `url` to build the address to download from.
    expected_bytes: exact size in bytes the file on disk must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: if the size of the file on disk differs from `expected_bytes`
      (the actual size is printed first).
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urllib.request.urlretrieve(url + filename, filename)

  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')

  print('Found and verified', filename)
  return filename

# Fetch the text8 archive (unless it is already on disk) and check that the
# file is exactly 31344016 bytes long.
filename = maybe_download('text8.zip', 31344016)

# Read the data into a list of strings.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words.

  Args:
    filename: path of the zip archive to open.

  Returns:
    A list of str tokens obtained by decoding the archive's first member as
    UTF-8 and splitting it on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    # Decode with the standard library rather than tf.compat.as_str, so that
    # reading the corpus does not depend on TensorFlow having been imported
    # by some other cell.
    text = archive.read(first_member).decode('utf-8')
  return text.split()

# Load the whole corpus into memory as one flat list of word strings and
# report how many words it contains.
words = read_data(filename)
print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words):
  """Encode a word list as integer ids, folding rare words into 'UNK'.

  The `vocabulary_size - 1` most common words each receive their own id, in
  order of decreasing frequency; every other word is encoded as 0.

  Args:
    words: list of word strings (it is traversed more than once).

  Returns:
    A tuple `(data, count, dictionary, reverse_dictionary)` where
      data: list with the id of every word in `words`, in order.
      count: `['UNK', n_unknown]` followed by `(word, frequency)` pairs.
      dictionary: mapping from word to id.
      reverse_dictionary: mapping from id back to word.
  """
  most_frequent = collections.Counter(words).most_common(vocabulary_size - 1)
  count = [['UNK', -1]] + most_frequent

  # Ids are handed out in the order of `count`; 'UNK' is inserted first.
  dictionary = {}
  for token, _ in count:
    dictionary[token] = len(dictionary)

  data = []
  unk_count = 0
  for token in words:
    index = dictionary.get(token)
    if index is None:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  # Replace the -1 placeholder with the real number of out-of-vocabulary words.
  count[0][1] = unk_count

  reverse_dictionary = {index: token for token, index in dictionary.items()}
  return data, count, dictionary, reverse_dictionary

# Encode the corpus: `data` is the list of word ids, `count` holds the word
# frequencies (with the UNK total in its first entry), and `dictionary` /
# `reverse_dictionary` map word -> id and id -> word respectively.
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.

# Cursor into `data`; generate_batch advances it so that successive calls walk
# through the corpus.
data_index = 0
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Produce one batch of (centre word, context word) id pairs.

  A window of `2 * skip_window + 1` consecutive ids slides over the
  module-level `data`, starting at the module-level cursor `data_index` and
  wrapping around at the end of the corpus. For every window position,
  `num_skips` distinct context positions are drawn at random and paired with
  the centre id.

  Args:
    batch_size: number of pairs to produce; must be a multiple of `num_skips`.
    num_skips: number of context words sampled per centre word; at most
      `2 * skip_window`.
    skip_window: number of words considered on each side of the centre word.

  Returns:
    A tuple `(batch, labels)`: `batch` is an int32 array of shape
    `(batch_size,)` holding centre ids, `labels` an int32 array of shape
    `(batch_size, 1)` holding the matching context ids.

  Raises:
    AssertionError: if either constraint on `num_skips` is violated.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      # Redraw until a position is found that is neither the centre nor an
      # already used context position.
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    # Slide the window one word to the right (the deque drops the oldest id).
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  # Backtrack by one window so that the next call resumes with the centre word
  # immediately after the last one used here. Without this, the next call
  # refills the window from scratch and the final `span` words of every batch
  # are never used as centre words.
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

batch_size = 128
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Step 5: Pre-generate every training batch. Nothing is trained here; the
# batches are only collected so they can be written to disk below.
num_steps = 30000
training_data = []
for step in tqdm(range(num_steps)):
  batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
  training_data.append((batch_inputs, batch_labels))

data_folder = './data'
if not os.path.exists(data_folder):
  os.makedirs(data_folder)

# Each file is written inside a `with` block so its handle is flushed and
# closed as soon as the dump finishes, and every path is derived from
# `data_folder` rather than repeating the directory name.
print('Saving training data...')
with open(os.path.join(data_folder, 'train.p'), 'wb') as train_file:
  pickle.dump(training_data, train_file)
print('Saving validation data...')
with open(os.path.join(data_folder, 'val.p'), 'wb') as val_file:
  pickle.dump(valid_examples, val_file)
print('Saving reverse_dictionary')
with open(os.path.join(data_folder, 'reverse_dictionary.p'), 'wb') as dictionary_file:
  pickle.dump(reverse_dictionary, dictionary_file)

Found and verified text8.zip
Data size 17005207


100%|██████████| 30000/30000 [00:14<00:00, 2009.23it/s]


Saving training data...
Saving validation data...
Saving reverse_dictionary


In [3]:
# List the contents of the current working directory.
os.listdir('./')

['__notebook_source__.ipynb', 'text8.zip', '.ipynb_checkpoints', 'data']

In [4]:
# List the contents of the ./data/ folder.
os.listdir('./data/')

['reverse_dictionary.p', 'train.p', 'val.p']

# 0.2 utils.py

In [7]:
import matplotlib
# matplotlib.use('TKAgg')
from matplotlib import pyplot as plt
import pickle
import numpy as np

from sklearn.manifold import TSNE

def load_data():
    """Load the pickled training data, validation data and reverse dictionary.

    Reads ``./data/train.p``, ``./data/val.p`` and
    ``./data/reverse_dictionary.p`` relative to the current working directory
    and prints a confirmation line after each one.

    Returns:
        A tuple ``(train_data, val_data, reverse_dictionary)`` with the three
        unpickled objects, in that order.
    """
    train_data_path = './data/train.p'
    val_data_path = './data/val.p'
    reverse_dictionary_path = './data/reverse_dictionary.p'

    # NOTE: pickle.load can execute arbitrary code while loading; only use it
    # on files that were produced locally and are trusted.
    # Each file is opened in a `with` block so the handle is closed right
    # after loading instead of being left for the garbage collector.
    with open(train_data_path, 'rb') as train_file:
        train_data = pickle.load(train_file)
    print("Loaded train data!")
    with open(val_data_path, 'rb') as val_file:
        val_data = pickle.load(val_file)
    print("Loaded val data!")
    with open(reverse_dictionary_path, 'rb') as dictionary_file:
        reverse_dictionary = pickle.load(dictionary_file)
    print("Loaded reverse dictionary!")
    return train_data, val_data, reverse_dictionary

def print_closest_words(val_index, nearest, reverse_dictionary):
    """Print a validation word followed by the words listed as nearest to it.

    The printed line has the form ``Nearest to <word>: <w1>, <w2>,`` (every
    neighbour is followed by a comma, including the last one).

    Args:
        val_index: key of the validation word in ``reverse_dictionary``.
        nearest: sequence of keys into ``reverse_dictionary``, one per
            neighbour, printed in the given order.
        reverse_dictionary: mapping from word index to word.
    """
    val_word = reverse_dictionary[val_index]
    # Iterate over `nearest` directly: the previous index loop used `xrange`,
    # a name that is not a Python 3 builtin and is not imported in this
    # section, so the function only worked if another cell had imported it.
    neighbours = "".join(" %s," % reverse_dictionary[index] for index in nearest)
    print("Nearest to %s:%s" % (val_word, neighbours))

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  """Scatter-plot 2-D points, annotate each with its label, and save the figure.

  Args:
    low_dim_embs: array whose rows each unpack into an (x, y) pair; it must
      have at least as many rows as there are labels.
    labels: sequence of annotation texts; the i-th label belongs to the i-th
      row of `low_dim_embs`.
    filename: path the figure is saved to.

  Raises:
    AssertionError: if there are more labels than rows in `low_dim_embs`.
  """
  num_points = low_dim_embs.shape[0]
  assert num_points >= len(labels), "More labels than embeddings"

  plt.figure(figsize=(18, 18))  # in inches
  # Placement shared by every annotation: a small offset from the point.
  annotation_style = dict(xytext=(5, 2),
                          textcoords='offset points',
                          ha='right',
                          va='bottom')
  # zip stops at the shorter input, so rows without a label are not drawn.
  for point, label in zip(low_dim_embs, labels):
    x, y = point
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), **annotation_style)

  plt.savefig(filename)

def visualize_embeddings(final_embeddings, reverse_dictionary, plot_only=500):
    """Project the leading embeddings to 2-D with t-SNE and plot them.

    Args:
        final_embeddings: 2-D array of embeddings, one row per word index.
        reverse_dictionary: mapping from word index to word, used to label
            the plotted points.
        plot_only: maximum number of leading rows to project and plot.
    """
    # Never ask for more points than there are embeddings; otherwise more
    # labels than projected rows would be handed to plot_with_labels.
    plot_only = min(plot_only, len(final_embeddings))
    # NOTE(review): recent scikit-learn releases appear to rename `n_iter` to
    # `max_iter` - confirm against the installed version.
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    # Builtin `range` instead of `xrange`, which is not defined in this
    # section under Python 3.
    labels = [reverse_dictionary[i] for i in range(plot_only)]
    plot_with_labels(low_dim_embs, labels)

# 1. word2vec

In [None]:
import math
import numpy as np
import tensorflow as tf
# from utils import *
'''
Consider the following sentence:
"the first cs224n homework was a lot of fun"
With a window size of 1, we have the dataset:
([the, cs224n], first), ([lot, fun], of) ...
Remember that Skipgram tries to predict each context word from 
its target word, and so the task becomes to predict 'the' and
'cs224n' from first, 'lot' and 'fun' from 'of' and so on.
Our dataset now becomes:
(first, the), (first, cs224n), (of, lot), (of, fun) ...
'''
# Let's define some constants first
batch_size = 128
vocabulary_size = 50000
embedding_size = 128  # Dimension of the embedding vector.
num_sampled = 64    # Number of negative examples to sample.

# load_data loads the already preprocessed training and val data.
#   - train data is a list of (batch_input, batch_labels) pairs.
#   - val data holds all validation inputs.
#   - reverse_dictionary is a python dict from word index to word.
train_data, val_data, reverse_dictionary = load_data()

# Every entry of train_data is one batch, hence the multiplication.
print("Number of training examples:", batch_size * len(train_data))
print("Number of validation examples:", len(val_data))

def skipgram():
    """Build the skip-gram graph: placeholders, NCE loss and similarity op.

    Reads the module-level ``batch_size``, ``vocabulary_size``,
    ``embedding_size``, ``num_sampled`` and ``val_data``.

    Returns:
        A tuple ``(batch_inputs, batch_labels, normalized_embeddings,
        similarity, loss)``:
          batch_inputs: int32 placeholder of shape ``[batch_size]``.
          batch_labels: int32 placeholder of shape ``[batch_size, 1]``.
          normalized_embeddings: the embedding table with every row divided
            by its L2 norm.
          similarity: product of the normalized validation embeddings with
            the transposed normalized table.
          loss: mean NCE loss over the batch.
    """
    batch_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    batch_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    val_dataset = tf.constant(val_data, dtype=tf.int32)

    # The embedding table and the NCE weights share the same shape.
    table_shape = [vocabulary_size, embedding_size]

    with tf.variable_scope('word2vec'):
        initial_embeddings = tf.random_uniform(table_shape, -1.0, 1.0)
        embeddings = tf.Variable(initial_embeddings)
        batch_embeddings = tf.nn.embedding_lookup(embeddings, batch_inputs)

        initial_weights = tf.truncated_normal(
            table_shape, stddev=1.0/math.sqrt(embedding_size))
        weights = tf.Variable(initial_weights)
        biases = tf.Variable(tf.zeros([vocabulary_size]))

        # This objective is maximized when the model assigns high probabilities
        # to the real words, and low probabilities to noise words.
        per_example_loss = tf.nn.nce_loss(weights=weights,
                                          biases=biases,
                                          labels=batch_labels,
                                          inputs=batch_embeddings,
                                          num_sampled=num_sampled,
                                          num_classes=vocabulary_size)
        loss = tf.reduce_mean(per_example_loss)

        # Row-wise L2 norm, kept as a column so the division broadcasts.
        squared_norm = tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)
        norm = tf.sqrt(squared_norm)
        normalized_embeddings = embeddings/norm

        val_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                val_dataset)
        similarity = tf.matmul(val_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

    return batch_inputs, batch_labels, normalized_embeddings, similarity, loss

def run():
    """Train the skip-gram model on ``train_data`` and return the embeddings.

    Builds the graph with ``skipgram()``, minimizes the loss with gradient
    descent (learning rate 1.0) over every batch in the module-level
    ``train_data``, prints the average loss every 1000 steps and the nearest
    words of each validation word every 5000 steps.

    Returns:
        The evaluated ``normalized_embeddings`` after the last batch.
    """
    # load model
    batch_inputs, batch_labels, normalized_embeddings, similarity, loss = skipgram()
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    init = tf.global_variables_initializer()

    top_k = 8  # number of nearest neighbors

    with tf.Session() as session:
        session.run(init)

        average_loss = 0
        for step, (inputs, labels) in enumerate(train_data):
            _, loss_val = session.run(
                [optimizer, loss],
                feed_dict={batch_inputs: inputs, batch_labels: labels})
            average_loss += loss_val

            if step % 1000 == 0:
                # At step 0 only a single loss value has been accumulated,
                # so it is reported without dividing.
                if step > 0:
                    average_loss /= 1000
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            if step % 5000 == 0:
                sim = session.run(similarity)
                for i, val_index in enumerate(val_data):
                    # Sort by descending similarity and skip the first entry.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    print_closest_words(val_index, nearest, reverse_dictionary)

        return session.run(normalized_embeddings)

# Let's start training. run() builds the graph, trains on every batch in
# train_data and returns the normalized embedding matrix.
final_embeddings = run()

# Visualize the embeddings: project them to 2-D with t-SNE and save the
# labelled scatter plot.
visualize_embeddings(final_embeddings, reverse_dictionary)