In [1]:
import numpy as np
import zipfile
import tensorflow as tf
import collections
import random
import math
from sklearn.manifold import TSNE
from matplotlib import pylab

  return f(*args, **kwds)


In [2]:
text8_filepath = "text8.zip"

def loadFile(filename):
    """Read the first member of a zip archive and split it into tokens.

    Args:
        filename: Path to a zip archive whose first member holds UTF-8 text.

    Returns:
        A list of str tokens obtained by splitting the decoded text on
        whitespace.

    Raises:
        ValueError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        members = archive.namelist()
        if not members:
            raise ValueError("zip archive {!r} contains no files".format(filename))
        with archive.open(members[0]) as data_file:
            # Decode with the standard library instead of tf.compat.as_str so
            # that loading the corpus does not depend on TensorFlow.
            return data_file.read().decode("utf-8").split()


In [3]:
def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    Args:
        words: Sequence of tokens (the corpus, in order).
        n_words: Vocabulary size including the "UNK" slot; the
            ``n_words - 1`` most common tokens get their own id.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the corpus as a list of ids (0 for tokens outside the
        vocabulary), ``count`` is ``[["UNK", n_unknown], (word, freq), ...]``,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word.
    """
    most_frequent = collections.Counter(words).most_common(n_words - 1)
    count = [["UNK", -1]] + most_frequent

    # Ids follow the order of `count`, so id 0 is "UNK" and lower ids are
    # more frequent words.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Anything missing from the vocabulary falls back to id 0.
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [4]:
def generate_batch(n, data, batch_size=10, num_skips=10, skip_window=5):
    """Build one skip-gram batch of (center word, context word) pairs.

    A window of ``2 * skip_window + 1`` ids slides over ``data`` starting at
    position ``n``. For each window, ``num_skips`` distinct context positions
    are sampled and paired with the center id.

    Args:
        n: Index in ``data`` where the first window starts.
        data: Sequence of integer word ids.
        batch_size: Number of pairs to emit; must be a multiple of
            ``num_skips``.
        num_skips: Pairs generated per center word; at most
            ``2 * skip_window``.
        skip_window: Number of words considered on each side of the center.

    Returns:
        ``(batch, labels, n)`` where ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding center ids, ``labels`` is an int32 array of
        shape ``(batch_size, 1)`` holding context ids, and ``n`` is the start
        position to pass to the next call.

    Raises:
        ValueError: If ``batch_size`` is not a multiple of ``num_skips``, if
            ``num_skips`` exceeds the number of context positions, or if
            ``data`` is shorter than one window.
    """
    span = 2 * skip_window + 1

    # Without these checks the trailing slots of the output arrays would be
    # returned uninitialised (np.empty does not zero memory).
    if num_skips <= 0 or batch_size % num_skips != 0:
        raise ValueError("batch_size must be a positive multiple of num_skips")
    if num_skips > 2 * skip_window:
        raise ValueError("num_skips cannot exceed 2 * skip_window")
    if len(data) < span:
        raise ValueError("data must contain at least 2 * skip_window + 1 items")

    batch = np.empty(shape=(batch_size,), dtype=np.int32)
    labels = np.empty(shape=(batch_size, 1), dtype=np.int32)
    buffer = collections.deque(maxlen=span)

    # Restart from the beginning when a full window no longer fits.
    if n + span > len(data):
        n = 0

    buffer.extend(data[n: n + span])
    n += span

    # Window offsets of every context position (everything but the center).
    context_words = [w for w in range(span) if w != skip_window]

    for i in range(batch_size // num_skips):
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]

        if n == len(data):
            # Reached the end of the corpus: wrap around to the first window.
            buffer.clear()
            buffer.extend(data[:span])
            n = span
        else:
            # maxlen=span makes append() drop the oldest id, sliding the window.
            buffer.append(data[n])
            n += 1

    # Step back by one window so the next call starts where the current
    # window begins.
    n = (n + len(data) - span) % len(data)

    return batch, labels, n


    

In [5]:
# n = 0
# batch, labels, n = generate_batch(n, data, batch_size=8, num_skips=2, skip_window=1)
# # print(labels[1: 10])
# for i in range(8):
#         print(batch[i], reverse_dictionary[batch[i]],
#             '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

In [6]:
# Hyperparameters for the skip-gram model and its training batches.
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
num_sampled = 64
vocabulary_size = 50000

# Validation words: valid_size distinct ids drawn from range(valid_window).
# build_dataset assigns ids in frequency order, so these are among the most
# frequent entries of the vocabulary.
valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window), valid_size))

graph = tf.Graph()

with graph.as_default():
    # Inputs fed on every training step: center word ids and context word ids.
    training_data = tf.placeholder(tf.int32, shape=[batch_size])
    training_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    validation_set = tf.constant(valid_examples, dtype=tf.int32)
    
    # One embedding_size-dimensional vector per vocabulary id, initialised
    # uniformly in [-1, 1).
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], 
                                                -1.0, 1.0))
    
    # Output-layer parameters consumed by the sampled softmax loss.
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], 
                                                       stddev=1.0/math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Model: look up the embedding of every center word in the batch.
    embed = tf.nn.embedding_lookup(embeddings, training_data)
    
    # Mean sampled-softmax loss over the batch, using num_sampled sampled
    # classes per step instead of the full vocabulary.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                  labels=training_labels, num_sampled=num_sampled, num_classes=vocabulary_size)
    )
    
    
    # Optimizer: Adagrad with learning rate 1.0.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    
    # Compute the similarity between the validation words and every word in
    # the vocabulary: rows are scaled to unit L2 norm, so the matrix product
    # below is cosine similarity.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, validation_set)
    
    # Shape [valid_size, vocabulary_size].
    # NOTE(review): `similarity` is never evaluated in the cells visible here.
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

    
def get_flattened_embeddings(embeddings):
    """Project embedding rows to 2-D with t-SNE, skipping the first row.

    Args:
        embeddings: 2-D array of embedding vectors, one per row. Row 0 is
            dropped before fitting.

    Returns:
        The array returned by ``TSNE.fit_transform`` for rows ``1:`` of the
        input, with two components per row.
    """
    reducer = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    rows_after_first = embeddings[1:, :]
    return reducer.fit_transform(rows_after_first)

def flattened_embedding_to_graph_data(embeddings, labels):
    """Pair each 2-D point with its label as ``[x, y, label]`` rows.

    Args:
        embeddings: Array with a ``tolist()`` method yielding ``[x, y]`` rows.
        labels: Iterable of labels, aligned with the rows of ``embeddings``.

    Returns:
        A list of ``[x, y, label]`` lists, truncated to the shorter input.
    """
    # Use the `labels` argument; the previous version silently read the
    # global `words` and ignored what the caller passed in.
    return [[x, y, label] for (x, y), label in zip(embeddings.tolist(), labels)]

In [None]:
# Train the skip-gram model and periodically capture 2-D t-SNE snapshots of
# the first num_points word embeddings.
num_steps = 100001
raw_text = loadFile(text8_filepath)
data, count, dictionary, reverse_dictionary = build_dataset(raw_text, vocabulary_size)
average_loss = 0

# One list of [x, y, word] rows is appended per polling interval.
graph_data = []
graph_data_polling_interval = 1000
num_points = 100
# Labels for ids 1..num_points; id 0 is the "UNK" slot and is skipped.
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
tf.logging.set_verbosity(tf.logging.INFO)
with tf.Session(graph=graph, config=tf.ConfigProto(log_device_placement=True)) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    # Read position in `data`; generate_batch returns the updated position.
    n = 0
    for step in range(num_steps):
        
        batch_data, batch_labels, n = generate_batch(n, data, batch_size=batch_size, 
                                                num_skips=num_skips, skip_window=skip_window)
        
        feed_dict = {training_data: batch_data, 
                     training_labels: batch_labels}
        
        # Run one optimisation step and fetch the batch loss.
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        
        average_loss += l
        if step % graph_data_polling_interval == 0:
            # At step 0 only one loss has been accumulated, so it is reported
            # as-is; afterwards the sum covers a full polling interval.
            if step > 0:
                average_loss = average_loss / graph_data_polling_interval
            
            # Rows 0..num_points are passed in; get_flattened_embeddings drops
            # row 0, leaving rows 1..num_points to line up with `words`.
            current_embeddings = get_flattened_embeddings(normalized_embeddings.eval()[:num_points + 1])
            graph_data.append(flattened_embedding_to_graph_data(current_embeddings, words))
            
            print("step: {}, loss: {}".format(step, average_loss))
            
            average_loss = 0
            
    # Keep the trained, L2-normalised embeddings for use after the session closes.
    final_embeddings = normalized_embeddings.eval()

    print()

Initialized
step: 0, loss: 7.5491437911987305
step: 1000, loss: 4.706552924633026
step: 2000, loss: 4.042916623830795
step: 3000, loss: 3.960678095340729
step: 4000, loss: 3.8947622339725494
step: 5000, loss: 3.8269441561698914
step: 6000, loss: 3.783018235683441
step: 7000, loss: 3.771661952257156
step: 8000, loss: 3.7516676864624023
step: 9000, loss: 3.702561943531036


In [None]:
import json
def save_to_file(data, filename):
    """Write t-SNE snapshots to a JSON file in the current directory.

    Args:
        data: Iterable of snapshots; each snapshot is an iterable of rows
            indexable as ``row[0]`` (x), ``row[1]`` (y) and ``row[2]`` (word).
        filename: File name, resolved relative to ``./``.

    The JSON document maps snapshot index -> row index ->
    ``{"x": ..., "y": ..., "word": ...}``.
    """
    json_object = {
        snapshot_index: {
            row_index: {"x": point[0], "y": point[1], "word": point[2]}
            for row_index, point in enumerate(snapshot)
        }
        for snapshot_index, snapshot in enumerate(data)
    }

    target_path = "./" + filename
    with open(target_path, "w+") as out_file:
        json.dump(json_object, out_file)

# Persist the t-SNE snapshots collected during training to ./snapshots.json.
save_to_file(graph_data, "snapshots.json")

In [None]:
num_points = 100
# Reuse the shared t-SNE helper instead of repeating its configuration here.
# The helper drops row 0, so passing rows 0..num_points projects exactly
# rows 1..num_points of the final embeddings.
two_d_embeddings = get_flattened_embeddings(final_embeddings[:num_points + 1])


def plot(embeddings, labels):
    """Draw a labelled scatter plot of 2-D points.

    Args:
        embeddings: 2-D array with at least ``len(labels)`` rows, each
            holding an ``(x, y)`` pair.
        labels: Sized iterable of annotation texts; the i-th label is
            attached to row ``i`` of ``embeddings``.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    label_count = len(labels)
    assert embeddings.shape[0] >= label_count, 'More labels than embeddings'

    pylab.figure(figsize=(15,15))  # in inches
    for row_index, text in enumerate(labels):
        x_coord, y_coord = embeddings[row_index, :]
        # One scatter call per point, followed by its text annotation.
        pylab.scatter(x_coord, y_coord)
        pylab.annotate(
            text,
            xy=(x_coord, y_coord),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )
    pylab.show()

# Labels for ids 1..num_points, matching the rows projected into
# two_d_embeddings (id 0 is excluded from both).
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

In [None]:
# `span` exists only as a local inside generate_batch, so referencing it here
# raises NameError on a fresh kernel; recompute the window size instead.
print(2 * skip_window + 1)