In [7]:
import numpy as np
import zipfile
import tensorflow as tf
import collections
import random
import math
from sklearn.manifold import TSNE
from matplotlib import pylab

In [8]:
text8_filepath = "text8.zip"

def loadFile(filename):
    with zipfile.ZipFile(filename) as _zipfile:
        with _zipfile.open(_zipfile.namelist()[0]) as data_file:
            data = tf.compat.as_str(data_file.read()).split()
    return data


In [9]:
def build_dataset(words, n_words):
    count = [["UNK", -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    
    for word, _ in count:
        dictionary[word] = len(dictionary)
        
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [10]:
def generate_batch(n, data, batch_size=10, num_skips=10, skip_window=5):
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    
    if n + span > len(data):
        n = 0
    
    buffer.extend(data[n: n + span])
    n += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        
        if n == len(data):
            buffer[:] = data[:span]
            n = span
        else:
            buffer.append(data[n])
            n += 1
        
    n =  (n + len(data) - span) % len(data)

    return batch, labels, n


    

In [11]:
# n = 0
# batch, labels, n = generate_batch(n, data, batch_size=8, num_skips=2, skip_window=1)
# # print(labels[1: 10])
# for i in range(8):
#         print(batch[i], reverse_dictionary[batch[i]],
#             '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

In [12]:
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
num_sampled = 64
vocabulary_size = 50000

valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window), valid_size))

graph = tf.Graph()

with graph.as_default():
    training_data = tf.placeholder(tf.int32, shape=[batch_size])
    training_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    validation_set = tf.constant(valid_examples, dtype=tf.int32)
    
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], 
                                                -1.0, 1.0))
    
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], 
                                                       stddev=1.0/math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Model
    embed = tf.nn.embedding_lookup(embeddings, training_data)
    
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                  labels=training_labels, num_sampled=num_sampled, num_classes=vocabulary_size)
    )
    
    
    # optimizer
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    
    # compute similarity between
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, validation_set)
    
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))


AttributeError: module 'tensorflow' has no attribute 'Graph'

In [13]:
num_steps = 100001
raw_text = loadFile(text8_filepath)
data, count, dictionary, reverse_dictionary = build_dataset(raw_text, vocabulary_size)
average_loss = 0
graph_data = []
graph_data_polling_interval = 500
num_points = 400

with tf.Session(graph=graph, config=tf.ConfigProto(log_device_placement=True)) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    n = 0
    for step in range(num_steps):
        
        batch_data, batch_labels, n = generate_batch(n, data, batch_size=batch_size, 
                                                num_skips=num_skips, skip_window=skip_window)
        
        feed_dict = {training_data: batch_data, 
                     training_labels: batch_labels}
        
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        
        average_loss += l
        if step % graph_data_polling_interval == 0:
            if step > 0:
                average_loss = average_loss / graph_data_polling_interval
                # The average loss is an estimate of the loss over the last 2000 batches.
#             print('Average loss at step %d: %f' % (step, average_loss))
            current_embeddings = get_flattened_embeddings(normalized_embeddings.eval()[:num_points + 1])
            graph_data.append(flattened_embedding_to_graph_data(current_embeddings, words))
            average_loss = 0
        
        if step % 2000 == 0:
            print("Step {}".format(step))
    final_embeddings = normalized_embeddings.eval()

    

    

AttributeError: module 'tensorflow' has no attribute 'compat'

In [14]:
num_points = 400
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])


def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15,15))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i,:]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

NameError: name 'final_embeddings' is not defined

In [26]:
def get_flattened_embeddings(embeddings):
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    embeddings = tsne.fit_transform(embeddings[1:, :])
    return embeddings

def flattened_embedding_to_graph_data(embeddings, labels):
    return [[a, b, c] for (a, b), c in zip(embeddings.tolist(), words)]

flattened_embeddings = get_flattened_embeddings(final_embeddings[:num_points, :])
_data = flattened_embedding_to_graph_data(flattened_embeddings, word)

In [30]:
flattened_embeddings = get_flattened_embeddings(final_embeddings[:num_points, :])
_data = flattened_embedding_to_graph_data(flattened_embeddings, words)
_data

[[-41.31324768066406, 25.045658111572266, 'the'],
 [18.37194061279297, 28.956485748291016, 'of'],
 [11.772801399230957, 25.642641067504883, 'and'],
 [42.3211669921875, -20.980093002319336, 'one'],
 [24.41452980041504, 48.44853591918945, 'in'],
 [-38.73276901245117, 26.362993240356445, 'a'],
 [-40.42744827270508, 54.632171630859375, 'to'],
 [40.240360260009766, -29.392635345458984, 'zero'],
 [47.403594970703125, -26.09493064880371, 'nine'],
 [40.764522552490234, -23.179250717163086, 'two'],
 [35.698543548583984, 22.321081161499023, 'is'],
 [27.963716506958008, 33.358604431152344, 'as'],
 [46.277313232421875, -28.03636932373047, 'eight'],
 [11.868157386779785, 42.699676513671875, 'for'],
 [-49.99739074707031, 17.042314529418945, 's'],
 [42.330833435058594, -27.11469268798828, 'five'],
 [43.099945068359375, -24.7246150970459, 'three'],
 [37.11980438232422, 22.270479202270508, 'was'],
 [24.714427947998047, 31.831531524658203, 'by'],
 [-11.484792709350586, 51.977447509765625, 'that'],
 [40.

In [28]:
def flattened_embedding_to_graph_data(embeddings, labels):
    return [[a, b, c] for (a, b), c in zip(embeddings.tolist(), words)]

graph_data = get_flattened_embeddings(final_embeddings[:num_points, :], words)
graph_data

TypeError: get_flattened_embeddings() takes 1 positional argument but 2 were given