In [1]:
# import the required python files
import os
from six.moves.urllib.request import urlretrieve
import zipfile
import tensorflow as tf
import collections
import itertools #to show the items in dictionary
import numpy as np
import random
import math

First download the file from the source. This Udacity training is based on [Text8](http://mattmahoney.net/dc/textdata) dataset

In [2]:
url = 'http://mattmahoney.net/dc/'

def download_text8(file_name, expected_bytes):
    # Check the existing of the file and download if it does not exist
    if not os.path.exists(file_name):
        file_name, _ = urlretrieve(url+file_name, file_name)
    stat_info = os.stat(file_name)
    if stat_info.st_size == expected_bytes:
        print('Found and verified %s' % file_name)
    else:
        print(stat_info.st_size)
        raise Exception(
            'Failed to verify ' + file_name + '. Can you get to it with a browser?')
    return file_name

file_name = download_text8('text8.zip', 31344016)

Found and verified text8.zip


Read the data into a string.

In [3]:
def read_data(file_name):
    """Extract the first file enclosed in a zip file as a list of words"""
    with zipfile.ZipFile(file_name) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
  
words = read_data(file_name)
print('Data size %d' % len(words))
# Sample of the sentence splitted as words
print(words[0:100])

Data size 17005207
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing']


Build the dictionary and replace rare words with UNK token. using collections library

In [4]:
# Enter the most common words as vocabualry
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) # get the length of current dictionary as the index of the word
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.

In [5]:
# Get some information on the data counts and dictionaries
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
print('sample dictionary items: \n')
dicts= itertools.islice(dictionary.items(),0,5)
for key, values in dicts: 
    print(key, values)

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5240, 3084, 12, 6, 195, 2, 3135, 46, 59, 156]
sample dictionary items: 

toole 21274
dollars 3645
factorial 15110
synthesized 9137
uprooted 39723


Function to generate a training batch for the skip-gram model.

In [6]:
# batch_size determines how many words we need in the training step
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
    span = 2*skip_window +1 #[skip_window target skip_window]
    buffer =collections.deque(maxlen=span) #list like container with fast appends and pops on either end
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
        
    for i in range(batch_size // num_skips):
        target = skip_window # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):  # positions the words around the target in the "label" array
            while target in targets_to_avoid:
                target = random.randint(0, span -1)
            targets_to_avoid.append(target)
            batch[i*num_skips + j] = buffer[skip_window]
            labels[i*num_skips+j,0] = buffer[target]
        buffer.append(data[data_index]) # the append pops up the first buffer element and renew the target 
        data_index = (data_index + 1) % len(data)
    return batch, labels

print('data: ', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2,1), (4,2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\n with num_skip = %d and skip_window=%d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

data:  ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

 with num_skip = 2 and skip_window=1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'originated', 'a', 'as', 'term', 'of', 'a']

 with num_skip = 4 and skip_window=2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['originated', 'a', 'anarchism', 'term', 'term', 'of', 'originated', 'as']


Now, it is time to convert words to vectors and train the model using skip-gram model. [This blog](https://iksinc.wordpress.com/tag/skip-gram-model/) provided a good explanation for word2vec algorithms especially Skip gram and bagging word algorithms

In [8]:
batch_size = 128  # for training the model
embedding_size = 128 # dimension of embedding vector
# we will start from the second word in the document and consider one word to the right
# and one word to the left of the central target word
skip_window = 1 # How many words to consider left and right of the central word
num_skips = 2 # How many times to reuse an input to generate a label
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph_skip = tf.Graph()

with graph_skip.as_default(), tf.device('/cpu:0'):
    
    #input data
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size,1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    #variable
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    #Model
    #look up embeddings for inputs
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed, 
                                   labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))
    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities 
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.

    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))


In [9]:
num_steps = 100001

with tf.Session(graph=graph_skip) as sess:
    tf.global_variables_initializer().run()
    print('Initialized!!')
    average_loss = 0
    for step in range(num_steps):       
        batch_data, batch_label = generate_batch(batch_size=batch_size, 
                                                 num_skips=num_skips, skip_window=skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_label}
        _, l = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss +=1       
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # average loss is an estimate of loss over the last 2000 batches
            print("average loss is at step %d: %f" %(step, average_loss))
            average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 #number of nearest neighbors
                nearest = (-sim[i,:]).argsort()[1:top_k+1]
                log = "Nearest to %s:" %valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s' %(log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
    

Initialized!!
average loss is at step 0: 1.000000
Nearest to called: eudoxia cmd liturgies cheat unlike smack starry intercal
Nearest to used: evading broaden wien cheeks exporter impinging naismith winston
Nearest to on: damascus trintignant ordinals valid audiences avars gender vilayet
Nearest to five: despatch miner spoofed irregulars appeasement cheaply cindy navin
Nearest to also: cracking deals potter exclusively va insect webcams roman
Nearest to UNK: champ smf deities ziibi raisins icmp measures filler
Nearest to its: constipation maguire centurion magnon zagreb domes forefathers peep
Nearest to have: inexpensive abode overlooked lecoq gorillaz granted machinima eurasian
Nearest to may: deceiver tricked seek kaolinite sotho armaments zamboanga hs
Nearest to he: fleshed tab neko partitions transmissible amalric royalist ppg
Nearest to up: sadducees bidirectional melvin valleys hydrochloric era tibetan att
Nearest to war: grimes fared magazine promiscuous milano neopagans microme

In [None]:
num_points = 400
