In [1]:
%matplotlib inline

In [2]:
import os
import collections
import random
import itertools
import logging
import zipfile
import gzip

from datetime import datetime
from contextlib import closing

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [3]:
def timestamp():
    """Return the current local date and time as an ISO 8601 formatted string."""
    now = datetime.now()
    return now.isoformat()

In [4]:
# Route log records of level DEBUG and above to the file `notebook.log`
# (relative to the working directory) instead of the notebook output.
logging.basicConfig(
    filename='notebook.log',
    level=logging.DEBUG,
    format='%(asctime)-6s: %(name)s - %(levelname)s - %(message)s',
)
# Named logger used by the training cell further down.
logger = logging.getLogger('02-text')

Reference: word2vec notebook from the Udacity Deep Learning course — https://github.com/wangz10/UdacityDeepLearning/blob/master/5_word2vec.ipynb

In [5]:
def read_wackypedia(files=None):
    """Lazily read documents from gzip-compressed index files.

    Every line of every file is parsed as whitespace-separated integers and
    yielded as one document. Values that are not strictly positive are dropped
    (presumably they mark tokens outside the vocabulary -- TODO confirm).
    A blank line, or a line with only non-positive values, yields an empty list.

    Args:
        files: iterable of paths to gzip files, read in the given order.
            Defaults to the four Wackypedia index files at their original
            location when omitted.

    Yields:
        list of int: the positive integers of one line, in their original order.

    Raises:
        OSError: if a file cannot be opened or is not valid gzip data.
        ValueError: if a line contains something that is not an integer.
    """
    if files is None:
        files = (
            '/home/dnm11/WaCky/wackypedia_en_indices/wackypedia_en1_indices.gz',
            '/home/dnm11/WaCky/wackypedia_en_indices/wackypedia_en2_indices.gz',
            '/home/dnm11/WaCky/wackypedia_en_indices/wackypedia_en3_indices.gz',
            '/home/dnm11/WaCky/wackypedia_en_indices/wackypedia_en4_indices.gz',
        )

    for file_name in files:
        # Text mode so each line arrives as `str` and can be split directly.
        with gzip.open(file_name, mode='rt') as f:
            for line in f:
                yield [token for token in map(int, line.split()) if token > 0]

In [6]:
# Peek at the first document: the positive integers found on the first line
# of the first index file.
next(read_wackypedia())

[60507, 3]

In [14]:
# Passed to Word2Vec in the training cell, where it sizes the embedding and
# NCE weight matrices and is the number of NCE classes.
# NOTE(review): where 161048 comes from is not shown in this notebook --
# confirm it is larger than every token id in the index files.
vocabulary_size = 161048

In [7]:
def skip_grams(documents, samples_per_target):
    """Yield ``(target, context)`` pairs for every token of every document.

    The context of a target is every other token position in the same
    document (there is no window). For each target, up to
    ``samples_per_target`` context positions are drawn without replacement
    using the global ``random`` module state, so call ``random.seed`` first
    for reproducible output. Documents with fewer than two tokens produce
    no pairs.

    Args:
        documents: iterable of documents, each an iterable of tokens.
        samples_per_target: maximum number of context tokens paired with
            each target.

    Yields:
        tuple: ``(target, context)``, targets in document order.

    Raises:
        ValueError: from ``random.sample`` if ``samples_per_target`` is
            negative and a document has at least two tokens.
    """
    for document in documents:
        tokens = list(document)
        context_size = len(tokens) - 1
        if context_size < 1:
            # Empty and single-token documents have nothing to pair with.
            continue

        sample_size = min(samples_per_target, context_size)
        for position, target in enumerate(tokens):
            # Draw positions in "the document without the target" rather than
            # copying that list for every target; a drawn position at or after
            # the target is shifted right by one to step over the target.
            for drawn in random.sample(range(context_size), sample_size):
                yield target, tokens[drawn if drawn < position else drawn + 1]

In [40]:
# Seed the sampler so the preview is reproducible, then show the first ten
# (target, context) pairs generated from the first two documents.
random.seed(2)
list(skip_grams(itertools.islice(read_wackypedia(), 2), samples_per_target=2))[:10]

[(60507, 3),
 (3, 60507),
 (60507, 356),
 (60507, 160),
 (11, 3980),
 (11, 3),
 (9, 2838),
 (9, 30),
 (356, 33),
 (356, 5)]

In [12]:
def generate_batches(input_, batch_size=128):
    """Yield consecutive chunks of ``input_`` as NumPy arrays.

    Each array is built from up to ``batch_size`` consecutive items of
    ``input_``; only the last array can be shorter. Nothing is yielded when
    ``input_`` is empty.
    """
    stream = iter(input_)

    chunk = list(itertools.islice(stream, batch_size))
    while chunk:
        yield np.array(chunk)
        # Pull the next chunk only once the consumer asks for more.
        chunk = list(itertools.islice(stream, batch_size))
        

In [13]:
# Seed the sampler so the preview is reproducible, then show the first batch
# of six (target, context) pairs as a two-column array.
random.seed(2)
next(generate_batches(skip_grams(read_wackypedia(), samples_per_target=2), batch_size=6))

array([[60507,     3],
       [    3, 60507],
       [60507,   356],
       [60507,   160],
       [   11,  3980],
       [   11,     3]])

In [16]:
class Word2Vec:
    """Word-embedding model trained with a noise-contrastive estimation (NCE) loss.

    Constructing an instance adds the variables, placeholders, loss and
    training op to the TensorFlow graph that is current at that moment (the
    cells below construct it inside ``with g.as_default():``).

    The order of the statements and the name scopes determine the variable
    names stored in checkpoints, so changing them can make previously saved
    checkpoints unrestorable.
    """

    def __init__(self, vocabulary_size, embedding_size=300, negative_samples=16):
        """Build the graph pieces and expose them as attributes.

        Args:
            vocabulary_size: number of rows of the embedding and NCE weight
                matrices; also passed to the NCE loss as ``num_classes``.
            embedding_size: number of columns of the embedding and NCE
                weight matrices.
            negative_samples: passed to ``tf.nn.nce_loss`` as ``num_sampled``.
        """
        # Embedding matrix [vocabulary_size, embedding_size], uniform in [-1, 1).
        self.embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), name='embeddings')

        with tf.name_scope('input'):
            # Row indices to look up; the shape is left unconstrained.
            self.batch_inputs = tf.placeholder(tf.int32, shape=None)
            # One label index per example, as a column: [batch, 1].
            self.batch_labels = tf.placeholder(tf.int32, shape=[None, 1])

            # Rows of `embeddings` selected by `batch_inputs`.
            self.batch_embeddings = tf.nn.embedding_lookup(self.embeddings, self.batch_inputs, name='embeddings')

        with tf.name_scope('NCE'):
            # Output-side weights, same shape as the embedding matrix,
            # initialised with stddev 1 / sqrt(embedding_size).
            self.nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / np.sqrt(embedding_size)),
                name='weights',
            )
            # One bias per vocabulary entry, initialised to zero.
            self.nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name='biases')

            # Scalar training loss: mean of the per-example NCE losses.
            self.loss = tf.reduce_mean(
                tf.nn.nce_loss(
                    weights=self.nce_weights,
                    biases=self.nce_biases,
                    labels=self.batch_labels,
                    inputs=self.batch_embeddings,
                    num_sampled=negative_samples,
                    num_classes=vocabulary_size,
                ),
                name='loss',
            )

            # Step counter, starting at 1 and excluded from training; it is
            # handed to `minimize` below, which advances it on every update.
            self.global_step = tf.Variable(1, name='global_step', trainable=False)
            #self.optimizer = tf.train.AdamOptimizer(1.0).minimize(self.loss, global_step=self.global_step)
            # Training op: plain gradient descent with a fixed learning rate of 1.0.
            self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.loss, global_step=self.global_step)

In [None]:
# Train the model for ten passes over the corpus, writing loss summaries on
# every step and a checkpoint at the end of every epoch.
g = tf.Graph()
ts = timestamp()
log_dir = 'notebook_runs'
# NOTE(review): nothing in the visible notebook writes this metadata file --
# confirm it is produced elsewhere before opening the projector.
metadata_path = os.path.join(log_dir, 'metadata.tsv')

batch_size = 128

with g.as_default():
    w2v = Word2Vec(vocabulary_size, embedding_size=300)

    init_op = tf.global_variables_initializer()

    # The loss is the only summary registered, so `summary_op` carries just it.
    tf.summary.scalar(w2v.loss.op.name, w2v.loss)
    summary_op = tf.summary.merge_all()

    # Describe the embedding variable (and its metadata file) for the
    # TensorBoard embedding projector.
    config = projector.ProjectorConfig()
    projector_embedding = config.embeddings.add()
    projector_embedding.tensor_name = w2v.embeddings.name
    projector_embedding.metadata_path = metadata_path

    saver = tf.train.Saver()

# `closing` makes sure the summary writer is closed when the block exits.
# Each run gets its own timestamped sub-directory of `log_dir`.
with tf.Session(graph=g) as sess, \
     closing(tf.summary.FileWriter(os.path.join(log_dir, 'w2v-skipgram-{}'.format(ts)), sess.graph)) as train_summary_writer:

    sess.run(init_op)
    #saver.restore(sess, 'notebook_runs/model.ckpt-74737')

    projector.visualize_embeddings(train_summary_writer, config)
    
    for epoch in range(10):
        logger.info('Epoch: %s', epoch)

        # Fresh generators every epoch: the corpus is re-read from disk and
        # the context tokens are re-sampled.
        batches = generate_batches(
            skip_grams(read_wackypedia(), samples_per_target=5),
            batch_size=batch_size,
        )

        # NOTE(review): `i` is not used in the loop body.
        for i, batch in enumerate(batches):
            if len(batch) != batch_size:
                logger.debug('The batch size of %s is smaller than expected %s.', len(batch), batch_size)

            # Column 0 holds the targets; column 1 holds the context tokens,
            # kept two-dimensional to match the [None, 1] labels placeholder.
            x = batch[:, 0]
            y = batch[:, 1, np.newaxis]

            _, summary, current_step = sess.run(
                [w2v.optimizer, summary_op, w2v.global_step],
                feed_dict={
                    w2v.batch_inputs: x,
                    w2v.batch_labels: y,
                },
            )
            train_summary_writer.add_summary(summary, current_step)
            
        # NOTE(review): `current_step` is unbound here if the very first epoch
        # produced no batches -- confirm the corpus is never empty.
        # The step is passed as the saver's global step, giving checkpoint
        # names of the form `model.ckpt-<step>`.
        logger.info('Step: %s, saving the model', current_step)
        saver.save(sess, os.path.join(log_dir, "model.ckpt"), current_step)

In [None]:
# Restore a trained checkpoint and compute the cosine similarity of the
# embeddings for every (word1, word2) pair in the index of `simlex`.
# NOTE(review): `word_index` and `simlex` are not defined in the visible part
# of this notebook -- confirm where they are loaded before running this cell.
g = tf.Graph()

with g.as_default():
    # NOTE(review): the training cell sizes the model with `vocabulary_size`;
    # restoring presumably requires len(word_index) to be the same -- confirm.
    w2v = Word2Vec(len(word_index), embedding_size=300)

    column_a = tf.placeholder(tf.int32, shape=None, name='column_a')
    column_b = tf.placeholder(tf.int32, shape=None, name='column_b')
    
    # Scale every embedding row to unit Euclidean length.
    norm = tf.sqrt(tf.reduce_sum(tf.square(w2v.embeddings), 1, keep_dims=True), name='norm')
    normalized_embeddings = w2v.embeddings / norm

    embeddings_a = tf.nn.embedding_lookup(normalized_embeddings, column_a, name='embeddings_a')
    embeddings_b = tf.nn.embedding_lookup(normalized_embeddings, column_b, name='embeddings_b')
    
    # Row-wise dot product of unit vectors, i.e. cosine similarity, kept as a
    # column (one value per pair).
    similarity = tf.reduce_sum(embeddings_a * embeddings_b, 1, keep_dims=True)
    saver = tf.train.Saver()
    
with tf.Session(graph=g) as sess:
    saver.restore(sess, 'notebook_runs/model.ckpt-74737')
    
    # `word_index` is looked up with the UTF-8 encoded bytes of each word.
    x = [word_index[w.encode()] for w in  simlex.index.get_level_values('word1')]
    y = [word_index[w.encode()] for w in simlex.index.get_level_values('word2')]
    
    similarity_estimates = sess.run(similarity, feed_dict={column_a: x, column_b: y})
# Flatten the column of similarities and align it with the index of `simlex`.
similarity_estimates = pd.Series(similarity_estimates.flatten(), index=simlex.index, name='Word2Vec')