In [1]:
import collections
import os
import random
import requests
import shutil
import zipfile

import tensorflow as tf
import numpy as np

tf.logging.set_verbosity(tf.logging.ERROR)

tf.VERSION

'1.2.1'

## Download

In [2]:
# Local layout: everything lives under HOME_DIR, raw downloads under DATA_DIR.
HOME_DIR = 'wikipedia'
DATA_DIR = os.path.join(HOME_DIR, 'data')

os.makedirs(DATA_DIR, exist_ok=True)

TEXT_URL = 'http://mattmahoney.net/dc/text8.zip'
TEXT_FILENAME = TEXT_URL.split('/')[-1]
TEXT_FILE = os.path.join(DATA_DIR, TEXT_FILENAME)

text_missing = not os.path.isfile(TEXT_FILE)

if text_missing:
    print('Downloading {}...'.format(TEXT_FILENAME))
    # Stream into a temporary sibling file and rename it only once the
    # download finished. TEXT_FILE therefore never holds a truncated archive
    # or an HTTP error page, which a later run would mistake for a complete
    # download (the check above only tests that the file exists).
    partial_file = TEXT_FILE + '.part'
    r = requests.get(TEXT_URL, stream=True, timeout=60)
    try:
        # Fail loudly on 4xx/5xx instead of saving the error body to disk.
        r.raise_for_status()
        with open(partial_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=32768):
                if chunk:
                    f.write(chunk)
    finally:
        r.close()
    os.replace(partial_file, TEXT_FILE)
    print('Done!')

## Vocabulary

In [3]:
def load_raw_text_from_zip(file):
    """Return the first member of a zip archive decoded as UTF-8 text.

    Args:
        file: anything `zipfile.ZipFile` accepts (a path or a file-like object).

    Returns:
        The decoded contents of the first entry listed by the archive.
        Any other entries are ignored.
    """
    with zipfile.ZipFile(file) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    return raw_bytes.decode('utf-8')

raw_text = load_raw_text_from_zip(TEXT_FILE)

print('{}...\n\n({:,d} chars)\n\n...{}'.format(
    raw_text[:1000], len(raw_text) - 2000, raw_text[-1000:]))

 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic instituti

In [4]:
words = raw_text.split()
words_freq = collections.Counter(words).most_common()

print('Words (total):\n\n{:,d}\n'.format(len(words)))
print('Words (unique):\n\n{:,d}\n'.format(len(words_freq)))
print('Most common:\n')
for word, freq in words_freq[:20]:
    print('{} ({:,d})'.format(word, freq))
print('\nLeast common:\n')
for word, freq in words_freq[-20:]:
    print('{} ({:,d})'.format(word, freq))

Words (total):

17,005,207

Words (unique):

253,854

Most common:

the (1,061,396)
of (593,677)
and (416,629)
one (411,764)
in (372,201)
a (325,873)
to (316,376)
zero (264,975)
nine (250,430)
two (192,644)
is (183,153)
as (131,815)
eight (125,285)
for (118,445)
s (116,710)
five (115,789)
three (114,775)
was (112,807)
by (111,831)
that (109,510)

Least common:

jebe (1)
mncs (1)
intitially (1)
privolnoye (1)
gennadi (1)
gorbachyova (1)
democratised (1)
clandenstine (1)
buildups (1)
gorby (1)
kajn (1)
gorbacheva (1)
mikhailgorbachev (1)
englander (1)
workmans (1)
erniest (1)
metzada (1)
metzuda (1)
fretensis (1)
exortation (1)


In [5]:
words_10plus = sum(1 for _, freq in words_freq if freq >= 10)

print('Words 10+: {:,d}'.format(words_10plus))

Words 10+: 47,134


In [6]:
vocabulary_size = 50_000

words_freq[vocabulary_size - 1]

('aggadic', 9)

In [7]:
words_vocab = words_freq[:(vocabulary_size-1)]

print('Words for the vocabulary: {:,d}'.format(len(words_vocab)))

Words for the vocabulary: 49,999


In [8]:
UNK_ID = 0
word_to_id = dict((word, word_id) for word_id, (word, _) in enumerate(words_vocab, UNK_ID+1))
word_to_id['UNK'] = UNK_ID
word_from_id = dict((word_id, word) for word, word_id in word_to_id.items())

print('Vocabulary size: {:d}'.format(len(word_to_id)))

Vocabulary size: 50000


In [9]:
words_to_unk = words_freq[(vocabulary_size-1):]
unk_freq = sum(freq for _, freq in words_to_unk)

print('UNK words: {:,d}'.format(len(words_to_unk)))
print('UNK frequency: {:,d}'.format(unk_freq))

UNK words: 203,855
UNK frequency: 418,391


In [10]:
VOCABULARY_FILE = os.path.join(HOME_DIR, 'vocabulary.txt')

with open(VOCABULARY_FILE, 'w') as f:
    for word_id in range(vocabulary_size):
        f.write(word_from_id[word_id] + '\n')

print('Vocabulary file size: {:,d} bytes'.format(os.stat(VOCABULARY_FILE).st_size))

Vocabulary file size: 418,684 bytes


In [11]:
with open(VOCABULARY_FILE, newline='') as f:
    word_from_id_ = dict((word_id, word.strip()) for word_id, word in enumerate(f))
    word_to_id_ = dict((word, word_id) for word_id, word in word_from_id_.items())

# print(word_from_id_)
print('Vocabulary size: {:,d}'.format(len(word_to_id_)))
assert word_to_id_ == word_to_id
assert word_from_id_ == word_from_id
del word_to_id_, word_from_id_

Vocabulary size: 50,000


In [12]:
data = list(word_to_id.get(word, UNK_ID) for word in words)

print('Size:\n\n{:,d}\n'.format(len(data)))
print('Text (IDs):\n\n{}\n'.format(data[:10]))
print('Text (Words):\n\n{}'.format(list(word_from_id[word_id] for word_id in data[:10])))

Size:

17,005,207

Text (IDs):

[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

Text (Words):

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


## CBOW

**Input**

In [13]:
def context_window(window_words, target_index):
    """Return the window's words as a list with the target position removed.

    Args:
        window_words: iterable with the word ids of the current window.
        target_index: position of the target word inside the window.

    Returns:
        A new list holding every word of the window except the target.
    """
    words = list(window_words)
    del words[target_index]
    return words

def input_cbow(data, batch_size, window_size):
    """Generate CBOW training batches by sliding a window over `data`.

    The window advances one word at a time. For each position the middle
    word is the label and the remaining words are the context.

    Args:
        data: sequence of word ids (must support `len` and slicing).
        batch_size: number of windows per yielded batch.
        window_size: odd window length, at least 3
            ((n words left) target (n words right)).

    Yields:
        `(batch, labels)` where `batch` is an int32 array of shape
        `(batch_size, window_size - 1)` with the context ids and `labels`
        is an int32 array of shape `(batch_size, 1)` with the target ids.
        Windows that do not fill a complete batch are dropped.

    Raises:
        Exception: if `window_size` is even, smaller than 3, or larger than
            `(len(data) - batch_size) / 2`.
    """
    if window_size % 2 == 0 or window_size < 3 \
        or window_size > (len(data) - batch_size) / 2:
        # {window_size} must be odd: (n words left) target (n words right)
        raise Exception(
            'Invalid parameters: window_size must be a small odd number')

    num_words = len(data)
    num_windows = num_words - window_size + 1
    num_batches = num_windows // batch_size
    target_index = window_size // 2

    # `words` holds the ids still to be shifted into the window; the bounded
    # deque drops its oldest id automatically on every append.
    words = collections.deque(data[window_size:])
    window_words = collections.deque(data[:window_size], maxlen=window_size)

    for n in range(num_batches):
        # Every cell is assigned below, so uninitialised storage is fine.
        batch = np.empty(shape=(batch_size, window_size-1), dtype=np.int32)
        labels = np.empty(shape=(batch_size, 1), dtype=np.int32)

        for i in range(batch_size):
            batch[i, :] = context_window(window_words, target_index)
            labels[i, 0] = window_words[target_index]
            # There is one fewer remaining word than there are windows, so
            # after the very last window there is nothing left to shift in.
            # Popping unconditionally raised IndexError whenever num_windows
            # was a multiple of batch_size, losing the final batch.
            if words:
                window_words.append(words.popleft())

        yield batch, labels

In [14]:
batch_size = 2
window_size = 3
num_iters = 2
num_words = window_size + num_iters * batch_size - 1
text = ' '.join(word_from_id[word_id] for word_id in data[:num_words])
print('Text\n\n', text, '\n')

data_iter = input_cbow(data, batch_size, window_size)
for k in range(1, num_iters+1):
    print('Batch {}\n'.format(k))
    batch_context, batch_target = next(data_iter)
    for i in range(batch_size):
        context_words = ', '.join(
            word_from_id[word_id] for word_id in batch_context[i, :])
        target_word = word_from_id[batch_target[i, 0]]
        print('[{}] -> {}'.format(context_words, target_word))
    print()

Text

 anarchism originated as a term of 

Batch 1

[anarchism, as] -> originated
[originated, a] -> as

Batch 2

[as, term] -> a
[a, of] -> term



**Model**

In [15]:
# Scratch graph + interactive session used by the step-by-step cells below,
# which call `.eval()` / `.run()` without passing a session explicitly.
graph = tf.Graph()
# NOTE(review): as_default() is called without `with`, so the context manager
# it returns is never entered here; this line on its own presumably has no
# effect and the graph is selected via `InteractiveSession(graph=graph)`
# below -- confirm against the TensorFlow docs before relying on it.
graph.as_default()
session = tf.InteractiveSession(graph=graph)
session

<tensorflow.python.client.session.InteractiveSession at 0x7ff0dc14b358>

In [16]:
batch_size = 4
context_size = 2
vocabulary_size = 20
embedding_size = 3
num_sampled = 2

In [17]:
X = tf.constant(np.random.randint(low=0,
                                  high=vocabulary_size,
                                  size=(batch_size, context_size),
                                  dtype=np.int32))

print(X, '\n')
print(X.eval())

Tensor("Const:0", shape=(4, 2), dtype=int32) 

[[ 4 18]
 [15  3]
 [19 19]
 [15 14]]


In [18]:
y = tf.constant(np.random.randint(low=0,
                                  high=vocabulary_size,
                                  size=(batch_size, 1),
                                  dtype=np.int32))

print(y, '\n')
print(y.eval())

Tensor("Const_1:0", shape=(4, 1), dtype=int32) 

[[17]
 [14]
 [ 2]
 [17]]


In [19]:
# ~ tf.random_uniform(shape=(vocabulary_size, embedding_size),
#                     minval=-1.0, maxval=1.0)
embeddings = tf.Variable(
    2 * np.random.rand(vocabulary_size, embedding_size) - 1, dtype=tf.float32)

embeddings.initializer.run()

print(embeddings, '\n')
print(embeddings.eval())

<tf.Variable 'Variable:0' shape=(20, 3) dtype=float32_ref> 

[[ 0.97920001  0.11915912 -0.65950805]
 [-0.7181164   0.21858613  0.84338182]
 [-0.74519354 -0.35278967  0.07332963]
 [ 0.1848488  -0.96011204 -0.79462063]
 [-0.30188584  0.01549303  0.77631754]
 [ 0.1540166   0.73346096  0.51200521]
 [ 0.24492104  0.17639561 -0.75872827]
 [-0.46113685  0.63823926  0.17274176]
 [ 0.38870367 -0.39054361 -0.35646498]
 [ 0.99312258  0.13203035  0.80822062]
 [-0.34097305  0.71183199  0.47748134]
 [-0.22796054  0.4742429  -0.39534894]
 [-0.84676147  0.60309488 -0.90840894]
 [ 0.90246803  0.70090604 -0.38884836]
 [ 0.14717796 -0.46216559  0.01172501]
 [ 0.24660414  0.91782361  0.01999395]
 [-0.23404419 -0.76223528 -0.18675013]
 [-0.44710794  0.9421941   0.91397119]
 [ 0.95795065  0.10389818 -0.76700127]
 [-0.265735   -0.09574403  0.88250047]]


In [20]:
X_embed = tf.nn.embedding_lookup(embeddings, X)

print(X_embed, '\n')
print(X_embed.eval())

Tensor("embedding_lookup:0", shape=(4, 2, 3), dtype=float32) 

[[[-0.30188584  0.01549303  0.77631754]
  [ 0.95795065  0.10389818 -0.76700127]]

 [[ 0.24660414  0.91782361  0.01999395]
  [ 0.1848488  -0.96011204 -0.79462063]]

 [[-0.265735   -0.09574403  0.88250047]
  [-0.265735   -0.09574403  0.88250047]]

 [[ 0.24660414  0.91782361  0.01999395]
  [ 0.14717796 -0.46216559  0.01172501]]]


In [21]:
X_avg = tf.reduce_mean(X_embed, axis=1)

print(X_avg, '\n')
print(X_avg.eval())

Tensor("Mean:0", shape=(4, 3), dtype=float32) 

[[ 0.3280324   0.05969561  0.00465813]
 [ 0.21572646 -0.02114421 -0.38731334]
 [-0.265735   -0.09574403  0.88250047]
 [ 0.19689105  0.22782901  0.01585948]]


In [22]:
c0_w0 = X_embed[0,:,0].eval()
print('first dimension of each verctor of first context:\n\n', c0_w0, '\n')
print('first dimension avarage:\n\n', np.mean(c0_w0))

first dimension of each verctor of first context:

 [-0.30188584  0.95795065] 

first dimension avarage:

 0.328032


In [23]:
# ~ tf.truncated_normal(shape=(vocabulary_size, embedding_size),
#                       stddev=1.0 / np.sqrt(embedding_size))
W = tf.Variable(
    np.random.randn(vocabulary_size, embedding_size) / np.sqrt(embedding_size),
    dtype=tf.float32)

W.initializer.run()

print(W)

<tf.Variable 'Variable_1:0' shape=(20, 3) dtype=float32_ref>


In [24]:
b = tf.Variable(np.zeros(vocabulary_size), dtype=tf.float32)

b.initializer.run()

print(b)

<tf.Variable 'Variable_2:0' shape=(20,) dtype=float32_ref>


In [25]:
sampled_loss = tf.nn.sampled_softmax_loss(weights=W,
                                          biases=b,
                                          inputs=X_avg,
                                          labels=y,
                                          num_sampled=num_sampled,
                                          num_classes=vocabulary_size)

print(sampled_loss, '\n')
print(sampled_loss.eval())

Tensor("Reshape_2:0", shape=(4,), dtype=float32) 

[ 0.91412306  1.03236949  0.65414065  0.87589288]


In [26]:
loss = tf.reduce_mean(sampled_loss)

print(loss, '\n')
print(loss.eval())

Tensor("Mean_1:0", shape=(), dtype=float32) 

0.993677


In [27]:
session.close()
del X, y, embeddings, X_embed, X_avg, c0_w0, W, b, sampled_loss, loss
del graph, session

In [28]:
def model_cbow(vocabulary_size, embedding_size, num_sampled):
    """Build the CBOW graph in the current default graph.

    Context word ids are looked up in an embedding table, averaged over the
    context axis and fed to a sampled-softmax loss that predicts the target.

    Args:
        vocabulary_size: number of rows of the embedding and softmax tables.
        embedding_size: number of columns of both tables.
        num_sampled: `num_sampled` argument of `tf.nn.sampled_softmax_loss`.

    Returns:
        A tuple `(X, y, normalized_embeddings, loss)`:
        `X` is the context placeholder of shape `(None, None)`, `y` the target
        placeholder of shape `(None, 1)` (both default to `[[0]]`),
        `normalized_embeddings` is the embedding table divided row-wise by
        its norm, and `loss` is the mean of the sampled-softmax loss.
    """
    table_shape = (vocabulary_size, embedding_size)

    X = tf.placeholder_with_default([[0]], shape=(None, None), name='X')
    y = tf.placeholder_with_default([[0]], shape=(None, 1), name='y')

    # Embedding table, initialised uniformly in [-1, 1).
    initial_embeddings = tf.random_uniform(
        shape=table_shape, minval=-1.0, maxval=1.0)
    embeddings = tf.Variable(initial_embeddings, name='embeddings')

    # One vector per example: the mean of its context embeddings.
    context_mean = tf.reduce_mean(
        tf.nn.embedding_lookup(embeddings, X), axis=1)

    initial_weights = tf.truncated_normal(
        shape=table_shape, stddev=1.0 / np.sqrt(embedding_size))
    softmax_weights = tf.Variable(initial_weights, name='W')
    softmax_biases = tf.Variable(tf.zeros(shape=(vocabulary_size,)), name='b')

    with tf.name_scope('loss'):
        per_example_loss = tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            inputs=context_mean,
            labels=y,
            num_sampled=num_sampled,
            num_classes=vocabulary_size)
        loss = tf.reduce_mean(per_example_loss, name='mean')

    # keep_dims=True keeps the norm as a column so the division broadcasts
    # across each row.
    row_norms = tf.norm(embeddings, axis=1, keep_dims=True)

    return X, y, embeddings / row_norms, loss

In [29]:
batch_size = 4
context_size = 2
vocabulary_size = 20
embedding_size = 3
num_sampled = 2

with tf.Graph().as_default() as graph, \
    tf.Session(graph=graph) as session:

    X, y, embeddings, loss_op = model_cbow(vocabulary_size,
                                           embedding_size,
                                           num_sampled)

    tf.global_variables_initializer().run()

    X_batch = np.random.randint(low=0,
                                high=vocabulary_size,
                                size=(batch_size, context_size),
                                dtype=np.int32)
    y_batch = np.random.randint(low=0,
                                high=vocabulary_size,
                                size=(batch_size, 1),
                                dtype=np.int32)
    feed_data = {X: X_batch, y: y_batch}

    loss, embeddings_ = session.run([loss_op, embeddings], feed_dict=feed_data)

    print('Avarage loss: {:,.3f}\n'.format(loss))
    print(embeddings_)

Avarage loss: 0.659

[[-0.35204613 -0.49285588 -0.79571134]
 [ 0.90027946  0.00527231 -0.43528083]
 [-0.45081681 -0.8457576   0.28540915]
 [ 0.93553668 -0.06053047  0.34800476]
 [ 0.62018192  0.77692211 -0.10847333]
 [-0.05211216  0.72057694  0.691414  ]
 [ 0.85853469 -0.40509552 -0.31434992]
 [ 0.65642017 -0.38012969  0.65162414]
 [ 0.25981882 -0.48941576  0.8324461 ]
 [ 0.02589937 -0.97443479 -0.22317269]
 [-0.81627828  0.2518996  -0.51984262]
 [ 0.69813287  0.71491545  0.03881122]
 [-0.76648676  0.62046701  0.16588779]
 [-0.03354356  0.7318964  -0.6805898 ]
 [-0.8009752   0.15727918 -0.5776695 ]
 [ 0.07402308 -0.83239329  0.54921955]
 [ 0.16653079 -0.84062201  0.51538551]
 [-0.01632886 -0.62661898  0.77915466]
 [ 0.73106146 -0.66975462  0.13029923]
 [ 0.16645701  0.67231679  0.72130591]]


## Skip-gram Model

In [30]:
def context_window(window_words, target_index):
    """Return the window's words as a list with the target position removed."""
    words = list(window_words)
    del words[target_index]
    return words

def context_sample(context_words, sample_size):
    """Pick `sample_size` distinct context words at random."""
    return random.sample(context_words, sample_size)

def context_skips(window_words, target_index, sample_size, use_sample):
    """Return the context words to pair with the window's target word.

    Args:
        window_words: iterable with the word ids of the current window.
        target_index: position of the target word inside the window.
        sample_size: how many context words to keep when sampling.
        use_sample: when true, keep a random sample of `sample_size` context
            words; otherwise keep all of them.

    Returns:
        A list of context word ids.
    """
    words = context_window(window_words, target_index)
    if use_sample:
        words = context_sample(words, sample_size)
    return words

def input_skip_gram(data, batch_size, window_size, num_skips):
    """Generate skip-gram training batches by sliding a window over `data`.

    Each window contributes `num_skips` (target, context) pairs, where the
    target is the middle word of the window. Pairs of one window are emitted
    from the last context word to the first (they are popped off the end of
    the list). A batch may span several windows.

    Args:
        data: sequence of word ids (must support `len` and slicing).
        batch_size: number of pairs per yielded batch.
        window_size: odd window length, at least 3.
        num_skips: pairs generated per window, at most `window_size - 1`.
            When smaller than that, the context words are sampled at random.

    Yields:
        `(batch, labels)` where `batch` is an int32 array of shape
        `(batch_size,)` with target ids and `labels` is an int32 array of
        shape `(batch_size, 1)` with the matching context ids. Pairs that do
        not fill a complete batch are dropped.

    Raises:
        Exception: if `window_size` is even, smaller than 3, or larger than
            `(len(data) - batch_size) / 2`, or if `num_skips` exceeds
            `window_size - 1`.
    """
    if window_size % 2 == 0 or window_size < 3 \
        or window_size > (len(data) - batch_size) / 2:
        # {window_size} must be odd: (n words left) target (n words right)
        raise Exception(
            'Invalid parameters: window_size must be a small odd number')
    if num_skips > window_size - 1:
        # It is not possible to generate {num_skips} different pairs
        # with the second word coming from {window_size - 1} words.
        raise Exception(
            'Invalid parameters: num_skips={}, window_size={}'.format(
                num_skips, window_size))

    num_words = len(data)
    num_windows = num_words - window_size + 1
    num_batches = num_windows * num_skips // batch_size
    target_index = window_size // 2
    use_sample = num_skips < window_size - 1

    # `words` holds the ids still to be shifted into the window; the bounded
    # deque drops its oldest id automatically on every append.
    words = collections.deque(data[window_size:])
    window_words = collections.deque(data[:window_size], maxlen=window_size)
    target_word = window_words[target_index]
    context_words = context_skips(window_words,
                                  target_index,
                                  num_skips,
                                  use_sample)

    for n in range(num_batches):
        # Every cell is assigned below, so uninitialised storage is fine.
        batch = np.empty(shape=(batch_size,), dtype=np.int32)
        labels = np.empty(shape=(batch_size, 1), dtype=np.int32)

        for i in range(batch_size):
            batch[i] = target_word
            labels[i, 0] = context_words.pop()
            # Move to the next window once this one is used up. After the
            # last window there is no word left to shift in; popping anyway
            # raised IndexError whenever the batches consumed every pair
            # exactly, losing the final batch.
            if not context_words and words:
                window_words.append(words.popleft())
                target_word = window_words[target_index]
                context_words = context_skips(window_words,
                                              target_index,
                                              num_skips,
                                              use_sample)

        yield batch, labels

In [31]:
batch_size = 2
window_size = 3
num_skips = 2
num_iters = 2
num_words = window_size + num_iters * batch_size // num_skips - 1
text = ' '.join(word_from_id[word_id] for word_id in data[:num_words])
print('Text\n\n', text, '\n')

data_iter = input_skip_gram(data, batch_size, window_size, num_skips)
for k in range(1, num_iters+1):
    print('Batch {}\n'.format(k))
    batch_target, batch_context = next(data_iter)
    for i in range(batch_size):
        target_word = word_from_id[batch_target[i]]
        context_word = word_from_id[batch_context[i, 0]]
        print('{} -> {}'.format(target_word, context_word))
    print()


Text

 anarchism originated as a 

Batch 1

originated -> as
originated -> anarchism

Batch 2

as -> a
as -> originated



**Model**

In [32]:
# Scratch graph + interactive session used by the step-by-step cells below,
# which call `.eval()` / `.run()` without passing a session explicitly.
graph = tf.Graph()
# NOTE(review): as_default() is called without `with`, so the context manager
# it returns is never entered here; this line on its own presumably has no
# effect and the graph is selected via `InteractiveSession(graph=graph)`
# below -- confirm against the TensorFlow docs before relying on it.
graph.as_default()
session = tf.InteractiveSession(graph=graph)
session

<tensorflow.python.client.session.InteractiveSession at 0x7ff0da6b45c0>

In [33]:
batch_size = 4
vocabulary_size = 20
embedding_size = 3
num_sampled = 2

In [34]:
X = tf.constant(np.random.randint(low=0,
                                  high=vocabulary_size,
                                  size=(batch_size,),
                                  dtype=np.int32))

print(X, '\n')
print(X.eval())

Tensor("Const:0", shape=(4,), dtype=int32) 

[ 1  2  8 15]


In [35]:
y = tf.constant(np.random.randint(low=0,
                                  high=vocabulary_size,
                                  size=(batch_size, 1),
                                  dtype=np.int32))

print(y, '\n')
print(y.eval())

Tensor("Const_1:0", shape=(4, 1), dtype=int32) 

[[ 6]
 [10]
 [19]
 [19]]


In [36]:
# ~ tf.random_uniform(shape=(vocabulary_size, embedding_size),
#                     minval=-1.0, maxval=1.0)
embeddings = tf.Variable(
    2 * np.random.rand(vocabulary_size, embedding_size) - 1, dtype=tf.float32)

embeddings.initializer.run()

print(embeddings, '\n')
print(embeddings.eval())

<tf.Variable 'Variable:0' shape=(20, 3) dtype=float32_ref> 

[[ 0.72141814  0.21209571 -0.17463139]
 [ 0.28396884 -0.6399644  -0.91048425]
 [ 0.28922462 -0.25126225  0.62883925]
 [ 0.68827802 -0.50261569 -0.93463796]
 [-0.61851805 -0.96802092 -0.30891985]
 [-0.12337061  0.03989112  0.88043302]
 [-0.81038785  0.26255792  0.47203571]
 [ 0.07161405  0.03550801 -0.09085903]
 [-0.85476762 -0.97345501 -0.93148029]
 [-0.9334175   0.21117514 -0.40872216]
 [ 0.78244454 -0.30483389  0.49863824]
 [ 0.79928863 -0.77424914  0.36524931]
 [ 0.02280475 -0.73790503  0.23547477]
 [-0.58185732  0.832335    0.73280942]
 [-0.57512617 -0.74317229 -0.02220738]
 [-0.83003467  0.28576311 -0.5987463 ]
 [ 0.14718378  0.1985718   0.1044677 ]
 [ 0.30989203 -0.85167205  0.89571708]
 [ 0.85655212 -0.36275253  0.40441647]
 [-0.42263338 -0.39505187 -0.7440151 ]]


In [37]:
X_embed = tf.nn.embedding_lookup(embeddings, X)

print(X_embed, '\n')
print(X_embed.eval())

Tensor("embedding_lookup:0", shape=(4, 3), dtype=float32) 

[[ 0.28396884 -0.6399644  -0.91048425]
 [ 0.28922462 -0.25126225  0.62883925]
 [-0.85476762 -0.97345501 -0.93148029]
 [-0.83003467  0.28576311 -0.5987463 ]]


In [38]:
# ~ tf.truncated_normal(shape=(vocabulary_size, embedding_size),
#                       stddev=1.0 / np.sqrt(embedding_size))
W = tf.Variable(
    np.random.randn(vocabulary_size, embedding_size) / np.sqrt(embedding_size),
    dtype=tf.float32)

W.initializer.run()

print(W)

<tf.Variable 'Variable_1:0' shape=(20, 3) dtype=float32_ref>


In [39]:
b = tf.Variable(np.zeros(vocabulary_size), dtype=tf.float32)

b.initializer.run()

print(b)

<tf.Variable 'Variable_2:0' shape=(20,) dtype=float32_ref>


In [40]:
sampled_loss = tf.nn.sampled_softmax_loss(weights=W,
                                          biases=b,
                                          inputs=X_embed,
                                          labels=y,
                                          num_sampled=num_sampled,
                                          num_classes=vocabulary_size)

print(sampled_loss, '\n')
print(sampled_loss.eval())

Tensor("Reshape_2:0", shape=(4,), dtype=float32) 

[ 1.51178932  0.79882336  0.91567236  0.64354497]


In [41]:
loss = tf.reduce_mean(sampled_loss)

print(loss, '\n')
print(loss.eval())

Tensor("Mean:0", shape=(), dtype=float32) 

0.907835


In [42]:
session.close()
del X, y, embeddings, X_embed, W, b, sampled_loss, loss
del graph, session

In [43]:
def model_skip_gram(vocabulary_size, embedding_size, num_sampled):
    """Build the skip-gram graph in the current default graph.

    Target word ids are looked up in an embedding table and fed to a
    sampled-softmax loss that predicts a context word.

    Args:
        vocabulary_size: number of rows of the embedding and softmax tables.
        embedding_size: number of columns of both tables.
        num_sampled: `num_sampled` argument of `tf.nn.sampled_softmax_loss`.

    Returns:
        A tuple `(X, y, normalized_embeddings, loss)`:
        `X` is the target placeholder of shape `(None,)` (default `[0]`),
        `y` the context placeholder of shape `(None, 1)` (default `[[0]]`),
        `normalized_embeddings` is the embedding table divided row-wise by
        its norm, and `loss` is the mean of the sampled-softmax loss.
    """
    table_shape = (vocabulary_size, embedding_size)

    X = tf.placeholder_with_default([0], shape=(None,), name='X')
    y = tf.placeholder_with_default([[0]], shape=(None, 1), name='y')

    # Embedding table, initialised uniformly in [-1, 1).
    initial_embeddings = tf.random_uniform(
        shape=table_shape, minval=-1.0, maxval=1.0)
    embeddings = tf.Variable(initial_embeddings, name='embeddings')

    target_vectors = tf.nn.embedding_lookup(embeddings, X)

    initial_weights = tf.truncated_normal(
        shape=table_shape, stddev=1.0 / np.sqrt(embedding_size))
    softmax_weights = tf.Variable(initial_weights, name='W')
    softmax_biases = tf.Variable(tf.zeros(shape=(vocabulary_size,)), name='b')

    with tf.name_scope('loss'):
        per_example_loss = tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            inputs=target_vectors,
            labels=y,
            num_sampled=num_sampled,
            num_classes=vocabulary_size)
        loss = tf.reduce_mean(per_example_loss, name='mean')

    # keep_dims=True keeps the norm as a column so the division broadcasts
    # across each row.
    row_norms = tf.norm(embeddings, axis=1, keep_dims=True)

    return X, y, embeddings / row_norms, loss

In [44]:
batch_size = 4
vocabulary_size = 20
embedding_size = 3
num_sampled = 2

with tf.Graph().as_default() as graph, \
    tf.Session(graph=graph) as session:

    X, y, embeddings, loss_op = model_skip_gram(vocabulary_size,
                                                embedding_size,
                                                num_sampled)

    tf.global_variables_initializer().run()

    X_batch = np.random.randint(low=0,
                                high=vocabulary_size,
                                size=(batch_size,),
                                dtype=np.int32)
    y_batch = np.random.randint(low=0,
                                high=vocabulary_size,
                                size=(batch_size, 1),
                                dtype=np.int32)
    feed_data = {X: X_batch, y: y_batch}

    loss, embeddings_ = session.run([loss_op, embeddings], feed_dict=feed_data)

    print('Avarage loss: {:,.3f}\n'.format(loss))
    print(embeddings_)

Avarage loss: 0.848

[[ 0.36134785 -0.71822989  0.59462047]
 [ 0.3036077   0.49467725 -0.81431985]
 [-0.24205504  0.0727653   0.96753013]
 [ 0.49863282  0.29093915 -0.8165291 ]
 [ 0.87326252 -0.36429736 -0.32357374]
 [ 0.47323585 -0.74552345  0.46930027]
 [-0.52143198  0.1009413   0.8473013 ]
 [-0.24573667  0.50146824  0.82954383]
 [-0.43916667 -0.76209062  0.47576305]
 [ 0.47012448  0.49093866  0.73345912]
 [ 0.51530111 -0.67711431  0.52533883]
 [ 0.87959242  0.45998335 -0.12137818]
 [ 0.45687237 -0.80557758 -0.37724325]
 [-0.073541    0.7254191   0.68436748]
 [-0.59740579 -0.767735   -0.23170929]
 [ 0.78319734  0.52396584 -0.3347564 ]
 [ 0.85303044 -0.51144564  0.10374222]
 [-0.52321196 -0.6087727  -0.59635991]
 [ 0.58028489 -0.66106236 -0.47567412]
 [-0.90289181 -0.34156856 -0.260993  ]]


## Nearest Neighbors

In [45]:
# Scratch graph + interactive session used by the step-by-step cells below,
# which call `.eval()` without passing a session explicitly.
graph = tf.Graph()
# NOTE(review): as_default() is called without `with`, so the context manager
# it returns is never entered here; this line on its own presumably has no
# effect and the graph is selected via `InteractiveSession(graph=graph)`
# below -- confirm against the TensorFlow docs before relying on it.
graph.as_default()
session = tf.InteractiveSession(graph=graph)
session

<tensorflow.python.client.session.InteractiveSession at 0x7ff0d18c44a8>

In [46]:
v_0 = tf.constant([3, 4], dtype=tf.float32)
v_1 = tf.constant([4, 3], dtype=tf.float32)
v_2 = tf.constant([-3, 4], dtype=tf.float32)
v_3 = tf.constant([-4, 3], dtype=tf.float32)

V = tf.stack([v_0, v_1, v_2, v_3])

print(V, '\n')
print(V.eval())

Tensor("stack:0", shape=(4, 2), dtype=float32) 

[[ 3.  4.]
 [ 4.  3.]
 [-3.  4.]
 [-4.  3.]]


In [47]:
V_norm = tf.norm(V, axis=1, keep_dims=True)

print(V_norm, '\n')
print(V_norm.eval())

Tensor("norm/Sqrt:0", shape=(4, 1), dtype=float32) 

[[ 5.]
 [ 5.]
 [ 5.]
 [ 5.]]


In [48]:
U = V / V_norm

print(U, '\n')
print(U.eval())

Tensor("truediv:0", shape=(4, 2), dtype=float32) 

[[ 0.60000002  0.80000001]
 [ 0.80000001  0.60000002]
 [-0.60000002  0.80000001]
 [-0.80000001  0.60000002]]


In [49]:
UU = tf.diag_part(tf.matmul(U, U, transpose_b=True))

print(UU, '\n')
print(UU.eval())

Tensor("DiagPart:0", shape=(4,), dtype=float32) 

[ 1.  1.  1.  1.]


In [50]:
i = tf.constant([0], dtype=tf.int32)

u_i = tf.nn.embedding_lookup(U, i)

print(u_i, '\n')
print(u_i.eval())

Tensor("embedding_lookup:0", shape=(1, 2), dtype=float32) 

[[ 0.60000002  0.80000001]]


In [51]:
S = tf.matmul(u_i, U, transpose_b=True)

print(S, '\n')
print(S.eval())

Tensor("MatMul_1:0", shape=(1, 4), dtype=float32) 

[[  1.00000000e+00   9.60000038e-01   2.80000001e-01   7.15255766e-09]]


In [52]:
nn_values, nn_indices = tf.nn.top_k(S, 2)

print(nn_values, '\n')
print(nn_values.eval(), '\n')
print(nn_indices, '\n')
print(nn_indices.eval())

Tensor("TopKV2:0", shape=(1, 2), dtype=float32) 

[[ 1.          0.96000004]] 

Tensor("TopKV2:1", shape=(1, 2), dtype=int32) 

[[0 1]]


In [53]:
session.close()
del v_0, v_1, v_2, v_3, V, V_norm, U
del i, u_i, S, nn_values, nn_indices
del graph, session

In [54]:
class NearestWordsQuery:
    """Print the words whose embeddings score highest against a fixed word list.

    Scores are dot products between embedding rows. Usage: construct, call
    `build_graph` once inside the graph that owns the embeddings, then call
    `run` with a session whenever the neighbours should be printed.
    """

    def __init__(self, word_from_id, words, k=4):
        """Store the query configuration.

        Args:
            word_from_id: mapping from word id to word.
            words: ids of the words to query.
            k: number of neighbours reported per word.
        """
        self.k = k
        self.words = words
        self.word_from_id = word_from_id

    def build_graph(self, embeddings, name=None):
        """Add the similarity / top-k ops for `embeddings` to the graph.

        Sets `self.input_words` (the feed dict binding the query placeholder
        to `self.words`) and `self.nearest` (the `tf.nn.top_k` result).
        """
        with tf.name_scope(name, "nearest_words", [self.words, self.k]):
            query_ids = tf.placeholder(tf.int32, shape=(None,))

            query_vectors = tf.nn.embedding_lookup(embeddings, query_ids)
            scores = tf.matmul(query_vectors, embeddings, transpose_b=True)
            # One extra candidate is requested because the query word itself
            # is filtered out again in `nearest_words`.
            top = tf.nn.top_k(scores, self.k+1)

        self.input_words = {query_ids: self.words}
        self.nearest = top

    def nearest_words(self, target_id, nearest_indices, nearest_values):
        """Return up to `k` `(word, value)` pairs, skipping `target_id`.

        Args:
            target_id: id of the queried word; entries with this id are dropped.
            nearest_indices: candidate word ids.
            nearest_values: scores aligned with `nearest_indices`.
        """
        neighbours = []
        for word_id, value in zip(nearest_indices, nearest_values):
            if word_id == target_id:
                continue
            neighbours.append((self.word_from_id[word_id], value))
        return neighbours[:self.k]

    def format_words(self, word_pairs):
        """Return a generator of 'word (score)' strings, 3 decimals each."""
        template = '{} ({:,.3f})'
        return (template.format(label, score) for label, score in word_pairs)

    def run(self, session):
        """Evaluate the query with `session` and print one line per word."""
        top_values, top_ids = session.run(self.nearest,
                                          feed_dict=self.input_words)
        for row, word_id in enumerate(self.words):
            word = self.word_from_id[word_id]
            neighbours = self.nearest_words(
                word_id, top_ids[row], top_values[row])
            summary = ', '.join(self.format_words(neighbours))
            print('{}: {}'.format(word, summary))

In [55]:
rev_vocab = {0: 'unk', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e'}
vocabulary_size = len(rev_vocab)
embedding_size = 3

nn = NearestWordsQuery(rev_vocab, words=[2, 5], k=2)

with tf.Graph().as_default() as graph, \
    tf.Session(graph=graph) as session:
    
    V = 2 * np.random.rand(vocabulary_size, embedding_size) - 1
    U = V / np.linalg.norm(V, axis=1, keepdims=True)
    embeddings = tf.constant(U)
    
    nn.build_graph(embeddings)
    nn.run(session)

b: e (0.840), unk (0.616)
e: b (0.840), c (0.419)


## Experiments

In [None]:
def opt_adagrad(loss, learning_rate=1.0):
    """Return the result of `optimize_loss` for `loss` using Adagrad.

    Args:
        loss: loss tensor to optimise.
        learning_rate: learning rate handed to the optimizer.

    Returns:
        Whatever `tf.contrib.layers.optimize_loss` returns, wired to the
        graph's global step (which is created if it does not exist yet).
    """
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=global_step,
        learning_rate=learning_rate,
        optimizer='Adagrad')
    return train_op

In [None]:
def train(model_fn, input_fn, opt_fn, query,
          num_epochs=1, model_dir='/tmp/embedding_model', remove_model=True):
    """Train an embedding model and return the evaluated embeddings.

    Args:
        model_fn: zero-argument callable returning `(X, y, embeddings, loss)`.
        input_fn: zero-argument callable returning an iterable of
            `(X_batch, y_batch)` pairs; it is called once per epoch.
        opt_fn: callable mapping the loss to a training op.
        query: object with `build_graph(embeddings)` and `run(session)`;
            it is run after every epoch.
        num_epochs: number of passes over `input_fn()`.
        model_dir: checkpoint directory of the monitored session.
        remove_model: when true, an existing `model_dir` is deleted first.

    Returns:
        The value of the model's `embeddings` tensor after training.
    """
    if remove_model and os.path.isdir(model_dir):
        shutil.rmtree(model_dir)

    report_every = 10_000

    with tf.Graph().as_default():
        X, y, embeddings, loss_op = model_fn()
        train_op = opt_fn(loss_op)

        query.build_graph(embeddings)

        with tf.train.MonitoredTrainingSession(
            checkpoint_dir=model_dir) as session:

            for epoch_index in range(num_epochs):
                print('Epoch {}\n'.format(epoch_index + 1))

                # Running mean of the batch losses seen so far in this epoch.
                avg_loss = 0
                for step, (X_batch, y_batch) in enumerate(input_fn()):
                    feed = {X: X_batch, y: y_batch}
                    _, loss = session.run([train_op, loss_op], feed_dict=feed)

                    avg_loss = (loss + step * avg_loss) / (step + 1)
                    if step % report_every == 0:
                        print('...{:,d} Avarage loss: {:.3f}'.format(
                            step, avg_loss))

                print('\nAvarage loss: {:.3f}\n'.format(avg_loss))
                query.run(session)
                print()

            # Evaluate while the session is still open.
            final_embeddings = session.run(embeddings)

    return final_embeddings

In [68]:
def save_embeddings(file, embeddings):
    """Write an embedding matrix to a text file, one row per line.

    Row ``embeddings[i]`` becomes line ``i`` of the file: its values are
    formatted with five decimal places and separated by single spaces.

    Args:
        file: Path of the output file. It is opened in text write mode, so
            existing content is overwritten.
        embeddings: Indexable object exposing ``shape[0]`` (the number of
            rows) whose rows are iterables of numbers, e.g. a 2-D NumPy
            array.
    """
    with open(file, 'w') as out:
        num_rows = embeddings.shape[0]
        for row_index in range(num_rows):
            row = embeddings[row_index]
            line = ' '.join(format(value, '.5f') for value in row)
            out.write(line + '\n')

In [57]:
# Pick the word ids that are monitored during training: `valid_num_words`
# distinct ids drawn from 1 .. valid_range_words - 1 (id 0 is never drawn).
valid_num_words = 8
valid_range_words = 1000

# Sample from a privately seeded generator so the same validation words are
# chosen on every run (keeping the training logs comparable between runs)
# without touching the state of the global `random` module.
valid_words = random.Random(0).sample(range(1, valid_range_words), valid_num_words)

for word_id in valid_words:
    print(word_from_id[word_id])

each
length
writer
great
go
literature
seven
examples


In [58]:
# Query over the validation words with k=4; it is handed to train() as
# `query` by the training cells.
nearest_words = NearestWordsQuery(word_from_id, words=valid_words, k=4)

In [59]:
%%time

# Train the CBOW model for one epoch, checkpointing under HOME_DIR/cbow.
MODEL_DIR = os.path.join(HOME_DIR, 'cbow')

# Settings forwarded to model_cbow.
vocabulary_size = len(word_to_id)
embedding_size = 128
num_sampled = 64

# Settings forwarded to input_cbow.
batch_size = 128
window_size = 3

# Zero-argument factories expected by train(); they read the settings above
# at call time.
model_fn = lambda: model_cbow(vocabulary_size, embedding_size, num_sampled)
input_fn = lambda: input_cbow(data, batch_size, window_size)
opt_fn = lambda loss: opt_adagrad(loss, learning_rate=1.0)

cbow_embeddings = train(model_fn,
                        input_fn,
                        opt_fn,
                        nearest_words,
                        num_epochs=1,
                        model_dir=MODEL_DIR)

# Persist the result immediately after training. The path is built inline:
# the name previously used here (EMBEDDING_FILE) is not defined at this point
# of the notebook, so a fresh Restart & Run All would raise NameError only
# after the whole training run had finished.
save_embeddings(os.path.join(HOME_DIR, 'cbow.txt'), cbow_embeddings)

Epoch 1

...0 Avarage loss: 7.609
...10,000 Avarage loss: 3.445
...20,000 Avarage loss: 3.273
...30,000 Avarage loss: 3.186
...40,000 Avarage loss: 3.118
...50,000 Avarage loss: 3.076
...60,000 Avarage loss: 3.035
...70,000 Avarage loss: 2.999
...80,000 Avarage loss: 2.969
...90,000 Avarage loss: 2.944
...100,000 Avarage loss: 2.916
...110,000 Avarage loss: 2.886
...120,000 Avarage loss: 2.869
...130,000 Avarage loss: 2.847

Avarage loss: 2.844

each: every (0.632), any (0.551), all (0.357), incitement (0.327)
length: variation (0.377), maximum (0.375), halting (0.363), speed (0.360)
writer: author (0.525), politician (0.521), mathematician (0.519), poet (0.511)
great: little (0.422), dearborn (0.397), soi (0.373), considerable (0.365)
go: went (0.380), move (0.362), pass (0.358), preventative (0.351)
literature: texts (0.400), beaverbrook (0.357), markup (0.344), playwright (0.340)
seven: eight (0.869), five (0.833), six (0.832), four (0.826)
examples: aspects (0.431), elements (0.409

In [65]:
# Display the first row (index 0) of the trained CBOW embedding matrix.
cbow_embeddings[0]

array([ 0.01355005, -0.19189504,  0.08304871,  0.14788906, -0.01292881,
        0.08955733,  0.06460255, -0.01952947,  0.10569677,  0.01884526,
       -0.01156916, -0.14599091,  0.05206586,  0.10930529,  0.11521848,
       -0.03579207, -0.17535175,  0.13498679, -0.11360314,  0.00087663,
       -0.00106169, -0.05076494,  0.13074888,  0.00617049, -0.0657478 ,
        0.03324445, -0.09406804,  0.13334005,  0.03737927,  0.03893398,
        0.0273369 , -0.09000934, -0.02187724,  0.10807586, -0.07821658,
        0.02211384, -0.13984069, -0.07788607,  0.02657686,  0.07809026,
        0.01229459,  0.03779913, -0.05638366,  0.04662901, -0.08944251,
        0.07706796, -0.04529163,  0.02438457, -0.09905559, -0.06779324,
        0.04311862,  0.08599722,  0.01276149, -0.04310194,  0.02160779,
       -0.08086037,  0.06877899,  0.02897387,  0.04288683, -0.04100583,
       -0.02869713,  0.08337607,  0.03640424, -0.07170308, -0.03564997,
       -0.00045511, -0.01639324,  0.10533239,  0.0118693 ,  0.07

In [70]:
# Write the CBOW embeddings to HOME_DIR/cbow.txt and report the file size.
EMBEDDINGS_FILE = os.path.join(HOME_DIR, 'cbow.txt')
save_embeddings(EMBEDDINGS_FILE, cbow_embeddings)

print('Embeddings file size: {:,d} bytes'.format(
    os.path.getsize(EMBEDDINGS_FILE)))

Embeddings file size: 54,399,864 bytes


In [60]:
%%time

# Train the skip-gram model for one epoch, checkpointing under
# HOME_DIR/skip_gram.
MODEL_DIR = os.path.join(HOME_DIR, 'skip_gram')

# Settings forwarded to model_skip_gram.
vocabulary_size = len(word_to_id)
embedding_size = 128
num_sampled = 64

# Settings forwarded to input_skip_gram.
batch_size = 128
window_size = 3
num_skips = 2

# Zero-argument factories expected by train(); they read the settings above
# at call time.
model_fn = lambda: model_skip_gram(vocabulary_size, embedding_size, num_sampled)
input_fn = lambda: input_skip_gram(data, batch_size, window_size, num_skips)
opt_fn = lambda loss: opt_adagrad(loss, learning_rate=1.0)

skip_embeddings = train(
    model_fn=model_fn,
    input_fn=input_fn,
    opt_fn=opt_fn,
    query=nearest_words,
    num_epochs=1,
    model_dir=MODEL_DIR,
)

Epoch 1

...0 Avarage loss: 8.529
...10,000 Avarage loss: 3.854
...20,000 Avarage loss: 3.685
...30,000 Avarage loss: 3.615
...40,000 Avarage loss: 3.570
...50,000 Avarage loss: 3.541
...60,000 Avarage loss: 3.517
...70,000 Avarage loss: 3.490
...80,000 Avarage loss: 3.472
...90,000 Avarage loss: 3.462
...100,000 Avarage loss: 3.449
...110,000 Avarage loss: 3.436
...120,000 Avarage loss: 3.423
...130,000 Avarage loss: 3.414
...140,000 Avarage loss: 3.401
...150,000 Avarage loss: 3.390
...160,000 Avarage loss: 3.383
...170,000 Avarage loss: 3.376
...180,000 Avarage loss: 3.369
...190,000 Avarage loss: 3.362
...200,000 Avarage loss: 3.351
...210,000 Avarage loss: 3.336
...220,000 Avarage loss: 3.332
...230,000 Avarage loss: 3.325
...240,000 Avarage loss: 3.322
...250,000 Avarage loss: 3.312
...260,000 Avarage loss: 3.307

Avarage loss: 3.306

each: every (0.658), any (0.554), all (0.441), several (0.371)
length: size (0.406), amount (0.394), cost (0.385), omphalos (0.339)
writer: author 

In [71]:
# Write the skip-gram embeddings to HOME_DIR/skip_gram.txt and report the
# file size.
EMBEDDINGS_FILE = os.path.join(HOME_DIR, 'skip_gram.txt')
save_embeddings(EMBEDDINGS_FILE, skip_embeddings)

print('Embeddings file size: {:,d} bytes'.format(
    os.path.getsize(EMBEDDINGS_FILE)))

Embeddings file size: 54,399,384 bytes
