## Preprocessing input and output

In [1]:
from pathlib import Path
from math import floor, ceil
import numpy as np
data_dir = Path('./data')
files = ['wiki_' + color for color in ['black', 'blue', 'brown', 'color', 'cyan', 'green',
                                       'grey', 'indigo', 'magenta', 'orange', 'pink', 'purple',
                                       'red', 'violet', 'white', 'yellow']]
strings = []
for file in files:
    strings.append(open(data_dir.joinpath(file).with_suffix('.txt')).read())

corpus = (' '.join(strings)).split()
vocab = set(corpus)
vocab_size = len(vocab)
print("Corpus length : {}".format(len(corpus)))
print("Vocab length : {}".format(len(vocab)))

word_index = {}
index_word = {}
# create word indices
for i, word in enumerate(vocab):
    word_index[word] = i
    index_word[i] = word

# create input and output samples
# window iterator


def windows(corpus, window_len=3):
    corpus_len = len(corpus)
    if corpus_len < window_len:
        raise ValueError("Corpus length cannot be smaller than window length")
    half = int(floor(window_len/2))
    if corpus_len % 2:  # even
        pre_pad = half - 1
        post_pad = half
    else:  # odd
        pre_pad = half
        post_pad = half
    for i in range(pre_pad, corpus_len - post_pad):
        yield corpus[i-pre_pad: i + post_pad + 1]

# Creates one hot vectors for words


def get_one_hot(index, vocab_size):
    zeros = np.zeros(vocab_size)
    zeros[index] = 1.
    return zeros


# hardcoding for window_size = 3
vI = []
vO1 = []
vO2 = []

for window in windows(corpus):
    # hard coding for window_size=3
    vI.append(get_one_hot(word_index[window[1]], vocab_size))
    vO1.append(get_one_hot(word_index[window[0]], vocab_size))
    vO2.append(get_one_hot(word_index[window[2]], vocab_size))

Corpus length : 28210
Vocab length : 3508


In [2]:

vnpI = np.vstack(vI)
vnpO1 = np.vstack(vO1)
vnpO2 = np.vstack(vO2)

In [3]:
print("Shape of Input: {}".format(vnpI.shape))
print("Shape of Output-1: {}".format(vnpO1.shape))
print("Shape of Output-2: {}".format(vnpO2.shape))

Shape of Input: (28208, 3508)
Shape of Output-1: (28208, 3508)
Shape of Output-2: (28208, 3508)


## Model Implementation

We need the following components to implement the core word2vec model in tensorflow and as we will see, these components can seen as layers:

1. **The input word embeddings (weights)**: These weights, represented as a matrix $\boldsymbol{W_i}$, transform the onehot encoded word representation $\boldsymbol{i}$ into the distributed wordvector representation $\boldsymbol{v}$.
 
2. **The output word embeddings (set of weights)**: These set of matrices $\boldsymbol{W_{o_1}}, \boldsymbol{W_{o_2}}, ...$, hold the embeddings of words when they are in the context i.e., output. The number depends on the size of the context that we choose for the model.

3. **Softmax**: To transform the dot product scores into probabilities.

4. **Negative log-likelihood/ Cross-entropy loss**: The loss functions for the optimizer.

![Model](word2vec_model_structure.png)

In [4]:
import tensorflow as tf
embd_dim = 10
# create placeholder to feed the input
I = tf.placeholder(tf.float32, shape=(None, vocab_size))
O1 = tf.placeholder(tf.float32, shape=(None, vocab_size))
O2 = tf.placeholder(tf.float32, shape=(None, vocab_size))
# Input and output embeddings
Wi = tf.get_variable("Wi", shape=(vocab_size, embd_dim))
Wo1 = tf.get_variable("Wo1", shape=(embd_dim, vocab_size))
Wo2 = tf.get_variable("Wo2", shape=(embd_dim, vocab_size))

# create the model
Ei = tf.matmul(I, Wi)
So1 = tf.matmul(Ei, Wo1)
So2 = tf.matmul(Ei,Wo2)
#Po1 = tf.nn.softmax(So1)
#Po2 = tf.nn.softmax(So2)
loss1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=So1, labels=O1), name="loss1")
loss2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=So2, labels=O2), name="loss2")
loss = tf.add(loss1, loss2, name="total_loss")
training_op = tf.train.AdamOptimizer().minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



## Training

In [5]:
np.random.seed(1)
batch_size = 64
epochs = 100
num_samples = vnpI.shape[0]
num_steps = epochs*ceil(num_samples/batch_size)


def shuffle_in_unison(a, b, c):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p], c[p]


def train_batch(I, O1, O2, batch_size=64, steps=1000):
    for i in range(steps):
        I_batch = np.take(
            I, range(batch_size*i, batch_size*(i+1)), axis=0, mode='wrap')
        O1_batch = np.take(
            O1, range(batch_size*i, batch_size*(i+1)), axis=0, mode='wrap')
        O2_batch = np.take(
            O2, range(batch_size*i, batch_size*(i+1)), axis=0, mode='wrap')
        yield shuffle_in_unison(I_batch, O1_batch, O2_batch)


init = tf.global_variables_initializer()
loss_summary = tf.summary.scalar("loss", loss)
logdir = 'logs/word2vec'
with tf.Session() as sess:
    logwriter = tf.summary.FileWriter(logdir, tf.get_default_graph())
    sess.run(init)
    for step, (i_batch, o1_batch, o2_batch) in enumerate(train_batch(vnpI, vnpO1, vnpO2, batch_size=batch_size, steps=num_steps)):
        sess.run(training_op, feed_dict={
                 I: i_batch, O1: o1_batch, O2: o2_batch})
        if step % 100 == 0:
            summary_str = loss_summary.eval(feed_dict={I: i_batch, O1: o1_batch, O2: o2_batch})
            logwriter.add_summary(summary_str, step)
    summary_str = loss_summary.eval(feed_dict={I: i_batch, O1: o1_batch, O2: o2_batch})
    logwriter.add_summary(summary_str, step)
    saver = tf.train.Saver([Wi, Wo1, Wo2])
    saver.save(sess, logdir)

In [12]:
with open(logdir + '/words.tsv', 'w') as words_file:
    for i in range(vocab_size):
        words_file.write(index_word[i])
        words_file.write('\n')

In [7]:
2 % 100

2

In [8]:
Ei

<tf.Tensor 'MatMul:0' shape=(?, 10) dtype=float32>

In [9]:
Wo1

<tf.Variable 'Wo1:0' shape=(10, 3508) dtype=float32_ref>

In [10]:
So1

<tf.Tensor 'MatMul_1:0' shape=(?, 3508) dtype=float32>

In [11]:
O1

<tf.Tensor 'Placeholder_1:0' shape=(?, 3508) dtype=float32>