## nb.
### 1. This does not implement negative sampling or hierarchical softmax, this is only intended as a simple illustration.
### 2. This implementation uses Skip-gram

In [1]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
import numpy as np

## Raw Text for training word representation (Word2vec)

In [3]:
corpus_raw = 'He is the king . The king is royal . She is the royal  queen '
corpus_raw = corpus_raw.lower()

## Corpus derived from raw text

In [4]:
words = []
for word in corpus_raw.split():
    if word != '.':
        words.append(word)
        
words = set(words)

word2int = {}
int2word = {}

vocab_size = len(words)

#enumerate adds counter, i, to iterable
for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word


In [5]:
word2int,int2word

({'he': 6, 'is': 5, 'king': 4, 'queen': 2, 'royal': 3, 'she': 1, 'the': 0},
 {0: 'the', 1: 'she', 2: 'queen', 3: 'royal', 4: 'king', 5: 'is', 6: 'he'})

In [6]:
len(words)

7

In [7]:
int2word[word2int['queen']]

'queen'

In [8]:
raw_sentences = corpus_raw.split('.')
sentences = [sentence for sentence in raw_sentences]
sentences

['he is the king ', ' the king is royal ', ' she is the royal  queen ']

## Generating skip-grams with window size 2

In [9]:
data = []
WINDOW_SIZE = 2
for sentence in sentences:
    sentence = sentence.split()
    sentence_length = len(sentence)
    for word_index,word in enumerate(sentence):
        #getting n-grams of sizes WINDOW SIZE; nb stands for neighbour
        for nb_word in sentence[max(word_index - WINDOW_SIZE,0):1+min(word_index + WINDOW_SIZE,sentence_length)]:
            if nb_word != word:
                data.append([word,nb_word])

In [10]:
data

[['he', 'is'],
 ['he', 'the'],
 ['is', 'he'],
 ['is', 'the'],
 ['is', 'king'],
 ['the', 'he'],
 ['the', 'is'],
 ['the', 'king'],
 ['king', 'is'],
 ['king', 'the'],
 ['the', 'king'],
 ['the', 'is'],
 ['king', 'the'],
 ['king', 'is'],
 ['king', 'royal'],
 ['is', 'the'],
 ['is', 'king'],
 ['is', 'royal'],
 ['royal', 'king'],
 ['royal', 'is'],
 ['she', 'is'],
 ['she', 'the'],
 ['is', 'she'],
 ['is', 'the'],
 ['is', 'royal'],
 ['the', 'she'],
 ['the', 'is'],
 ['the', 'royal'],
 ['the', 'queen'],
 ['royal', 'is'],
 ['royal', 'the'],
 ['royal', 'queen'],
 ['queen', 'the'],
 ['queen', 'royal']]

## Transforming to one-hot encoding

In [11]:
def to_one_hot(data_point_index,vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

In [12]:
x_train = []
y_train = []

for data_word in data:
    x_train.append(to_one_hot(word2int[data_word[0]],vocab_size))
    y_train.append(to_one_hot(word2int[data_word[1]],vocab_size))
    
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [13]:
x_train[:5]

array([[0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.]])

In [14]:
y_train[:5]

array([[0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.]])

In [15]:
#34 training examples, 7 vocabularies
x_train.shape,y_train.shape

((34, 7), (34, 7))

## Input layer and Expected Output

In [16]:
x = tf.placeholder(tf.float32,shape=(None,vocab_size))
y_label = tf.placeholder(tf.float32,shape=(None,vocab_size))

## Hidden Layer (Training 5 dimension word vectors)

In [17]:
dimension = 5
W1 = tf.Variable(tf.random_normal([vocab_size,dimension]))
b1 = tf.Variable(tf.random_normal([dimension]))
hidden_representation = tf.add(tf.matmul(x,W1),b1)

## Output Layer

In [18]:
W2 = tf.Variable(tf.random_normal([dimension,vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation,W2),b2))

## Instantiation of Session; Initialization of Placeholders, Variables and Constants

In [19]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

## Loss Function

In [20]:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))

## Gradient Descent

In [21]:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

In [22]:
n_epoch = 5001

In [23]:
for epoch in range(n_epoch):
    sess.run(train_step,feed_dict={x:x_train,y_label:y_train})
    if epoch%1000 == 0: print('epoch:',epoch,'loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

epoch: 0 loss is :  6.3425035
epoch: 1000 loss is :  1.3293065
epoch: 2000 loss is :  1.3236219
epoch: 3000 loss is :  1.3222727
epoch: 4000 loss is :  1.3216951
epoch: 5000 loss is :  1.3213799


In [24]:
print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

loss is :  1.3213799


In [25]:
sess.run(W1)[:10]

array([[ 0.7030329 ,  0.03842852, -0.24403833,  1.5422153 , -2.71979   ],
       [ 0.82068396, -1.5129514 , -0.6688866 , -1.2041605 ,  0.8323536 ],
       [ 2.4273376 ,  0.3290126 , -0.5134532 , -0.2482488 ,  0.8731189 ],
       [-0.16053797, -2.0391061 , -0.82571256,  1.152142  , -0.06711878],
       [ 0.8737598 ,  0.74421084, -1.013451  , -1.0456054 ,  0.23717858],
       [ 0.71118534,  1.4477081 ,  1.5142787 ,  1.349745  ,  0.8524112 ],
       [-0.24708463, -0.3634684 , -0.9939318 , -1.0739028 ,  1.012634  ]],
      dtype=float32)

In [26]:
sess.run(b1)

array([-1.5086834 ,  0.75874305, -0.7443114 , -0.4656814 ,  0.51429737],
      dtype=float32)

## Word Vectors = (Weights + Bias) of hidden layer

In [27]:
word_vectors = sess.run(W1+b1)

In [28]:
word_vectors[word2int['queen']]

array([ 0.9186542,  1.0877557, -1.2577646, -0.7139302,  1.3874162],
      dtype=float32)

In [29]:
word_vectors.shape

(7, 5)

## Measure similarity of vectors through euclidean distance
### e.g. root((y2-y1) **2 + (X2 - X1) **2) == distance between points x and y

In [30]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2)**2))

def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

In [31]:
int2word[find_closest(word2int['king'],word_vectors)]

'he'

In [32]:
int2word[find_closest(word2int['queen'],word_vectors)]

'king'

In [33]:
int2word[find_closest(word2int['royal'],word_vectors)]

'she'

In [34]:
int2word[find_closest(word2int['is'],word_vectors)]

'queen'