In [1]:
import numpy as np
from collections import defaultdict

<h1>Word2Vec</h1>

<p>CBOW: context -> center</p>
<p>Skip-Gram: center -> context</p>

<h2>A Toy Example (using the skip-gram model)</h2>
<p>Assuming: </p>
<ul>
    <li>vocabulary of 8</li>
    <li>word-representation vectors of size 6 (i.e. 6x1)</li>
    <li>scaling our outputs using a softmax function</li>
</ul>

In [2]:
# Toy corpus: a single tokenised sentence, stored as one row of a 2-D string
# array so that each row of `corpus` can be iterated over as one sentence.
corpus = np.array(["the quick brown fox jumped over the lazy dog".split()])

# Number of components in each word-embedding vector.
embedding_length = 6

def softmax(z, axis=0):
    """Return the softmax of ``z`` computed along ``axis``.

    Parameters
    ----------
    z : array_like
        Raw scores. A 1-D input is treated as a single score vector; for an
        N-D input each slice along ``axis`` is normalised independently.
    axis : int, optional
        Axis along which the outputs sum to 1. Defaults to 0, which matches
        the original behaviour of this helper.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``z`` whose entries lie in [0, 1] and
        sum to 1 along ``axis``.
    """
    z = np.asarray(z)
    # Subtract the maximum *along the normalisation axis* (not the global
    # maximum): the result is mathematically unchanged, but a slice whose
    # values are all far below the global maximum no longer underflows to
    # exp(.) == 0 everywhere, which would produce 0/0 = NaN.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    e_z = np.exp(shifted)
    return e_z / np.sum(e_z, axis=axis, keepdims=True)

<h3>Set parameters and generate training data</h3>

In [3]:
# Count how often every token appears anywhere in the corpus.
wd_counts = defaultdict(int)
for row in corpus:
    for word in row:
        wd_counts[word] += 1

# Hyper-parameters of the toy network.
V = len(wd_counts)      # vocabulary size: number of distinct tokens
N = embedding_length    # length of each word-embedding vector
C = 2                   # context window: C words on either side of the target
ETA = 0.1               # learning rate used for the weight updates

# Look-up tables between words and their (alphabetically sorted) indices.
vocabulary = sorted(wd_counts)
index_word = dict(enumerate(vocabulary))
word_index = {word: i for i, word in index_word.items()}

def one_hot(word):
    """Encode ``word`` as a one-hot vector of length ``V``.

    The position set to 1 is the word's index in ``word_index``; every other
    entry is 0. A word that is not a key of ``word_index`` raises ``KeyError``.
    """
    encoding = np.zeros(V)
    encoding[word_index[word]] = 1
    return encoding

# Build one (target, contexts) pair per token position: the one-hot vector of
# the centre word, and a (k, V) array holding the one-hot vectors of the
# (up to 2*C) words that surround it.
training_pairs = []

for sentence in corpus:
    sentence_len = len(sentence)

    for i, word in enumerate(sentence):
        target_vec = one_hot(word)
        # Clamp the window to the sentence bounds and skip the centre word.
        window_start = max(0, i - C)
        window_stop = min(sentence_len, i + C + 1)
        context_vecs = np.array(
            [one_hot(sentence[j]) for j in range(window_start, window_stop) if j != i]
        )
        training_pairs.append((target_vec, context_vecs))

# The pairs are ragged (targets are (V,), contexts are (k, V) with varying k),
# so calling np.array() on the nested list raises ValueError on NumPy >= 1.24.
# Fill an object array element by element instead; this keeps the (n, 2)
# layout, so training_data[i][0] / training_data[i][1] still work.
training_data = np.empty((len(training_pairs), 2), dtype=object)
for row_idx, (target_vec, context_vecs) in enumerate(training_pairs):
    training_data[row_idx, 0] = target_vec
    training_data[row_idx, 1] = context_vecs

print(training_data)

[[array([0., 0., 0., 0., 0., 0., 0., 1.])
  array([[0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.]])]
 [array([0., 0., 0., 0., 0., 0., 1., 0.])
  array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.]])]
 [array([1., 0., 0., 0., 0., 0., 0., 0.])
  array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.]])]
 [array([0., 0., 1., 0., 0., 0., 0., 0.])
  array([[0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.]])]
 [array([0., 0., 0., 1., 0., 0., 0., 0.])
  array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])]
 [array([0., 0., 0., 0., 0., 1., 0., 0.])
  array([[0., 0., 1., 0., 0., 0

<h3>Initializing values</h3>
<p>Assuming the third word of the sentence ("brown", i.e. <code>training_data[2]</code>) is our target (center word)</p>

In [4]:
# Position in the sentence of the training example we walk through
# (the third word, "brown").
TARGET_POSITION = 2

# x  -> one-hot encoding of our word of interest ("brown" in this case)
# Yc -> one-hot encodings of its context words, one per row
x = training_data[TARGET_POSITION][0]
Yc = training_data[TARGET_POSITION][1]

# Use a seeded generator so the "random" initial weights (and every number
# derived from them below) are reproducible on Restart & Run All.
rng = np.random.default_rng(42)

# W1 (V x N): input -> hidden weights; row i is the embedding of word i
# W2 (N x V): hidden -> output weights; column j scores word j as a context word
W1 = rng.uniform(low=-0.5, high=0.5, size=(V, N))
W2 = rng.uniform(low=-0.5, high=0.5, size=(N, V))

print("x:\n{}".format(x))
print("yc:\n{}".format(Yc))
print("W1:\n{}".format(W1))
print("W2:\n{}".format(W2))

x:
[1. 0. 0. 0. 0. 0. 0. 0.]
yc:
[[0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]]
W1:
[[ 0.08614698 -0.36750216 -0.33938485  0.43620256 -0.26515335  0.18553837]
 [ 0.12985622  0.04285785  0.15063536  0.37230336  0.12039416 -0.22919682]
 [ 0.17431804  0.49787207  0.215355    0.02048259 -0.14082999  0.13227316]
 [-0.20620978 -0.37244012  0.01139629  0.49346479  0.41051038  0.01246903]
 [ 0.26771751 -0.14853589  0.44364546 -0.21972808  0.18541168 -0.13683058]
 [-0.25515609  0.43829243 -0.41089684  0.28826452 -0.313391    0.42865522]
 [-0.24822564  0.22069793  0.17458605 -0.38330372  0.47706552 -0.03015181]
 [ 0.14855631 -0.47574165 -0.01324896 -0.10695233 -0.16533702  0.37926315]]
W2:
[[-0.13743996  0.18054415  0.38884914 -0.37203156 -0.31430562  0.33042897
   0.06733084 -0.0082668 ]
 [-0.20449573  0.27514662 -0.40266812  0.04738372  0.0676584  -0.10378056
  -0.14718391  0.34350396]
 [-0.40257345 -0.16245016  0.48960491 -0.007522

<h3>Forward Pass: Getting Through the network</h3>
<ol>
    <li>Use one-hot encoding to extract word vector of interest (h)</li>
    <li>Find scores for the output (i.e. scores that represent confidence of window co-occurrence with the center word)</li>
    <li>Pass the scores through the softmax function to compress them to "probabilities" (values in the range [0, 1])</li>
</ol>

In [5]:
# Step 1: hidden layer. x is one-hot, so x . W1 picks out the row of W1
# that belongs to the centre word.
h = x.dot(W1)
print(f"h:\n{h}")

# Step 2: raw output scores, one per vocabulary word.
u = W2.T.dot(h)
print(f"u:\n{u}")

# Step 3: squash the scores into values in [0, 1] that sum to 1.
y = softmax(u)

h:
[ 0.08614698 -0.36750216 -0.33938485  0.43620256 -0.26515335  0.18553837]
u:
[ 0.13033247 -0.10238548 -0.16131858 -0.06921065  0.0370069  -0.16138054
  0.19822338 -0.14208718]


<h3>Error calculation and backpropagation</h3>

In [6]:
# Prediction error summed over all context words: sum_c (y - y_c).
EI = np.sum([y - yc for yc in Yc], axis=0)
print(f"Error: \n{EI}")

# Gradient of the loss with respect to the hidden -> output weights.
dl_dW2 = np.outer(h, EI)
print(f"Deltas for W2:\n{dl_dW2}")

# Gradient with respect to the input -> hidden weights. Because x is one-hot,
# only one row is non-zero: the row of the centre word "brown" (W1[0]), so
# that is the only word vector the update below changes -- as expected.
dl_dW1 = np.outer(x, W2.dot(EI))
print(f"Deltas for W1:\n{dl_dW1}")

# Gradient-descent step. Both gradients were computed from the pre-update
# weights above; the arrays are modified in place.
W1 -= ETA * dl_dW1
W2 -= ETA * dl_dW2

Error: 
[ 0.5840973   0.46282529 -0.56366228 -0.52156304  0.53205242  0.43631069
 -0.37487071 -0.55518968]
Deltas for W2:
[[ 0.05031822  0.039871   -0.0485578  -0.04493108  0.04583471  0.03758685
  -0.03229398 -0.04782791]
 [-0.21465702 -0.17008929  0.2071471   0.19167554 -0.19553041 -0.16034512
   0.13776579  0.20403341]
 [-0.19823377 -0.15707589  0.19129844  0.17701059 -0.18057053 -0.14807724
   0.12722544  0.18842297]
 [ 0.25478474  0.20188558 -0.24587093 -0.22750713  0.23208263  0.19031984
  -0.16351956 -0.24217516]
 [-0.15487536 -0.12271968  0.14945694  0.13829419 -0.14107548 -0.11568924
   0.09939822  0.1472104 ]
 [ 0.10837246  0.08587185 -0.10458098 -0.09676996  0.09871614  0.08095238
  -0.0695529  -0.10300899]]
Deltas for W1:
[[-0.06556768  0.06533697 -0.74971385 -0.20639773 -0.22495561 -0.45691992]
 [-0.          0.         -0.         -0.         -0.         -0.        ]
 [-0.          0.         -0.         -0.         -0.         -0.        ]
 [-0.          0.         -0.  