In [1]:
import numpy as np
from collections import defaultdict

<h1>Word2Vec</h1>

<p>CBOW: context -> center</p>
<p>Skip-Gram: center -> context</p>

<h2>A Toy Example (using the skip-gram model)</h2>
<p>Assuming: </p>
<ul>
    <li>vocabulary of 8</li>
    <li>word-representation vectors of size 6 (i.e. 6x1)</li>
    <li>scaling our outputs using a softmax function</li>
</ul>

In [2]:
# Toy corpus: a single tokenized sentence, wrapped in an outer list so the
# array is 2-D (one row per sentence, one column per token).
corpus = np.array(["the quick brown fox jumped over the lazy dog".split()])
# Number of components in each word vector
embedding_length = 6

def softmax(z):
    """Turn raw scores into values that sum to 1 along axis 0.

    The global maximum of ``z`` is subtracted before exponentiating so the
    largest exponent is exp(0) = 1, which keeps np.exp from overflowing
    without changing the normalized result.
    """
    shifted = z - np.max(z)
    exponentials = np.exp(shifted)
    normalizer = np.sum(exponentials, axis=0)
    return exponentials / normalizer

<h3>Set parameters and generate training data</h3>

In [3]:
# Word counts: how many times each token occurs over the whole corpus
wd_counts = defaultdict(int)
for row in corpus:
    for word in row:
        wd_counts[word] += 1

# Hyper-parameters
V = len(wd_counts)      # size of the vocabulary (number of distinct tokens)
N = embedding_length    # length of each word vector embedding
C = 2                   # context window size: C words on either side of the target
ETA = 0.1               # learning rate for the network

# Look-up tables: the vocabulary is sorted, so a word's index is its
# alphabetical rank; word_index and index_word are inverse mappings.
vocabulary = sorted(wd_counts)
word_index = {token: position for position, token in enumerate(vocabulary)}
index_word = dict(enumerate(vocabulary))

def one_hot(word):
    """Return a length-V float vector that is 1 at word_index[word] and 0 elsewhere.

    A word that is not a key of ``word_index`` raises KeyError.
    """
    encoding = np.zeros(V)
    position = word_index[word]
    encoding[position] = 1
    return encoding

# Build one (target, context) pair per token position. Each pair holds the
# one-hot vector of the center word and an array stacking the one-hot vectors
# of its neighbours (up to C on each side, fewer near the sentence edges).
training_pairs = []

for sentence in corpus:
    sentence_len = len(sentence)

    for i, word in enumerate(sentence):
        w_target = one_hot(word)
        w_context = []

        for j in range(i - C, i + C + 1):
            # Skip the center word itself and positions outside the sentence
            if j >= 0 and j != i and j < sentence_len:
                w_context.append(one_hot(sentence[j]))
        training_pairs.append((w_target, np.array(w_context)))

# The context arrays have different numbers of rows, so the pairs are ragged.
# np.array() on ragged nested sequences raises ValueError on NumPy >= 1.24
# (and emitted a deprecation warning before that), so fill a pre-allocated
# object array element by element instead. The result keeps shape (n_pairs, 2).
training_data = np.empty((len(training_pairs), 2), dtype=object)
for pair_idx, (target_vec, context_mat) in enumerate(training_pairs):
    training_data[pair_idx, 0] = target_vec
    training_data[pair_idx, 1] = context_mat

print(training_data)
print(training_data.shape)

[[array([0., 0., 0., 0., 0., 0., 0., 1.])
  array([[0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.]])]
 [array([0., 0., 0., 0., 0., 0., 1., 0.])
  array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.]])]
 [array([1., 0., 0., 0., 0., 0., 0., 0.])
  array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.]])]
 [array([0., 0., 1., 0., 0., 0., 0., 0.])
  array([[0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.]])]
 [array([0., 0., 0., 1., 0., 0., 0., 0.])
  array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])]
 [array([0., 0., 0., 0., 0., 1., 0., 0.])
  array([[0., 0., 1., 0., 0., 0

<h3>Initializing values</h3>
<p>Assuming the first word of the corpus ("the") is our target (center word)</p>

In [4]:
# x  -> one-hot encoding of the center word of the first training pair.
#       training_data[0] belongs to the first corpus token, "the" (not "brown").
# Yc -> one-hot encodings of that word's context words, one per row.
x = training_data[0][0]
Yc = training_data[0][1]

# Draw the initial weights from a seeded, local generator so that
# Restart Kernel -> Run All reproduces the same numbers every time.
rng = np.random.RandomState(0)

# W1 (V x N): one row per vocabulary word
# W2 (N x V): one column per vocabulary word
W1 = rng.uniform(low=-0.5, high=0.5, size=(V, N))
W2 = rng.uniform(low=-0.5, high=0.5, size=(N, V))

print("x:\n{}".format(x))
print("yc:\n{}".format(Yc))
print("W1:\n{}".format(W1))
print("W2:\n{}".format(W2))

x:
[0. 0. 0. 0. 0. 0. 0. 1.]
yc:
[[0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]]
W1:
[[-0.05133965  0.31932449 -0.00076711 -0.31183346 -0.06962123 -0.04505802]
 [ 0.14988032 -0.23083012 -0.11909168 -0.49174947  0.49448661  0.46887054]
 [ 0.33277589  0.05662652  0.2674883   0.19789967  0.02792778 -0.22627179]
 [-0.02431183 -0.4039943   0.38077218 -0.01073757  0.41596685 -0.37685937]
 [ 0.39006347  0.47808053 -0.24210394  0.30791468  0.27615354  0.38503764]
 [ 0.2705183   0.43405893  0.10413934 -0.34114987 -0.26725919  0.4783493 ]
 [-0.07697521  0.20290811  0.23088942 -0.37217582  0.21298191  0.15616153]
 [-0.12239315 -0.48511851  0.21738647 -0.01070983  0.12328203  0.45453927]]
W2:
[[ 0.19396665 -0.27469498 -0.39012654  0.00415826 -0.48835719  0.44879869
   0.09504474 -0.1669008 ]
 [-0.3625185   0.47701613 -0.42872927 -0.10263754 -0.14334517  0.37864322
  -0.09233567 -0.19690589]
 [-0.15118394 -0.01370951  0.31048555 -0.44533149  0.48563386 -0.10220793
  -0.04726353 -0.47660161]


<h3>Forward Pass: Getting Through the network</h3>
<ol>
    <li>Use one-hot encoding to extract word vector of interest (h)</li>
    <li>Find scores for the output (i.e. scores that represent confidence of window co-occurrence with the center word)</li>
    <li>Pass the scores through the softmax function to compress them to "probabilities" (values in the range [0, 1])</li>
</ol>

In [5]:
# Step 1: hidden layer -- x is one-hot, so the product selects one row of W1
h = x.dot(W1)
print("h:\n{}".format(h))

# Step 2: one raw score per vocabulary word
u = W2.T.dot(h)
print("u:\n{}".format(u))

# Step 3: pass the scores through softmax so they lie in [0, 1] and sum to 1
y = softmax(u)

h:
[-0.12239315 -0.48511851  0.21738647 -0.01070983  0.12328203  0.45453927]
u:
[-0.06666894 -0.42384246  0.52351384 -0.16437925  0.43352415 -0.26011944
 -0.11550046  0.1059884 ]


<h3>Error calculation and backpropagation</h3>

In [14]:
# Prediction error summed over every context word: sum_c (y - yc)
EI = np.sum([y - yc for yc in Yc], axis=0)
print("Error: \n{}".format(EI))

# Gradient for W2: outer product of the hidden layer and the summed error
dl_dW2 = np.outer(h, EI)
print("Deltas for W2, shape: {}\n{}".format(dl_dW2.shape, dl_dW2))

# Gradient for W1: because x is one-hot, the outer product is zero everywhere
# except the row where x is 1, so only the center word's vector gets updated.
# Computed before W2 is modified below, so it uses the pre-update W2.
hidden_error = W2.dot(EI.T)
dl_dW1 = np.outer(x, hidden_error)
print("Deltas for W1, shape:{}\n{}".format(dl_dW1.shape, dl_dW1))

# Gradient-descent step, applied in place to both weight matrices
W1 -= ETA * dl_dW1
W2 -= ETA * dl_dW2

Error: 
[-0.77838921  0.15505023  0.39985637  0.20098139  0.36544497  0.1826318
 -0.78895084  0.26337529]
Deltas for W2, shape: (6, 8)
[[ 0.09526951 -0.01897709 -0.04893968 -0.02459874 -0.04472796 -0.02235288
   0.09656218 -0.03223533]
 [ 0.37761102 -0.07521774 -0.19397773 -0.09749979 -0.17728412 -0.08859807
   0.38273466 -0.12776823]
 [-0.16921128  0.03370582  0.08692336  0.04369063  0.07944279  0.03970168
  -0.17150724  0.05725422]
 [ 0.00833642 -0.00166056 -0.00428239 -0.00215248 -0.00391385 -0.00195596
   0.00844953 -0.0028207 ]
 [-0.0959614   0.01911491  0.0492951   0.02477739  0.0450528   0.02251522
  -0.09726346  0.03246944]
 [-0.35380847  0.07047642  0.18175042  0.09135393  0.16610909  0.08301333
  -0.35860914  0.11971441]]
Deltas for W1, shape:(8, 6)
[[-0.          0.          0.          0.         -0.          0.        ]
 [-0.          0.          0.          0.         -0.          0.        ]
 [-0.          0.          0.          0.         -0.          0.        ]
 [-0.

In [None]:
# Scratch check: error propagated back through W2 to the hidden layer,
# printed next to the one-hot input x.
# NOTE(review): if this runs after the update cell, W2 has already been
# modified in place, so the value differs from the one used for dl_dW1.
derp = W2.dot(EI.T)
print(derp)
print(x)