In [1]:
import numpy as np
import re
import nltk
from utils2 import get_dict

# The continuous bag-of-words model
The CBOW model is based on a neural network, the architecture of which looks like the figure below, as you'll recall from the lecture.

<div style="width:image width px; font-size:100%; text-align:center;"><img src='cbow_model_architecture.png' alt="alternate text" width="width" height="height" style="width:917;height:337;" /> Figure 1 </div>

This part of the notebook will walk you through:

- The two activation functions used in the neural network.

- Forward propagation.

- Cross-entropy loss.

- Backpropagation.

- Gradient descent.

- Extracting the word embedding vectors from the weight matrices once the neural network has been trained.

### Activation Functions
**Softmax**
The second activation function that you need is softmax. This function is used to calculate the values of the output layer of the neural network, using the following formulas:

\begin{align}
 \mathbf{z_2} &= \mathbf{W_2}\mathbf{h} + \mathbf{b_2}   \tag{3} \\
 \mathbf{\hat y} &= \mathrm{softmax}(\mathbf{z_2})   \tag{4} \\
\end{align}

To calculate softmax of a vector $\mathbf{z}$, the $i$-th component of the resulting vector is given by:

$$ \textrm{softmax}(\textbf{z})_i = \frac{e^{z_i} }{\sum\limits_{j=1}^{V} e^{z_j} }  \tag{5} $$

Let's work through an example.

**ReLU**  
ReLU is used to calculate the values of the hidden layer, in the following formulas:

\begin{align}
 \mathbf{z_1} &= \mathbf{W_1}\mathbf{x} + \mathbf{b_1}  \tag{1} \\
 \mathbf{h} &= \mathrm{ReLU}(\mathbf{z_1})  \tag{2} \\
\end{align}


In [6]:
def relu(z):
    # BEGIN your code here
    result = z.copy()
    result[result < 0] = 0
    # END your code here
    
    return result

def softmax(z):
    # BEGIN your code here
    e_z = np.exp(z)
    sum_e_z = np.sum(e_z)
    return e_z / sum_e_z
    # END your code here

In [2]:
def tokenize(corpus):
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)  # tokenize string to words
    data = [ ch.lower() for ch in data
             if ch.isalpha()
             or ch == '.'
             or emoji.get_emoji_regexp().search(ch)
           ]
    return data


corpus = 'I am happy because I am learning'
print(f'Corpus:  {corpus}')
words = tokenize(corpus)
print(f'Words (tokens):  {words}')
word2Ind, Ind2word = get_dict(words)

V = len(word2Ind)
print("Size of vocabulary: ", V)

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
Size of vocabulary:  5


## Forward Propagation

<div style="width:image width px; font-size:100%; text-align:center;"><img src='cbow_model_dimensions_single_input.png' alt="alternate text" width="width" height="height" style="width:839;height:349;" /> Figure 2 </div>
Set $N$ equal to 3. Remember that $N$ is a hyperparameter of the CBOW model that represents the size of the word embedding vectors, as well as the size of the hidden layer.

In [3]:
N = 3

### Initialization of Weights and Biases
- Initialize the weights and biases
- Here prepopulated matrices and vectors are presented

In [4]:
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [5]:
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'Size of W1: {W1.shape} (NxV)')
print(f'Size of b1: {b1.shape} (Nx1)')
print(f'Size of W2: {W2.shape} (VxN)')
print(f'Size of b2: {b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
Size of W1: (3, 5) (NxV)
Size of b1: (3, 1) (Nx1)
Size of W2: (5, 3) (VxN)
Size of b2: (5, 1) (Vx1)


#### Training Example

In [13]:
def word_to_one_hot_vector(word, word2Ind, V):
    # BEGIN your code here
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    # END your code here
    return one_hot_vector

def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

def context_words_to_vector(context_words, word2Ind, V):
    # BEGIN your code here
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    # END your code here
    return context_words_vectors

def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

training_examples = get_training_example(words, 2, word2Ind, V)

In [14]:
x_array, y_array = next(training_examples)

In [15]:
x_array

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [16]:
y_array

array([0., 0., 1., 0., 0.])

In [17]:
x = x_array.copy()
x.shape = (V, 1)
print('x')
print(x)
print()

y = y_array.copy()
y.shape = (V, 1)
print('y')
print(y)

x
[[0.25]
 [0.25]
 [0.  ]
 [0.5 ]
 [0.  ]]

y
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


#### Values of Hidden Layer

$$z_1 = W_1x + b_1$$
$$h = ReLU\left( z_1 \right)$$

In [24]:
z1 = np.dot(W1, x) + b1
print("z1")
print(z1)
h = relu(z1)
print("h")
print(h)

z1
[[ 0.36483875]
 [ 0.63710329]
 [-0.3236647 ]]
h
[[0.36483875]
 [0.63710329]
 [0.        ]]


#### Values of Output Layer
$$z_2 = W_2h + b_2$$
$$\hat y = softmax\left( z_2 \right)$$

In [25]:
z2 = np.dot(W2, h) + b2
print("z2")
print(z2)
y_hat = softmax(z2)
print("y_hat")
print(y_hat)

z2
[[-0.31973737]
 [-0.28125477]
 [-0.09838369]
 [-0.33512159]
 [-0.19919612]]
y_hat
[[0.18519074]
 [0.19245626]
 [0.23107446]
 [0.18236353]
 [0.20891502]]


### Cross Entropy Loss
$$J = - \sum\limits_{k=1}^V y_k \log\hat y_k $$ 

In [26]:
def cross_entropy_loss(y_pred, y_actual):
    loss = np.sum(-np.log(y_hat)*y)
    return loss

In [27]:
cross_entropy_loss(y_hat, y)

1.4650152923611106

## Backpropagation
$$\frac{\partial J}{\partial W_1} = ReLU\left( W^T_2(\hat y - y)\right)x^T \tag{1}$$
$$\frac{\partial J}{\partial W_2} = (\hat y - y)h^T \tag{2}$$
$$\frac{\partial J}{\partial b_1} = ReLU\left(W^T_2(\hat y - y)\right) \tag{3}$$
$$\frac{\partial J}{\partial b_2} = \hat y - y \tag{4}$$

**Calculate partial derivative of the loss function wrt $b_2$**
$$\frac{\partial J}{\partial b_2} = \hat y - y \tag{4}$$

In [38]:
grad_b2 = y_hat - y
print("grad_b2")
print(grad_b2)

grad_b2
[[ 0.18519074]
 [ 0.19245626]
 [-0.76892554]
 [ 0.18236353]
 [ 0.20891502]]


**Calculate partial derivative of loss function wrt $W_2$**
$$\frac{\partial J}{\partial W_2} = (\hat y - y)h^T \tag{2}$$

In [39]:
grad_W2 = np.dot(y_hat - y, h.T)
print("grad_W2")
print(grad_W2)

grad_W2
[[ 0.06756476  0.11798563  0.        ]
 [ 0.0702155   0.12261452  0.        ]
 [-0.28053384 -0.48988499  0.        ]
 [ 0.06653328  0.1161844   0.        ]
 [ 0.07622029  0.13310045  0.        ]]


**Calculate partial derivative of loss function wrt $b_1$**
$$\frac{\partial J}{\partial b_1} = ReLU\left(W^T_2(\hat y - y)\right) \tag{3}$$


In [40]:
grad_b1 = relu(np.dot(W2.T, y_hat - y))
print("grad_b1")
print(grad_b1)

grad_b1
[[0.        ]
 [0.        ]
 [0.17045858]]


**Calculate partial derivative of loss function wrt $W_1$**
$$\frac{\partial J}{\partial W_1} = ReLU\left( W^T_2(\hat y - y)\right)x^T \tag{1}$$

In [41]:
grad_W1 = np.dot(relu(np.dot(W2.T, y_hat - y)), x.T)
print("grad_W1")
print(grad_W1)

grad_W1
[[0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.04261464 0.04261464 0.         0.08522929 0.        ]]


In [42]:
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of grad_W1: {grad_W1.shape} (NxV)')
print(f'size of grad_b1: {grad_b1.shape} (Nx1)')
print(f'size of grad_W2: {grad_W1.shape} (VxN)')
print(f'size of grad_b2: {grad_b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of grad_W1: (3, 5) (NxV)
size of grad_b1: (3, 1) (Nx1)
size of grad_W2: (3, 5) (VxN)
size of grad_b2: (5, 1) (Vx1)


## Gradient Descent
During gradient descent phase, update the weights and biases by subtracting $\alpha$ times the gradient from the original matrtices and vectors

$$W_1 := W1 - \alpha\frac{\partial J}{\partial W_1} \tag{5}$$
$$W_2 := W1 - \alpha\frac{\partial J}{\partial W_2} \tag{5}$$
$$b_1 := b1 - \alpha\frac{\partial J}{\partial b_1} \tag{5}$$
$$b_2 := b2 - \alpha\frac{\partial J}{\partial b_2} \tag{5}$$

In [43]:
alpha = 0.3
W1_new = W1 - alpha * grad_W1
W2_new = W2 - alpha * grad_W2
b1_new = b1 - alpha * grad_b1
b2_new = b2 - alpha * grad_b2

### Compare Values
#### $W_1, W_{1_{new}}$

In [45]:
print("W1")
print(W1)
print("W1 New")
print(W1_new)

W1
[[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.26637602 -0.23846886 -0.37770863 -0.11399446  0.34008124]]
W1 New
[[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.25359163 -0.25125325 -0.37770863 -0.13956325  0.34008124]]


#### $W_2, W_{2_{new}}$

In [46]:
print("W2")
print(W2)
print("W2 New")
print(W2_new)

W2
[[-0.22182064 -0.43008631  0.13310965]
 [ 0.08476603  0.08123194  0.1772054 ]
 [ 0.1871551  -0.06107263 -0.1790735 ]
 [ 0.07055222 -0.02015138  0.36107434]
 [ 0.33480474 -0.39423389 -0.43959196]]
W2 New
[[-0.24209007 -0.465482    0.13310965]
 [ 0.06370138  0.04444758  0.1772054 ]
 [ 0.27131525  0.08589287 -0.1790735 ]
 [ 0.05059224 -0.0550067   0.36107434]
 [ 0.31193865 -0.43416402 -0.43959196]]


#### $b_1, b_{1_{new}}$

In [49]:
print("b1")
print(b1)
print("b1 New")
print(b1_new)

b1
[[ 0.09688219]
 [ 0.29239497]
 [-0.27364426]]
b1 New
[[ 0.09688219]
 [ 0.29239497]
 [-0.32478183]]


#### $b_2, b_{2_{new}}$

In [51]:
print("b2")
print(b2)
print("b2 New")
print(b2_new)

b2
[[ 0.0352008 ]
 [-0.36393384]
 [-0.12775555]
 [-0.34802326]
 [-0.07017815]]
b2 New
[[-0.02035642]
 [-0.42167072]
 [ 0.10292211]
 [-0.40273232]
 [-0.13285266]]
