<img style="float: left;" src='Figures/alinco.png' />

# Modulo II: Vectores Palabra (Word Embeddings) y CBOW 02


Veremos cómo preparar los datos para aplicar:


- Propagación hacia adelante (Forward propagation).

- Pérdida de entropía cruzada (cross-entropy loss).

- Retropropagación (Backpropagation).

- Descenso de gradiente (gradient descent).


In [2]:
import numpy as np

## Forward propagation


<div style="font-size:100%; text-align:center;"><img src='Figures/cbow_model_dimensions_single_input.png' alt="Dimensiones del modelo CBOW con una sola entrada" style="width:839px;height:349px;" /> Figure 2 </div>

In [3]:
N = 3  # size of the hidden layer, i.e. the dimension of each word embedding
V = 5  # vocabulary size (length of the one-hot vectors)


# Inicializando los pesos de la red


### Inicialización de pesos y bías

In [4]:
# Fixed initial parameters (hard-coded so every run reproduces the same numbers).
# W1: input -> hidden weights, shape (N, V) = (3, 5)
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])


# W2: hidden -> output weights, shape (V, N) = (5, 3)
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])


# b1: hidden-layer bias, column vector of shape (N, 1) = (3, 1)
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])


# b2: output-layer bias, column vector of shape (V, 1) = (5, 1)
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

Agregar las funciones vistas en los notebooks pasados

In [5]:
def get_dict(data):
    """Build word <-> index lookup tables for a tokenised corpus.

    Args:
        data: iterable of hashable, mutually comparable tokens (e.g. a list of
            words). Duplicates are collapsed.

    Returns:
        A tuple ``(word2Ind, Ind2word)``. ``word2Ind`` maps every distinct token
        to its position in the sorted vocabulary; ``Ind2word`` is the inverse
        mapping.
    """
    # Sorting makes the index assignment deterministic regardless of set order.
    vocabulary = sorted(set(data))
    word2Ind = {word: idx for idx, word in enumerate(vocabulary)}
    Ind2word = dict(enumerate(vocabulary))
    return word2Ind, Ind2word

def get_windows(words, C):
    """Slide a window of half-width ``C`` over ``words``.

    Yields one ``(context_words, center_word)`` pair per position that has ``C``
    full neighbours on each side; ``context_words`` is the ``C`` items before
    the centre followed by the ``C`` items after it.
    """
    for center in range(C, len(words) - C):
        left_context = words[center - C:center]
        right_context = words[center + 1:center + C + 1]
        yield left_context + right_context, words[center]

def word_to_one_hot_vector(word, word2Ind, V):
    """Encode ``word`` as a length-``V`` vector of zeros with a 1 at its index.

    The index is looked up in ``word2Ind``; an unknown word raises ``KeyError``.
    """
    encoding = np.zeros(V)
    position = word2Ind[word]
    encoding[position] = 1
    return encoding

def context_words_to_vector(context_words, word2Ind, V):
    """Average the one-hot encodings of ``context_words`` into a single vector.

    Each word is encoded with ``word_to_one_hot_vector`` and the encodings are
    averaged element-wise (mean over axis 0).
    """
    one_hot_rows = []
    for context_word in context_words:
        one_hot_rows.append(word_to_one_hot_vector(context_word, word2Ind, V))
    return np.mean(one_hot_rows, axis=0)

def get_training_example(words, C, word2Ind, V):
    """Generate ``(x, y)`` training pairs for CBOW from a token sequence.

    For every window produced by ``get_windows(words, C)``, ``x`` is the averaged
    one-hot vector of the context words and ``y`` is the one-hot vector of the
    centre word.
    """
    for context_words, center_word in get_windows(words, C):
        context_vector = context_words_to_vector(context_words, word2Ind, V)
        center_vector = word_to_one_hot_vector(center_word, word2Ind, V)
        yield context_vector, center_vector


In [7]:
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [8]:

word2Ind, Ind2word = get_dict(words)


In [9]:
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [10]:
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

## Datos de entrenamiento

In [11]:
training_examples = get_training_example(words, 2, word2Ind, V)

In [12]:
training_examples

<generator object get_training_example at 0x7fc9f562b9e0>

In [13]:
x_array, y_array = next(training_examples)

In [14]:
x_array

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [15]:
y_array

array([0., 0., 1., 0., 0.])

In [16]:
x = x_array.copy()

In [17]:
x.reshape(V,1)

array([[0.25],
       [0.25],
       [0.  ],
       [0.5 ],
       [0.  ]])

In [19]:
x.shape=(V,1)
x

array([[0.25],
       [0.25],
       [0.  ],
       [0.5 ],
       [0.  ]])

In [22]:
y = y_array.copy()

In [23]:
y.shape = (V,1)

In [24]:
y

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [25]:
def relu(z):
    """Rectified linear unit: negative entries become 0, the rest are kept.

    Operates on a copy, so the array passed in is left untouched.
    """
    rectified = z.copy()
    negative_entries = rectified < 0
    rectified[negative_entries] = 0
    return rectified

def softmax(z):
    """Numerically stable softmax over all the entries of ``z``.

    Args:
        z: array-like of scores (in this notebook, a single column vector).

    Returns:
        An array with the same shape as ``z`` whose entries are positive and sum
        to 1. The normalisation runs over every entry of ``z``, not per column.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(z)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing to inf (which would yield inf/inf = nan).
    e_z = np.exp(z - np.max(z))
    return e_z / np.sum(e_z)

## Forward

### Valores de la capa oculta

\begin{align}
\mathbf{z_1} &= \mathbf{W_1}\mathbf{x} + \mathbf{b_1} \\
\mathbf{h} &= \mathrm{ReLU}(\mathbf{z_1})
\end{align}

In [26]:
z1 = np.dot(W1, x) + b1

In [27]:
z1

array([[ 0.36483875],
       [ 0.63710329],
       [-0.3236647 ]])

In [29]:
h = relu(z1)
h

array([[0.36483875],
       [0.63710329],
       [0.        ]])

### Valores de la capa de salida

\begin{align}
\mathbf{z_2} &= \mathbf{W_2}\mathbf{h} + \mathbf{b_2} \\
\mathbf{\hat{y}} &= \mathrm{softmax}(\mathbf{z_2})
\end{align}

In [30]:
z2 = np.dot(W2, h) + b2

In [31]:
y_hat = softmax(z2)

In [32]:
y_hat

array([[0.18519074],
       [0.19245626],
       [0.23107446],
       [0.18236353],
       [0.20891502]])

2

In [33]:
y

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [36]:
Ind2word[y_hat.argmax()]

'happy'

### Cross-entropy loss

$$ J = -\sum\limits_{k=1}^{V} y_k \log{\hat{y}_k}$$

In [37]:
def cross_entropy_loss(y_predicted, y_actual):
    """Cross-entropy J = -sum_k y_k * log(y_hat_k).

    Args:
        y_predicted: array-like of predicted probabilities (y_hat).
        y_actual: array-like of target values (one-hot in this notebook),
            broadcastable against ``y_predicted``.

    Returns:
        The scalar loss. Classes whose target is 0 contribute exactly 0, even
        when their predicted probability is 0 (the convention 0 * log 0 = 0);
        a predicted probability of 0 on a class with non-zero target gives inf.
    """
    y_predicted = np.asarray(y_predicted)
    y_actual = np.asarray(y_actual)
    # log(0) = -inf is expected here; the resulting inf * 0 = nan terms are
    # discarded by the np.where below, so silence the warnings locally.
    with np.errstate(divide="ignore", invalid="ignore"):
        terms = -np.log(y_predicted) * y_actual
    return np.sum(np.where(y_actual != 0, terms, 0.0))

In [38]:
cross_entropy_loss(y_hat, y)

1.4650152923611106

### Backpropagation

Las fórmulas que necesitamos para implementar el backpropagation son:


\begin{align}
 \frac{\partial J}{\partial \mathbf{W_1}} &= \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right )\mathbf{x}^\top \tag{7}\\
 \frac{\partial J}{\partial \mathbf{W_2}} &= (\mathbf{\hat{y}} - \mathbf{y})\mathbf{h^\top} \tag{8}\\
 \frac{\partial J}{\partial \mathbf{b_1}} &= \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right ) \tag{9}\\
 \frac{\partial J}{\partial \mathbf{b_2}} &= \mathbf{\hat{y}} - \mathbf{y} \tag{10}
\end{align}


Calcule la derivada parcial de la función de pérdida con respecto a $ \mathbf {b_2} $ y almacene el resultado en `grad_b2`.


$$\frac{\partial J}{\partial \mathbf{b_2}} = \mathbf{\hat{y}} - \mathbf{y} \tag{10}$$

In [46]:
grad_b2 = y_hat - y

In [47]:
grad_b2

array([[ 0.18519074],
       [ 0.19245626],
       [-0.76892554],
       [ 0.18236353],
       [ 0.20891502]])

Calcule la derivada parcial de la función de pérdida con respecto a $\mathbf{W_2}$ y guarde el resultado en `grad_w2`.

$$\frac{\partial J}{\partial \mathbf{W_2}} = (\mathbf{\hat{y}} - \mathbf{y})\mathbf{h^\top} \tag{8}$$


In [41]:
grad_w2 = np.dot((y_hat - y), h.T)
grad_w2

array([[ 0.06756476,  0.11798563,  0.        ],
       [ 0.0702155 ,  0.12261452,  0.        ],
       [-0.28053384, -0.48988499, -0.        ],
       [ 0.06653328,  0.1161844 ,  0.        ],
       [ 0.07622029,  0.13310045,  0.        ]])

**Ahora, calcule la derivada parcial con respecto a $\mathbf{b_1}$ y guarde el resultado en `grad_b1`.**

$$\frac{\partial J}{\partial \mathbf{b_1}} = \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right ) \tag{9}$$

In [42]:
grad_b1 = relu(np.dot(W2.T, (y_hat - y)))
grad_b1





array([[0.        ],
       [0.        ],
       [0.17045858]])

**Finalmente, calcule la derivada parcial de la función de pérdida con respecto a $\mathbf{W_1}$ y guarde el resultado en `grad_w1`.**

$$\frac{\partial J}{\partial \mathbf{W_1}} = \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right )\mathbf{x}^\top \tag{7}$$

In [43]:
# Formula (7): dJ/dW1 = ReLU(W2^T (y_hat - y)) x^T.
# The error term (y_hat - y) is written out, as in the grad_w2 and grad_b1
# cells; the previous `grab_b2` was an undefined name and raised NameError.
# NOTE(review): the exact derivative of h = relu(z1) would mask with (z1 > 0)
# rather than apply ReLU to the back-propagated error; formula (7) is kept so
# the expected output below still matches - confirm which one is intended.
grad_w1 = np.dot(relu(np.dot(W2.T, (y_hat - y))), x.T)
grad_w1






array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.04261464, 0.04261464, 0.        , 0.08522929, 0.        ]])

Resultado esperado

    array([[0.        , 0.        , 0.        , 0.        , 0.        ],
           [0.        , 0.        , 0.        , 0.        , 0.        ],
           [0.04261464, 0.04261464, 0.        , 0.08522929, 0.        ]])

## Descenso de gradiente

Durante la fase de descenso de gradiente se actualizan los pesos y los sesgos (bias) restando a las matrices y vectores originales $\alpha$ veces su gradiente, utilizando las siguientes fórmulas.


\begin{align}
 \mathbf{W_1} &:= \mathbf{W_1} - \alpha \frac{\partial J}{\partial \mathbf{W_1}} \tag{11}\\
 \mathbf{W_2} &:= \mathbf{W_2} - \alpha \frac{\partial J}{\partial \mathbf{W_2}} \tag{12}\\
 \mathbf{b_1} &:= \mathbf{b_1} - \alpha \frac{\partial J}{\partial \mathbf{b_1}} \tag{13}\\
 \mathbf{b_2} &:= \mathbf{b_2} - \alpha \frac{\partial J}{\partial \mathbf{b_2}} \tag{14}\\
\end{align}


In [44]:
alpha = 0.03


**Ahora calcule los nuevos valores de $\mathbf{W_1}$ (en `w1_new`), $\mathbf{W_2}$ (en `w2_new`), $\mathbf{b_1}$ (en `b1_new`) y $\mathbf{b_2}$ (en `b2_new`).**

\begin{align}
 \mathbf{W_2} &:= \mathbf{W_2} - \alpha \frac{\partial J}{\partial \mathbf{W_2}} \tag{12}\\
 \mathbf{b_1} &:= \mathbf{b_1} - \alpha \frac{\partial J}{\partial \mathbf{b_1}} \tag{13}\\
 \mathbf{b_2} &:= \mathbf{b_2} - \alpha \frac{\partial J}{\partial \mathbf{b_2}} \tag{14}\\
\end{align}

In [48]:
# Weight update: one gradient-descent step with learning rate alpha
# (formulas 11-14). The results go into new names, so W1, W2, b1 and b2 keep
# their initial values.
w1_new = W1 - alpha*grad_w1
w2_new = W2 - alpha*grad_w2
b1_new = b1 - alpha*grad_b1
b2_new = b2 - alpha*grad_b2


In [50]:
W1

array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
       [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
       [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

In [51]:
w1_new

array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
       [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
       [ 0.25359163, -0.25125325, -0.37770863, -0.13956325,  0.34008124]])

## Opción 1: extraer los embeddings de W1

In [53]:
w1_new

array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
       [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
       [ 0.25359163, -0.25125325, -0.37770863, -0.13956325,  0.34008124]])

In [55]:
for i in range(V):
    print(Ind2word[i])

am
because
happy
i
learning


In [57]:
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [None]:
# Option 1: w1_new has shape (N, V), so the embedding of the word with index i
# is column i. (`w1_new[]` was a syntax error.)
for word, idx in word2Ind.items():
    print(f'{word} ----> {w1_new[:, idx]}')

## Opción 2: extraer los embeddings de W2

In [58]:
w2_new

array([[-0.24209007, -0.465482  ,  0.13310965],
       [ 0.06370138,  0.04444758,  0.1772054 ],
       [ 0.27131525,  0.08589287, -0.1790735 ],
       [ 0.05059224, -0.0550067 ,  0.36107434],
       [ 0.31193865, -0.43416402, -0.43959196]])

## Opción 3: extraer los embeddings de W1 y W2

In [None]:
# w1_new is (N, V) and w2_new is (V, N): the second must be transposed before
# averaging, otherwise the addition raises a broadcasting ValueError.
# Column i of W3 is the embedding of word i. The updated weights are averaged,
# matching what options 1 and 2 display.
W3 = (w1_new + w2_new.T) / 2


In [None]:
def gradient_descent(xtrain, ytrain, N, V, numiter, alpha, seed=None):
    """Train the one-hidden-layer CBOW network with batch gradient descent.

    The model is z1 = W1 x + b1, h = ReLU(z1), z2 = W2 h + b2,
    y_hat = softmax(z2), with the cross-entropy loss averaged over the batch.

    Args:
        xtrain: context vectors, shape (V, m) with one example per column
            (a single example of length V is also accepted).
        ytrain: one-hot targets with the same shape as ``xtrain``.
        N: size of the hidden layer (embedding dimension).
        V: vocabulary size.
        numiter: number of gradient-descent iterations.
        alpha: learning rate.
        seed: optional seed for the random initialisation, for reproducibility.

    Returns:
        The trained parameters ``(W1, W2, b1, b2)`` with shapes
        (N, V), (V, N), (N, 1) and (V, 1).

    Raises:
        ValueError: if ``xtrain`` and ``ytrain`` do not have matching shapes.
    """
    xtrain = np.asarray(xtrain, dtype=float).reshape(V, -1)
    ytrain = np.asarray(ytrain, dtype=float).reshape(V, -1)
    if xtrain.shape != ytrain.shape:
        raise ValueError("xtrain and ytrain must have the same shape")
    m = xtrain.shape[1]

    # Initialise the model with small values centred on zero.
    rng = np.random.default_rng(seed)
    W1 = rng.random((N, V)) - 0.5
    W2 = rng.random((V, N)) - 0.5
    b1 = rng.random((N, 1)) - 0.5
    b2 = rng.random((V, 1)) - 0.5

    for _ in range(numiter):
        # Forward pass.
        z1 = np.dot(W1, xtrain) + b1
        h = np.maximum(z1, 0)
        z2 = np.dot(W2, h) + b2
        # Column-wise softmax; subtracting the max avoids overflow in exp.
        exp_z2 = np.exp(z2 - z2.max(axis=0, keepdims=True))
        y_hat = exp_z2 / exp_z2.sum(axis=0, keepdims=True)

        # Backward pass (gradients of the batch-averaged loss).
        delta2 = (y_hat - ytrain) / m
        # Exact derivative of ReLU: the error only flows through hidden units
        # that were active in the forward pass (z1 > 0).
        delta1 = np.dot(W2.T, delta2) * (z1 > 0)
        grad_w2 = np.dot(delta2, h.T)
        grad_b2 = delta2.sum(axis=1, keepdims=True)
        grad_w1 = np.dot(delta1, xtrain.T)
        grad_b1 = delta1.sum(axis=1, keepdims=True)

        # Update the weights.
        W1 = W1 - alpha * grad_w1
        W2 = W2 - alpha * grad_w2
        b1 = b1 - alpha * grad_b1
        b2 = b2 - alpha * grad_b2

    return W1, W2, b1, b2
        
        

        
        
        
        