In [28]:
import numpy as np

## Theory

### Formula
$$f_t = \sigma(W_f[h_{t-1},x_t]+b_f) $$
$$i_t = \sigma(W_i[h_{t-1},x_t]+b_i) $$
$$\tilde{C}_t = \tanh(W_C[h_{t-1},x_t]+b_C)$$
$$C_t = f_t * C_{t-1} + i_t*\tilde{C}_t$$
$$o_t = \sigma(W_o[h_{t-1},x_t]+b_o) $$
$$h_t = o_t * \tanh(C_t)$$
$$Loss: L = \frac{1}{2}(h_t-y)^2$$

### Backward
$$\frac{d_{L}}{d_{h_t}} = h_t - y$$
$$\frac{d_{L}}{d_{o_t}} = \frac{d_{L}}{d_{h_t}}tanh(C_t)$$
$$\frac{d_{L}}{d_{C_t}} =\frac{d_{L}}{d_{h_t}} o_t * (1-tanh(C_t)^2)$$
$$\frac{d_{L}}{d_{f_t}} = \frac{d_{L}}{d_{C_t}} \frac{d_{C_t}}{d_{f_t}} = \frac{d_{L}}{d_{C_t}} C_{t-1}$$
$$\frac{d_{L}}{d_{i_t}} = \frac{d_{L}}{d_{C_t}} \frac{d_{C_t}}{d_{i_t}} =  \frac{d_{L}}{d_{C_t}} \tilde{C}_t$$
$$\frac{d_L}{d_{\tilde{C}_t}} = \frac{d_{L}}{d_{C_t}} \frac{d_{C_t}}{d_{\tilde{C}_t}} = \frac{d_{L}}{d_{C_t}} i_t $$
$$\frac{d_{L}}{d_{W_o}} = \frac{d_{L}}{d_{o_t}} \frac{d_{o_t}}{d_{W_o}} = \frac{d_{L}}{d_{o_t}} o_t (1-o_t) [h_{t-1},x_t]$$
$$\frac{d_{L}}{d_{W_f}} = \frac{d_{L}}{d_{f_t}} \frac{d_{f_t}}{d_{W_f}} = \frac{d_{L}}{d_{f_t}} f_t (1-f_t) [h_{t-1},x_t]$$
$$\frac{d_{L}}{d_{W_i}} = \frac{d_{L}}{d_{i_t}} \frac{d_{i_t}}{d_{W_i}} = \frac{d_{L}}{d_{i_t}} i_t (1-i_t) [h_{t-1},x_t]$$
$$\frac{d_{L}}{d_{W_C}} = \frac{d_{L}}{d_{\tilde{C}_t}} \frac{d_{\tilde{C}_t}}{d_{W_C}} = \frac{d_{L}}{d_{\tilde{C}_t}} (1-\tilde{C}_t^2) [h_{t-1},x_t]$$

$$\frac{d_{L}}{d_{b_o}} = \frac{d_{L}}{d_{o_t}} \frac{d_{o_t}}{d_{b_o}} = \frac{d_{L}}{d_{o_t}} o_t (1-o_t)$$
$$\frac{d_{L}}{d_{b_f}} = \frac{d_{L}}{d_{f_t}} \frac{d_{f_t}}{d_{b_f}} = \frac{d_{L}}{d_{f_t}} f_t (1-f_t)$$
$$\frac{d_{L}}{d_{b_i}} = \frac{d_{L}}{d_{i_t}} \frac{d_{i_t}}{d_{b_i}} = \frac{d_{L}}{d_{i_t}} i_t (1-i_t)$$
$$\frac{d_{L}}{d_{b_C}} = \frac{d_{L}}{d_{\tilde{C}_t}} \frac{d_{\tilde{C}_t}}{d_{b_C}} = \frac{d_{L}}{d_{\tilde{C}_t}} (1-\tilde{C}_t^2)$$




### Shape:
#### N_i: number of input features
#### N_o: number of output features
#### N_t: number of time steps
$$x:  (N_t,N_i).$$
$$y: (N_t,N_o).$$
$$h: (N_t,N_o).$$
$$C: (N_t,N_o).$$
$$f,i,o: (N_t,N_o).$$
$$W_f,W_i,W_o,W_C: (N_i+N_o,N_o)$$
$$b_f,b_i,b_o,b_c: (1,N_o)$$


## Implementation

In [29]:
def sigmoid(x):
    """Logistic sigmoid, applied element-wise.

    Computes ``1 / (1 + exp(-x))``.

    Args:
        x: A scalar or NumPy array of pre-activation values.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``; values lie
        between 0 and 1 and increase with ``x``.
    """
    # The exponent must be negated: 1 / (1 + e**x) is sigmoid(-x), which
    # decreases with x and would invert every gate that uses this function.
    return 1 / (1 + np.exp(-x))

In [45]:
def forward(X,C_t,h_t,W_f,W_i,W_o,W_C,b_f,b_i,b_o,b_C):
    """Run one LSTM cell forward step.

    Shapes below follow the notebook's "Shape" notes.

    Args:
        X: Input features, expected shape (N_t, N_i).
        C_t: Previous cell state, expected shape (N_t, N_o).
        h_t: Previous hidden state, expected shape (N_t, N_o).
        W_f, W_i, W_o, W_C: Weights of the forget, input, output and
            candidate branches, expected shape (N_i + N_o, N_o).
        b_f, b_i, b_o, b_C: Matching biases, expected shape (1, N_o).

    Returns:
        Tuple ``(C_new, h_new, f_t, i_t, o_t, C_candidate)``: the new cell
        state, the new hidden state, the three gate activations and the
        candidate cell values.
    """
    # Joint input [h, x]; built once and shared by all four affine maps.
    joint_input = np.concatenate([h_t.T, X.T]).T

    forget_gate = sigmoid(joint_input.dot(W_f) + b_f)
    input_gate = sigmoid(joint_input.dot(W_i) + b_i)
    output_gate = sigmoid(joint_input.dot(W_o) + b_o)
    candidate = np.tanh(joint_input.dot(W_C) + b_C)

    # Keep part of the old cell state and add the gated candidate.
    kept = np.multiply(forget_gate, C_t)
    written = np.multiply(input_gate, candidate)
    cell_state = kept + written

    hidden_state = np.multiply(output_gate, np.tanh(cell_state))
    return cell_state, hidden_state, forget_gate, input_gate, output_gate, candidate

In [46]:
def backward(X,y,C_new,h_new,f_t,i_t,o_t,C_candidate,C_t,h_t,W_f,W_i,W_o,W_C,b_f,b_i,b_o,b_C,lr):
    """Apply one gradient-descent update to the LSTM parameters.

    Gradients are taken with dL/dh = ``h_new - y``. ``C_t`` and ``h_t`` are
    treated as constants: no gradient is propagated back to them.

    Args:
        X: Input features used in the forward step, expected shape (N_t, N_i).
        y: Targets, same shape as ``h_new``.
        C_new, h_new: New cell and hidden state from the forward step.
        f_t, i_t, o_t: Forget, input and output gate activations.
        C_candidate: tanh candidate cell values from the forward step.
        C_t, h_t: Cell and hidden state that were fed into the forward step.
        W_f, W_i, W_o, W_C: Current weights, expected shape (N_i + N_o, N_o).
        b_f, b_i, b_o, b_C: Current biases, expected shape (1, N_o).
        lr: Learning rate (step size).

    Returns:
        Tuple ``(W_f, W_i, W_o, W_C, b_f, b_i, b_o, b_C)`` of updated
        parameters. The inputs are not modified in place.
    """
    # Transposed joint input [h, x]^T, shared by every weight gradient.
    joint_input_T = np.concatenate([h_t.T, X.T])

    # Gradients with respect to the state and the gate activations.
    tanh_C = np.tanh(C_new)
    dL_dh = h_new - y
    dL_do = np.multiply(dL_dh, tanh_C)
    dL_dC = np.multiply(dL_dh, np.multiply(o_t, 1 - tanh_C**2))
    dL_df = np.multiply(dL_dC, C_t)
    dL_di = np.multiply(dL_dC, C_candidate)
    dL_dCc = np.multiply(dL_dC, i_t)

    # Gradients with respect to each branch's pre-activation.
    # The three gates are sigmoids: derivative s * (1 - s).
    delta_o = np.multiply(dL_do, np.multiply(o_t, 1 - o_t))
    delta_i = np.multiply(dL_di, np.multiply(i_t, 1 - i_t))
    delta_f = np.multiply(dL_df, np.multiply(f_t, 1 - f_t))
    # The candidate is a tanh output, so its derivative is 1 - c**2,
    # not the sigmoid form c * (1 - c).
    delta_c = np.multiply(dL_dCc, 1 - C_candidate**2)

    dL_dWo = joint_input_T.dot(delta_o)
    dL_dWi = joint_input_T.dot(delta_i)
    dL_dWf = joint_input_T.dot(delta_f)
    dL_dWc = joint_input_T.dot(delta_c)

    # NOTE(review): bias gradients are averaged over rows, whereas the weight
    # gradients above are summed over rows by the dot product. Kept as in the
    # original so the effective bias step size is unchanged -- confirm intent.
    dL_dbo = np.mean(delta_o, axis=0)
    dL_dbi = np.mean(delta_i, axis=0)
    dL_dbf = np.mean(delta_f, axis=0)
    dL_dbc = np.mean(delta_c, axis=0)

    # Update weights and biases (new arrays; the arguments are left untouched).
    W_f = W_f - lr * dL_dWf
    W_i = W_i - lr * dL_dWi
    W_o = W_o - lr * dL_dWo
    W_C = W_C - lr * dL_dWc

    b_f = b_f - lr * dL_dbf
    b_i = b_i - lr * dL_dbi
    b_o = b_o - lr * dL_dbo
    b_C = b_C - lr * dL_dbc

    return W_f,W_i,W_o,W_C,b_f,b_i,b_o,b_C