In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Model - RNN
#### Notation

$$x_j^{(i)[l]\langle t \rangle}$$
- $[l]$: the $l$-th layer ($L$ layers)
- $(i)$: the $i$-th data point ($m$ data points)
- $\langle t \rangle$: the $t$-th timestep ($T$ timesteps)
- $j$: the $j$-th feature ($n$ features)


#### Situations when this RNN will perform better:
- This basic RNN works well enough for some applications, but it suffers from the **vanishing gradient** problem.
- The RNN works best when each output $\hat{y}^{\langle t \rangle}$ can be estimated using "local" context.  
    - "Local" context refers to information that is close to the prediction's time step $t$.
    - More formally, local context refers to inputs $x^{\langle t' \rangle}$ and predictions $\hat{y}^{\langle t \rangle}$ where $t'$ is close to $t$.

## 1. Forward

#### RNN Cell
<img src="./assets/rnn_step_forward.png" width="800"/>

In [2]:
def softmax(Z):
    """
    Column-wise softmax (normalised over axis 0).

    Arguments:
        Z (ndarray (n_y, ...)) : Raw scores; each column along axis 0 is one distribution

    Returns:
        ndarray, same shape as Z : Probabilities that sum to 1 along axis 0
    """
    Z = np.asarray(Z)
    if Z.size == 0:
        # Nothing to normalise; avoid reducing over an empty axis.
        return np.zeros(Z.shape)

    # Softmax is invariant to a per-column shift; subtracting the column max
    # keeps every exponent <= 0 so np.exp cannot overflow to inf (-> NaN).
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    ez = np.exp(shifted)
    return ez / np.sum(ez, axis=0, keepdims=True)

In [3]:
def rnn_cell_forward(xt, a_prev, parameters):
    """
    Run a single RNN time-step: tanh hidden-state update followed by a softmax read-out.

    Arguments:
        xt (ndarray (n_x, m))     : Input slice for timestep "t"
        a_prev (ndarray (n_a, m)) : Hidden state coming from timestep "t-1"
        parameters (dict) :
            Wax (ndarray (n_a, n_x)) : input-to-hidden weights
            Waa (ndarray (n_a, n_a)) : hidden-to-hidden weights
            Wya (ndarray (n_y, n_a)) : hidden-to-output weights
            ba (ndarray (n_a, 1))    : hidden bias
            by (ndarray (n_y, 1))    : output bias
    Returns:
        a_next (ndarray (n_a, m))    : hidden state for timestep "t"
        yt_hat (ndarray (n_y, m))    : softmax prediction for timestep "t"
        cache (tuple)                : (a_next, a_prev, xt, parameters), kept for the backward pass
    """
    # Unpack weights and biases
    W_in = parameters["Wax"]
    W_rec = parameters["Waa"]
    W_out = parameters["Wya"]
    b_hidden = parameters["ba"]
    b_out = parameters["by"]

    # Hidden state: a<t> = tanh(Wax x<t> + Waa a<t-1> + ba)
    pre_activation = np.dot(W_in, xt) + np.dot(W_rec, a_prev) + b_hidden
    a_next = np.tanh(pre_activation)

    # Prediction: y_hat<t> = softmax(Wya a<t> + by)
    scores = np.dot(W_out, a_next) + b_out
    yt_hat = softmax(scores)

    return a_next, yt_hat, (a_next, a_prev, xt, parameters)

#### Forward Pass
<img src="./assets/rnn_forward_sequence_figure3_v3a.png" width="1100"/>

In [4]:
def rnn_forward(X, a0, parameters):
    """
    Unroll the RNN cell over every time-step of the input sequence.

    Arguments:
        X (ndarray (n_x, m, T_x)) : Inputs, time on the last axis
        a0 (ndarray (n_a, m))     : Hidden state fed to the first time-step
        parameters (dict) :
            Wax (ndarray (n_a, n_x)) : input-to-hidden weights
            Waa (ndarray (n_a, n_a)) : hidden-to-hidden weights
            Wya (ndarray (n_y, n_a)) : hidden-to-output weights
            ba (ndarray (n_a, 1))    : hidden bias
            by (ndarray (n_y, 1))    : output bias

    Returns:
        a (ndarray (n_a, m, T_x))      : Hidden state at each time-step
        Y_hat (ndarray (n_y, m, T_x))  : Prediction at each time-step
        caches ((list(cache_t), X))    : per-step caches plus X, for the backward pass
    """
    # Sizes: batch/time from the input, hidden/output from the read-out weights
    _, batch_size, num_steps = X.shape
    out_dim, hidden_dim = parameters["Wya"].shape

    # Pre-allocate the per-step outputs
    hidden_states = np.zeros((hidden_dim, batch_size, num_steps))
    predictions = np.zeros((out_dim, batch_size, num_steps))

    # Work on a copy so the caller's a0 is never aliased by the running state
    hidden = np.copy(a0)
    step_caches = []

    for step in range(num_steps):
        hidden, step_pred, step_cache = rnn_cell_forward(X[:, :, step], hidden, parameters)
        step_caches.append(step_cache)
        hidden_states[:, :, step] = hidden
        predictions[:, :, step] = step_pred

    return hidden_states, predictions, (step_caches, X)

## 2. Backpropagation

#### RNN backward Cell

<img src="./assets/rnn_cell_backward_3a_c.png" width="950"/>

In [5]:
def rnn_cell_backward(dy, da_next, cache):
    """
    Backward pass for a single RNN time-step.

    Arguments:
        dy (ndarray (n_y, m))      : Gradient of loss with respect to the output scores
                                     (the argument of the softmax) at this time-step
        da_next (ndarray (n_a, m)) : Gradient of loss with respect to this step's hidden state.
                                     The output-path term (Wya^T . dy) is NOT added here,
                                     so the caller must include it in da_next if it applies.
        cache (tuple)              : (a_next, a_prev, xt, parameters) from rnn_cell_forward()

    Returns:
        gradients (dict) :
            dxt (ndarray (n_x, m))      : Gradients of input data
            da_prev (ndarray (n_a, m))  : Gradients of previous hidden state
            dWax (ndarray (n_a, n_x))   : Gradients of input-to-hidden Weight matrix
            dWaa (ndarray (n_a, n_a))   : Gradients of hidden-to-hidden Weight matrix
            dba (ndarray (n_a, 1))      : Gradients of hidden bias vector
            dWya (ndarray (n_y, n_a))   : Gradients of hidden-to-output Weight matrix
            dby (ndarray (n_y, 1))      : Gradients of output bias vector
    """
    # Retrieve values from cache
    (a_next, a_prev, xt, parameters) = cache

    # Only the weights that appear in the chain rule below are needed
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # Output layer: scores = Wya . a_next + by
    dWya = np.dot(dy, a_next.T)
    # The bias is shared by every example, so its gradient is summed over the
    # batch axis (same treatment as dba below) and has shape (n_y, 1).
    dby = np.sum(dy, axis=1, keepdims=True)

    # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2 = 1 - a_next^2
    dtanh = (1 - np.power(a_next, 2)) * da_next

    # Input path
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)

    # Recurrent path
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # Hidden bias: summed over the batch axis
    dba = np.sum(dtanh, keepdims=True, axis=1)

    # Store the gradients in a python dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev,
        "dWax": dWax, "dWaa": dWaa, "dba": dba,
        "dWya": dWya, "dby": dby}
    return gradients

#### RNN backward Pass

In [6]:
def rnn_backward(X, Y_hat, da, caches, Y=None):
    """
    Backward pass over the whole sequence (back-propagation through time).

    Arguments:
        X (ndarray (n_x, m, T_x))     : Input data for every time-step (kept for interface
                                        compatibility; sizes are read from the X stored in caches)
        Y_hat (ndarray (n_y, m, T_x)) : Softmax predictions for every time-step
        da (ndarray (n_a, m, T_x))    : Upstream gradients of all hidden states. The recurrent
                                        contribution is accumulated here; the output-path term
                                        (Wya^T . dy) is not added, so include it in da if needed.
        caches (tuple)                : (list of per-step caches, X) from the forward pass (rnn_forward)
        Y (ndarray, optional)         : Targets. Either one-hot with the same shape as Y_hat, or
                                        integer class indices of shape (m, T_x). When omitted, the
                                        output-layer gradients are not computed.

    Returns:
        gradients (dict) :
            dx (ndarray (n_x, m, T_x))      : Gradients of input data from every time-step
            da0 (ndarray (n_a, m))          : Gradient of the initial hidden state
            dWax (ndarray (n_a, n_x))       : Gradient of input-to-hidden Weight matrix
            dWaa (ndarray (n_a, n_a))       : Gradient of hidden-to-hidden Weight matrix
            dba (ndarray (n_a, 1))          : Gradients of hidden bias vector
            dWya (ndarray (n_y, n_a))       : Gradient of hidden-to-output Weight matrix (only if Y is given)
            dby (ndarray (n_y, 1))          : Gradient of output bias vector (only if Y is given)
    """
    # Split the forward-pass record into per-step caches and the stored inputs
    (step_caches, x) = caches

    # Retrieve dim (taken from arrays, not caches[0], so T_x == 0 does not fail)
    n_a, m, T_x = da.shape
    n_x = x.shape[0]
    n_y = Y_hat.shape[0]

    # Zero gradients
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    dWya = np.zeros((n_y, n_a))
    dby = np.zeros((n_y, 1))
    da_prevt = np.zeros((n_a, m))

    if Y is not None:
        Y = np.asarray(Y)

    # loop back over all timesteps
    for t in reversed(range(T_x)):
        # Time is the LAST axis of Y_hat, so slice it there.
        yt_hat = Y_hat[:, :, t]

        # Gradient w.r.t. the output scores, assuming a cross-entropy loss on
        # the softmax output: dy = y_hat - y.
        if Y is None:
            dyt = np.zeros_like(yt_hat)
        elif Y.ndim == 3:
            dyt = yt_hat - Y[:, :, t]
        else:
            dyt = np.copy(yt_hat)
            dyt[Y[:, t].astype(int), np.arange(m)] -= 1

        # Compute gradient timestep t (upstream + recurrent contribution)
        gradients = rnn_cell_backward(dyt, da[:, :, t] + da_prevt, step_caches[t])

        # Update gradient at timestep t
        dx[:, :, t] = gradients["dxt"]
        da_prevt = gradients["da_prev"]

        # Parameters are shared across time, so their gradients add up
        dWax += gradients["dWax"]
        dWaa += gradients["dWaa"]
        dba += gradients["dba"]
        dWya += gradients["dWya"]
        # Reduce over the batch axis here so dby is always (n_y, 1)
        dby += np.sum(dyt, axis=1, keepdims=True)

    # What flows out of the first step is the gradient of the initial state
    da0 = da_prevt

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    if Y is not None:
        gradients["dWya"] = dWya
        gradients["dby"] = dby
    return gradients