In [1]:
import numpy as np


In [2]:
def softmax(x):
    e_x = np.exp(x-np.max(x))
    return e_x / e_x.sum(axis=0)

In [8]:
#from utils import * 


def rnn_cell_forward(x_t, s_prev, parameters):
    
    U = parameters["U"]
    W = parameters["W"]
    V = parameters["V"]
    ba = parameters["ba"]
    by = parameters["by"]
    
    s_next = np.tanh(np.dot(U, x_t) + np.dot(W, s_prev) + ba)
    
    out_pred = softmax(np.dot(V, s_next) + by)
    
    cache = (s_next, s_prev, x_t, parameters)
    
    return s_next, out_pred, cache 



def rnn_forward(x, s0, parameters):
    # forward propagations for all cells
    
    caches = []
    
    m, _, T =x.shape
    
    m, n = parameters["V"].shape
    
    s_next = s0
    
    s = np.zeros((n, 1, T))
    
    y = np.zeros((m, 1, T))
    
    for t in range(T):
        
        s_next, out_pred, cache = rnn_cell_forward(x[:, :, t], s_next, parameters)
        
        s[:, :, t] = s_next
        
        y[:, :, t] = out_pred
        
        caches.append(cache)
    
    return s, y, caches
    
    
def rnn_cell_backward(ds_next, cache):
    
    (s_next, s_prev, x_t, parameters) = cache
    print(type(parameters))
    
    U = parameters["U"]
    W = parameters["W"]
    V = parameters["V"]
    ba = parameters["ba"]
    by = parameters["by"]
    
    dtanh = (1 - s_next ** 2) * ds_next
    
    dx_t = np.dot(U.T, dtanh)
    dU = np.dot(dtanh, x_t.T)
    
    ds_prev = np.dot(W.T, dtanh)
    dW = np.dot(dtanh, s_prev.T)
    
    dba = np.sum(dtanh, axis=1, keepdims=1)
    
    gradients = {"dx_t": dx_t, "ds_prev": ds_prev, "dU": dU, "dW": dW, "dba": dba}
    
    return gradients 

def rnn_backward(ds, caches):
    
    (s1, s0, x_1, parameters) = caches[0]
    
    n, _, T = ds.shape
    
    m, _ = x_1.shape
    
    dx = np.zeros((m, 1, T))
    dU = np.zeros((n, m))
    dW = np.zeros((n, n))
    dba = np.zeros((n, 1))
    ds0 = np.zeros((n, 1))
    ds_prevt = np.zeros((n, 1))
    
    for t in reversed(range(T)):
        
        gradients = rnn_cell_backward(ds[:, :, t] + ds_prevt, caches[t])
        
        ds_prevt = gradients["ds_prev"]
        
        dU += gradients["dU"]
        
        dW += gradients["dW"]
        
        dba += gradients["dba"]
        
        dx_t = gradients["dx_t"]
        
        dx[:, :, t] = dx_t
        
    gradients = {"dU": dU, "dW": dW, "dba": dba, "dx": dx}
    
    return gradients
    
    

if __name__ == '__main__':
    np.random.seed(1)
    
    x = np.random.randn(3,1,4)
    s0 = np.random.randn(5,1)
    
    U = np.random.randn(5,3)
    W = np.random.randn(5,5)
    
    V = np.random.randn(3,5)
    
    ba = np.random.randn(5,1)
    by = np.random.randn(3,1)
    
    parameters = {"U": U, "W": W, "V": V, "ba": ba, "by": by}
    
    s, y, caches = rnn_forward(x, s0, parameters)
    
    ds = np.random.randn(5,1,4)
    
    gradients = rnn_backward(ds, caches)
    
    print(gradients)
    
    print("s =", s)
    print("s.shape =", s.shape)
    print("y =", y)
    print("y.shape =", y.shape)
    
    

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
{'dU': array([[-1.36113821, -0.65922426, -1.00375682],
       [ 1.66048254,  0.69485495,  0.30264786],
       [-0.62246348, -2.17785134, -0.17802186],
       [ 0.14405702,  0.83283752, -0.05638286],
       [-0.08526547, -0.30185192, -0.33629658]]), 'dW': array([[ 0.25963841,  0.45162921, -0.60950493,  0.60136583, -0.17012016],
       [-0.4179251 , -0.3354963 ,  1.30223714, -1.26953888, -0.24650054],
       [-0.75213045,  0.72496527,  0.97873189, -1.02299983, -0.57556053],
       [ 0.29560042, -0.18662077, -0.30101974,  0.31097946,  0.15793874],
       [-0.1372232 ,  0.19654372,  0.3649627 , -0.36754107, -0.24378311]]), 'dba': array([[-0.55728284],
       [ 1.16366439],
       [ 1.0416666 ],
       [-0.30953627],
       [ 0.35730405]]), 'dx': array([[[-0.51329353,  0.61956517,  0.0860038 , -0.23284264]],

       [[ 1.13179632,  1.27564468,  0.04060929, -0.15858812]],

       [[ 0.51788802, -0.60980593, -0.07552883,  0.1207458 ]