# Build Recurrent Neural Network

In [1]:
import numpy as np
from rnn_utils import *

### RNN single cell

#### Forward

$ a^{<t>} = tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b_a) $

$ \hat{y}^{<t>} = softmax(W_{ya}a^{<t>} + b_y) $


In [22]:
def rnn_cell_forward(xt, a_prev, params):
    """
    Run one forward time-step of a basic RNN cell.

    Computes
        a_next  = tanh(Wax . xt + Waa . a_prev + ba)
        yt_pred = softmax(Wya . a_next + by)

    Arguments:
        xt -- Input data at timestep "t", numpy array of shape (n_x, m).
        a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m).
        params -- python dictionary containing:
            Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x).
            Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a).
            Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a).
            ba -- Bias, numpy array of shape (n_a, 1).
            by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1).

    Returns:
        a_next -- Next hidden-state, numpy array of shape (n_a, m).
        yt_pred -- Prediction at timestep "t", numpy array of shape (n_y, m).
        cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, params).
    """

    # Pull every parameter out of the dictionary up front, in a fixed order.
    Wax, Waa, Wya, ba, by = (
        params[key] for key in ("Wax", "Waa", "Wya", "ba", "by")
    )

    # Hidden state: input contribution + recurrent contribution + bias, squashed by tanh.
    input_term = np.dot(Wax, xt)
    recurrent_term = np.dot(Waa, a_prev)
    a_next = np.tanh(input_term + recurrent_term + ba)

    # Output layer. `softmax` is not defined in this file; presumably it comes
    # from the `rnn_utils` star-import -- TODO confirm.
    logits = np.dot(Wya, a_next) + by
    yt_pred = softmax(logits)

    return a_next, yt_pred, (a_next, a_prev, xt, params)

In [31]:
# Smoke test for rnn_cell_forward.
# The seed is fixed and the draws below are made in a fixed order, so the
# printed values are reproducible; reordering the randn calls changes them.
np.random.seed(1)
# Sizes used here: n_x = 3 input features, n_a = 5 hidden units,
# n_y = 2 outputs, batch of m = 10 examples.
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)

# Bundle the weights under the key names rnn_cell_forward looks up.
parameters = {
    "Waa": Waa,
    "Wax": Wax,
    "Wya": Wya,
    "ba": ba,
    "by": by
}

a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
# Print one row of each result plus its shape as a quick sanity check.
print("a_next[4] = {}".format(a_next[4]))
print("a_next.shape = {}".format(a_next.shape))
print("yt_pred[1] = {}".format(yt_pred[1]))
print("yt_pred.shape = {}".format(yt_pred.shape))

a_next[4] = [ 0.59584544  0.18141802  0.61311866  0.99808218  0.85016201  0.99980978
 -0.18887155  0.99815551  0.6531151   0.82872037]
a_next.shape = (5, 10)
yt_pred[1] = [0.9888161  0.01682021 0.21140899 0.36817467 0.98988387 0.88945212
 0.36920224 0.9966312  0.9982559  0.17746526]
yt_pred.shape = (2, 10)


#### Backward

$ a^{<t>} = tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b) $

$ \frac{\partial tanh(x)}{\partial x} = 1 - tanh(x)^{2} $

$ \frac{\partial a^{<t>}}{\partial W_{ax}} = (1 - tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b)^2)x^{<t>T} = \partial tanh(x)x^{<t>T} $

$ \frac{\partial a^{<t>}}{\partial W_{aa}} = (1 - tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b)^2)a^{<t-1>T} = \partial tanh(x)a^{<t-1>T} $

$ \frac{\partial a^{<t>}}{\partial b} = \sum_{batch} (1 - tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b)^2) = \sum_{batch} \partial tanh(x) $

$ \frac{\partial a^{<t>}}{\partial x^{<t>}} = W_{ax}^{T}(1 - tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b)^2) = W_{ax}^{T} \partial tanh(x) $

$ \frac{\partial a^{<t>}}{\partial a^{<t-1>}} = W_{aa}^{T}(1 - tanh(W_{ax}x^{<t>} + W_{aa}a^{<t-1>} + b)^2) = W_{aa}^{T} \partial tanh(x) $

In [39]:
def rnn_cell_backward(da_next, cache):
    """
    Implement the backward pass for a single RNN-cell time-step.

    Only the hidden-state path a_next = tanh(Wax . xt + Waa . a_prev + ba) is
    differentiated here. The output layer (Wya, by) plays no part in these
    gradients, so those entries do not need to be present in the cached params.

    Arguments:
        da_next -- Gradient of loss with respect to next hidden-state, numpy array of shape (n_a, m).
        cache -- tuple (a_next, a_prev, xt, params) as produced by rnn_cell_forward.

    Returns:
        gradients -- Python dictionary containing:
            dxt -- Gradients of input data, numpy array of shape (n_x, m).
            da_prev -- Gradients of previous hidden-state, numpy array of shape (n_a, m).
            dWax -- Gradients of input-to-hidden weights, numpy array of shape (n_a, n_x).
            dWaa -- Gradients of hidden-to-hidden weights, numpy array of shape (n_a, n_a).
            dba -- Gradients of bias vector, numpy array of shape (n_a, 1).
    """

    # Retrieve values from cache.
    (a_next, a_prev, xt, params) = cache

    # Only the weights feeding the hidden state are needed for this step.
    Wax = params["Wax"]
    Waa = params["Waa"]

    # a_next = tanh(z), so d tanh(z)/dz = 1 - a_next^2; chain with the upstream gradient.
    dtanh = (1 - a_next * a_next) * da_next

    # Parameter gradients. The matrix products sum over the batch axis;
    # the bias gradient sums over it explicitly and keeps the (n_a, 1) column shape.
    dWax = np.dot(dtanh, xt.T)
    dWaa = np.dot(dtanh, a_prev.T)
    dba = np.sum(dtanh, axis=-1, keepdims=True)

    # Gradients flowing back to the cell's inputs.
    dxt = np.dot(Wax.T, dtanh)
    da_prev = np.dot(Waa.T, dtanh)

    gradients = {
        "dxt": dxt,
        "da_prev": da_prev,
        "dWax": dWax,
        "dWaa": dWaa,
        "dba": dba
    }

    return gradients

In [40]:
# Smoke test for rnn_cell_backward.
# Same seed and same draw order as the forward demo, so the forward pass
# sees identical inputs and weights.
np.random.seed(1)
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {
    "Waa": Waa,
    "Wax": Wax,
    "Wya": Wya,
    "ba": ba,
    "by": by
}

# Run the forward step first: the backward step consumes the cache it returns.
a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)

# Random upstream gradient with the same shape as the hidden state (n_a, m).
da_next = np.random.randn(5, 10)
gradients = rnn_cell_backward(da_next, cache)
# Print one entry of each gradient plus its shape as a quick sanity check.
print("gradients[\"dxt\"][1][2] ={}".format(gradients["dxt"][1][2]))
print("gradients[\"dxt\"].shape ={}".format(gradients["dxt"].shape))
print("gradients[\"da_prev\"][2][3] ={}".format(gradients["da_prev"][2][3]))
print("gradients[\"da_prev\"].shape ={}".format(gradients["da_prev"].shape))
print("gradients[\"dWax\"][3][1] ={}".format(gradients["dWax"][3][1]))
print("gradients[\"dWax\"].shape ={}".format(gradients["dWax"].shape))
print("gradients[\"dWaa\"][1][2] ={}".format(gradients["dWaa"][1][2]))
print("gradients[\"dWaa\"].shape ={}".format(gradients["dWaa"].shape))
print("gradients[\"dba\"][4] ={}".format(gradients["dba"][4]))
print("gradients[\"dba\"].shape ={}".format(gradients["dba"].shape))

gradients["dxt"][1][2] =1.3653821219712916
gradients["dxt"].shape =(3, 10)
gradients["da_prev"][2][3] =-0.04357779106461625
gradients["da_prev"].shape =(5, 10)
gradients["dWax"][3][1] =-1.5012584841864745
gradients["dWax"].shape =(5, 3)
gradients["dWaa"][1][2] =1.1441951795389382
gradients["dWaa"].shape =(5, 5)
gradients["dba"][4] =[1.42397243]
gradients["dba"].shape =(5, 1)


### RNN forward

In [5]:
def rnn_forward(x, a0, params):
    """
    Unroll the basic RNN cell over every time-step of the input sequence.

    Arguments:
        x -- Input data for every time-step, numpy array of shape (n_x, m, T_x).
        a0 -- Initial hidden-state, numpy array of shape (n_a, m).
        params -- python dictionary containing:
            Waa -- Weight matrix multiplying the hidden-state, numpy array of shape (n_a, n_a).
            Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x).
            Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a).
            ba -- Bias, numpy array of shape (n_a, 1).
            by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1).

    Returns:
        a -- Hidden-states for every time-step, numpy array of shape (n_a, m, T_x).
        y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x).
        caches -- Tuple of values needed for the backward pass, contains (list of per-step caches, x).
    """

    # Sizes come from the input tensor and the output weight matrix.
    n_x, m, T_x = x.shape
    n_y, n_a = params["Wya"].shape

    # Pre-allocate the per-time-step outputs.
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))

    step_caches = []
    hidden = a0  # hidden state carried from one step to the next

    for t in range(T_x):
        hidden, step_pred, step_cache = rnn_cell_forward(x[:, :, t], hidden, params)
        a[:, :, t], y_pred[:, :, t] = hidden, step_pred
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)
    

In [13]:
# Smoke test for rnn_forward.
# Fixed seed and fixed draw order keep the printed values reproducible.
np.random.seed(1)
# Input sequence of shape (n_x = 3, m = 10, T_x = 4).
x = np.random.randn(3, 10, 4)
a0 = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)

parameters = {
    "Waa": Waa,
    "Wax": Wax,
    "Wya": Wya,
    "ba": ba,
    "by": by
}

a, y_pred, caches = rnn_forward(x, a0, parameters)
print("a[4][1] = {}".format(a[4][1]))
print("a.shape = {}".format(a.shape))
print("y_pred[1][3] = {}".format(y_pred[1][3]))
print("y_pred.shape = {}".format(y_pred.shape))
# caches is the pair (list of per-step caches, x), so caches[1] is the input x
# and len(caches) is 2 regardless of T_x.
print("caches[1][1][3] = {}".format(caches[1][1][3]))
print("len(caches) = {}".format(len(caches)))

a[4][1] = [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape = (5, 10, 4)
y_pred[1][3] = [0.79560373 0.86224861 0.11118257 0.81515947]
y_pred.shape = (2, 10, 4)
caches[1][1][3] = [-1.1425182  -0.34934272 -0.20889423  0.58662319]
len(caches) = 2


### RNN backward

In [49]:
def rnn_backward(da, caches):
    """
    Implement the backward pass for a RNN over an entire sequence of input data.

    Walks the time-steps from last to first. At each step the gradient arriving
    from the following step is added to that step's upstream gradient before
    calling rnn_cell_backward, and the parameter gradients are accumulated.

    Arguments:
        da -- Upstream gradients of all hidden-states, numpy array of shape (n_a, m, T_x).
        caches -- Tuple (list of per-step caches, x) as produced by rnn_forward.

    Returns:
        gradients -- Python dictionary containing:
            dx -- Gradient w.r.t the input data, numpy array of shape (n_x, m, T_x).
            da0 -- Gradient w.r.t the initial hidden-state, numpy array of shape (n_a, m).
            dWax -- Gradient w.r.t the input's weight matrix, numpy array of shape (n_a, n_x).
            dWaa -- Gradient w.r.t the hidden-state's weight matrix, numpy array of shape (n_a, n_a).
            dba -- Gradient w.r.t the bias, numpy array of shape (n_a, 1)
    """

    (step_caches, x) = caches

    n_a, m, T_x = da.shape
    # Read n_x from the cached input tensor rather than from step_caches[0],
    # so an empty sequence (T_x == 0, no per-step caches) does not raise.
    n_x = x.shape[0]

    # Accumulators for the gradients summed over time.
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    # Gradient flowing back from step t+1 into a<t>; nothing follows the last step.
    da_prevt = np.zeros((n_a, m))

    for t in reversed(range(T_x)):
        step = rnn_cell_backward(da[:, :, t] + da_prevt, step_caches[t])
        da_prevt = step["da_prev"]
        dx[:, :, t] = step["dxt"]
        dWax += step["dWax"]
        dWaa += step["dWaa"]
        dba += step["dba"]

    # After the first time-step has been processed, what flows back is the
    # gradient with respect to the initial hidden state.
    gradients = {
        "dx": dx,
        "da0": da_prevt,
        "dWax": dWax,
        "dWaa": dWaa,
        "dba": dba
    }
    return gradients

In [50]:
# Smoke test for rnn_backward.
# Same seed and same draw order as the rnn_forward demo, so the forward pass
# sees identical inputs and weights.
np.random.seed(1)
x = np.random.randn(3, 10, 4)
a0 = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)

parameters = {
    "Waa": Waa,
    "Wax": Wax,
    "Wya": Wya,
    "ba": ba,
    "by": by
}

# Run the forward pass first: the backward pass consumes the caches it returns.
a, y_pred, caches = rnn_forward(x, a0, parameters)

# Random upstream gradient with the same shape as the hidden states (n_a, m, T_x).
da = np.random.randn(5, 10, 4)
gradients = rnn_backward(da, caches)
# Print one entry of each gradient plus its shape as a quick sanity check.
print("gradients[\"dx\"][1][2] ={}".format(gradients["dx"][1][2]))
print("gradients[\"dx\"].shape ={}".format(gradients["dx"].shape))
print("gradients[\"da0\"][2][3] ={}".format(gradients["da0"][2][3]))
print("gradients[\"da0\"].shape ={}".format(gradients["da0"].shape))
print("gradients[\"dWax\"][3][1] ={}".format(gradients["dWax"][3][1]))
print("gradients[\"dWax\"].shape ={}".format(gradients["dWax"].shape))
print("gradients[\"dWaa\"][1][2] ={}".format(gradients["dWaa"][1][2]))
print("gradients[\"dWaa\"].shape ={}".format(gradients["dWaa"].shape))
print("gradients[\"dba\"][4] ={}".format(gradients["dba"][4]))
print("gradients[\"dba\"].shape ={}".format(gradients["dba"].shape))

gradients["dx"][1][2] =[-0.86050481 -0.14439617 -0.02986862  0.10659932]
gradients["dx"].shape =(3, 10, 4)
gradients["da0"][2][3] =0.005796914346531472
gradients["da0"].shape =(5, 10)
gradients["dWax"][3][1] =0.44296398799038283
gradients["dWax"].shape =(5, 3)
gradients["dWaa"][1][2] =0.4418386736206451
gradients["dWaa"].shape =(5, 5)
gradients["dba"][4] =[-3.51296232]
gradients["dba"].shape =(5, 1)


# Build Long Short-Term Memory (LSTM) network

## LSTM forward

### Annotation

* Forget gate

$ \Gamma_{f}^{<t>} = \sigma (W_{f}[a^{<t - 1>}, x^{<t>}] + b_{f}) $

* Update gate 

$ \Gamma_{u}^{<t>} = \sigma (W_{u}[a^{<t - 1>}, x^{<t>}] + b_{u}) $

* Updating the cell

$ \hat c^{<t>} = tanh (W_{c}[a^{<t - 1>}, x^{<t>}] + b_{c}) $

the new cell is:

$ c^{<t>} = \Gamma _{f}^{<t>} * c^{<t-1>}+ \Gamma _{u}^{<t>} * \hat c^{<t>} $

* Output gate

$ \Gamma_{o}^{<t>} = \sigma (W_{o}[a^{<t - 1>}, x^{<t>}] + b_{o}) $

$  a^{<t>} = \Gamma _{o}^{<t>} * tanh(c^{<t>}) $

### LSTM cell forward

In [10]:
def lstm_cell_forward(xt, a_prev, c_prev, params):
    """
    Run one forward time-step of an LSTM cell.

    The previous hidden state and the current input are stacked row-wise into
    a single (n_a + n_x, m) matrix that feeds the forget, update and output
    gates as well as the candidate memory.

    Arguments:
        xt -- Input data at timestep "t", numpy array of shape (n_x, m).
        a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m).
        c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m).
        params -- Python dictionary containing:
            Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x).
            bf -- Bias of the forget gate, numpy array of shape (n_a, 1).

            Wu -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x).
            bu -- Bias of the update gate, numpy array of shape (n_a, 1).

            Wc -- Weight matrix of the candidate memory, numpy array of shape (n_a, n_a + n_x).
            bc -- Bias of the candidate memory, numpy array of shape (n_a, 1).

            Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x).
            bo -- Bias of the output gate, numpy array of shape (n_a, 1).

            Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a).
            by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1).

    Returns:
        a_next -- Next hidden state, numpy array of shape (n_a, m).
        c_next -- Next memory state, numpy array of shape (n_a, m).
        yt_pred -- Prediction at timestep "t", numpy array of shape (n_y, m).
        cache -- Tuple of values needed for the backward pass, contains
                 (a_next, c_next, a_prev, c_prev, ft, ut, ct, ot, xt, params).
    """

    # Pull every parameter out of the dictionary up front, in a fixed order.
    Wf, bf, Wu, bu, Wc, bc, Wo, bo, Wy, by = (
        params[key]
        for key in ("Wf", "bf", "Wu", "bu", "Wc", "bc", "Wo", "bo", "Wy", "by")
    )

    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    # Stack [a_prev; xt]: hidden state in the first n_a rows, input below it.
    stacked = np.zeros((n_a + n_x, m))
    stacked[:n_a, :] = a_prev
    stacked[n_a:, :] = xt

    def affine(weight, bias):
        # Shared pre-activation W . [a_prev; xt] + b used by every gate.
        return np.matmul(weight, stacked) + bias

    # `sigmoid` and `softmax` are not defined in this file; presumably they
    # come from the `rnn_utils` star-import -- TODO confirm.
    ft = sigmoid(affine(Wf, bf))   # forget gate
    ut = sigmoid(affine(Wu, bu))   # update gate
    ct = np.tanh(affine(Wc, bc))   # candidate memory
    c_next = ft * c_prev + ut * ct
    ot = sigmoid(affine(Wo, bo))   # output gate
    a_next = ot * np.tanh(c_next)

    yt_pred = softmax(np.matmul(Wy, a_next) + by)

    return a_next, c_next, yt_pred, (
        a_next, c_next, a_prev, c_prev, ft, ut, ct, ot, xt, params
    )

In [11]:
# Smoke test for lstm_cell_forward.
# Fixed seed and fixed draw order keep the printed values reproducible.
np.random.seed(1)
# Sizes used here: n_x = 3, n_a = 5, n_y = 2, batch of m = 10 examples.
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
c_prev = np.random.randn(5, 10)

# Gate weights have n_a + n_x = 5 + 3 columns because each gate acts on the
# stacked [a_prev; xt] matrix.
Wf = np.random.randn(5, 5 + 3)
bf = np.random.randn(5, 1)

Wu = np.random.randn(5, 5 + 3)
bu = np.random.randn(5, 1)

Wc = np.random.randn(5, 5 + 3)
bc = np.random.randn(5, 1)

Wo = np.random.randn(5, 5 + 3)
bo = np.random.randn(5, 1)

Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)

params = {
    "Wf": Wf,
    "bf": bf,
    "Wu": Wu,
    "bu": bu,
    "Wc": Wc,
    "bc": bc,
    "Wo": Wo,
    "bo": bo,
    "Wy": Wy,
    "by": by
}

a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, params)

print("a_next[4] = {}".format(a_next[4]))
print("a_next.shape = {}".format(a_next.shape))

print("c_next[2] = {}".format(c_next[2]))
print("c_next.shape = {}".format(c_next.shape))

print("yt[1] = {}".format(yt[1]))
print("yt.shape = {}".format(yt.shape))

# cache[1] is c_next (second entry of the cache tuple), so this prints c_next[3].
print("cache[1][3] = {}".format(cache[1][3]))
print("len(cache) = {}".format(len(cache)))

a_next[4] = [ 0.05302516 -0.49117974  0.00497924  0.0050003  -0.04937177 -0.72447576
  0.55359572  0.30934911 -0.79720182  0.00767235]
a_next.shape = (5, 10)
c_next[2] = [ 0.81236671  1.42016591  0.44683107  0.69566134 -0.69847574  0.62529235
  0.98900166 -0.980869   -0.05246253 -0.557182  ]
c_next.shape = (5, 10)
yt[1] = [0.19521836 0.77477337 0.32244571 0.28256318 0.44080494 0.86861407
 0.02557793 0.2527992  0.90271909 0.35126109]
yt.shape = (2, 10)
cache[1][3] = [-0.17804796 -0.5188874   0.16452496  0.46317388 -0.00286655 -1.74355573
 -0.09878795 -0.34519967 -0.33174189  1.08529658]
len(cache) = 10


### LSTM forward

In [14]:
def lstm_forward(x, a0, params):
    """
    Unroll the LSTM cell over every time-step of the input sequence.

    The memory state starts at zeros; only the initial hidden state is supplied
    by the caller.

    Arguments:
        x -- Input data for every time-step, numpy array of shape (n_x, m, T_x).
        a0 -- Initial hidden-state, numpy array of shape (n_a, m).
        params -- Python dictionary containing:
            Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x).
            bf -- Bias of the forget gate, numpy array of shape (n_a, 1).

            Wu -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x).
            bu -- Bias of the update gate, numpy array of shape (n_a, 1).

            Wc -- Weight matrix of the candidate memory, numpy array of shape (n_a, n_a + n_x).
            bc -- Bias of the candidate memory, numpy array of shape (n_a, 1).

            Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x).
            bo -- Bias of the output gate, numpy array of shape (n_a, 1).

            Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a).
            by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1).

    Returns:
        a -- Hidden-states for every time-step, numpy array of shape (n_a, m, T_x).
        c -- Memory-states for every time-step, numpy array of shape (n_a, m, T_x).
        y -- Predictions for every time-step, numpy array of shape (n_y, m, T_x).
        caches -- Tuple of values needed for the backward pass, contains (list of per-step caches, x).
    """

    # Sizes come from the input tensor and the output weight matrix.
    n_x, m, T_x = x.shape
    n_y, n_a = params["Wy"].shape

    # Pre-allocate the per-time-step outputs.
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    step_caches = []
    hidden = a0                    # hidden state carried between steps
    memory = np.zeros((n_a, m))    # memory state carried between steps

    for t in range(T_x):
        hidden, memory, step_pred, step_cache = lstm_cell_forward(
            x[:, :, t], hidden, memory, params
        )
        a[:, :, t], c[:, :, t], y[:, :, t] = hidden, memory, step_pred
        step_caches.append(step_cache)

    return a, c, y, (step_caches, x)

In [16]:
# Smoke test for lstm_forward.
# Fixed seed and fixed draw order keep the printed values reproducible.
np.random.seed(1)
# Input sequence of shape (n_x = 3, m = 10, T_x = 7).
x = np.random.randn(3, 10, 7)
a0 = np.random.randn(5, 10)

# Gate weights have n_a + n_x = 5 + 3 columns because each gate acts on the
# stacked [a_prev; xt] matrix.
Wf = np.random.randn(5, 5 + 3)
bf = np.random.randn(5, 1)

Wu = np.random.randn(5, 5 + 3)
bu = np.random.randn(5, 1)

Wc = np.random.randn(5, 5 + 3)
bc = np.random.randn(5, 1)

Wo = np.random.randn(5, 5 + 3)
bo = np.random.randn(5, 1)

Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)

params = {
    "Wf": Wf,
    "bf": bf,
    "Wu": Wu,
    "bu": bu,
    "Wc": Wc,
    "bc": bc,
    "Wo": Wo,
    "bo": bo,
    "Wy": Wy,
    "by": by
}

a, c, y, caches = lstm_forward(x, a0, params)

print("a[4][3][6] = {}".format(a[4][3][6]))
print("a.shape = {}".format(a.shape))

print("c[1][2][1] = {}".format(c[1][2][1]))
print("c.shape = {}".format(c.shape))

print("y[1][4][3] = {}".format(y[1][4][3]))
print("y.shape = {}".format(y.shape))

# caches is the pair (list of per-step caches, x), so caches[1] is the input x
# and len(caches) is 2 regardless of T_x.
print("caches[1][1][1] = {}".format(caches[1][1][1]))
print("len(caches) = {}".format(len(caches)))

a[4][3][6] = -0.32076902501925975
a.shape = (5, 10, 7)
c[1][2][1] = 0.9946866065177291
c.shape = (5, 10, 7)
y[1][4][3] = 0.686637615446096
y.shape = (2, 10, 7)
caches[1][1][1] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
len(caches) = 2
