In [1]:
import numpy as np
import theano
import theano.tensor as TT
import matplotlib.pyplot as plt
import time
from datetime import datetime

# Gradients for RNN on 1 example at a time

## Gradients with simple functions

In [2]:
# Layer sizes for the single-sequence experiments:
#   nin  - number of input units
#   nh   - number of hidden units
#   nout - number of output units
nin, nh, nout = 1, 4, 1

In [3]:
# Symbolic weights: input->hidden, hidden->hidden and hidden->output.
th_W_xh, th_W_hh, th_W_hy = TT.matrix(), TT.matrix(), TT.matrix()

# Symbolic data for a single sequence: th_x is the input sequence handed to
# theano.scan, th_y the targets compared against the scan outputs, and th_h0
# the initial hidden state.
th_x, th_y = TT.matrix(), TT.matrix()
th_h0 = TT.vector()

In [None]:
# Compute gradients using Theano
# Hidden state
#th_h = TT.tanh(TT.dot(th_x, th_W_xh) + TT.dot(th_h0, th_W_hh))
#th_yhat = TT.tanh(TT.dot(th_h, th_W_hy))
def step(th_x_t, th_h_tm1, th_W_xh, th_W_hh, th_W_hy):
    """One time step of the linear RNN (no non-linearity), used by theano.scan.

    th_x_t is the current input, th_h_tm1 the previous hidden state and the
    three th_W_* arguments the (non-sequence) weight matrices.
    Returns the new hidden state and the output computed from it.
    """
    input_term = TT.dot(th_x_t, th_W_xh)
    recurrent_term = TT.dot(th_h_tm1, th_W_hh)
    th_h_t = input_term + recurrent_term
    return th_h_t, TT.dot(th_h_t, th_W_hy)

# Unroll `step` over th_x. Only the hidden state is recurrent (it starts from
# th_h0); the output has `None` in outputs_info, so it is collected per step
# but never fed back into `step`.
th_weights = [th_W_xh, th_W_hh, th_W_hy]
(th_h, th_yhat), _ = theano.scan(step,
                                 sequences=th_x,
                                 outputs_info=[th_h0, None],
                                 non_sequences=th_weights)

# Symbolic gradients of the summed squared-error cost w.r.t. each weight matrix.
th_gW_xh, th_gW_hh, th_gW_hy = TT.grad(
    0.5 * ((th_yhat - th_y)**2).sum(),
    th_weights)

# Compiled function: (x, h0, y, W_xh, W_hh, W_hy) -> [dE/dW_xh, dE/dW_hh, dE/dW_hy]
th_fn = theano.function(
    [th_x, th_h0, th_y] + th_weights,
    [th_gW_xh, th_gW_hh, th_gW_hy])

In [5]:
# Numerical example.
# A seeded generator is used so that "Restart & Run All" reproduces exactly the
# same inputs and weights (and therefore the same gradients) on every run.
rng = np.random.RandomState(0)

T = 10  # sequence length
np_h0 = rng.uniform(size=(nh,))
np_x = rng.uniform(size=(T, nin))
np_y = rng.uniform(size=(T, nout))

np_W_xh = rng.uniform(size=(nin, nh))
np_W_hh = rng.uniform(size=(nh, nh))
np_W_hy = rng.uniform(size=(nh, nout))

In [6]:
# Compute gradients using Numpy
#np_h = np.tanh(np.dot(np_x, np_W_xh) + np.dot(np_h0, np_W_hh))
#np_y = np.tanh(np.dot(np_h, np_W_hy))

# Forward propagation
def forwardPropagation(np_x, np_h0, W_xh, W_hh, W_hy):
    """Forward pass of the linear RNN over a single sequence.

    Recurrence (no non-linearity):
        h_t    = x_t . W_xh + h_{t-1} . W_hh
        yhat_t = h_t . W_hy

    Parameters
    ----------
    np_x : array, shape (T, nin) -- one input row per time step.
    np_h0 : array, shape (nh,) -- initial hidden state.
    W_xh, W_hh, W_hy : arrays of shape (nin, nh), (nh, nh), (nh, nout).

    Returns
    -------
    np_h : array, shape (T+1, nh)
        np_h[0] is np_h0 and np_h[t] is the state after consuming np_x[t-1].
    np_yhat : array, shape (T+1, nout)
        np_yhat[t] is the output at step t; row 0 is zero padding so that the
        indices line up with np_h.
    """
    T = np_x.shape[0]
    _, nh = W_xh.shape
    nout = W_hy.shape[1]

    np_h = np.zeros((T + 1, nh))
    np_h[0, :] = np_h0
    np_yhat = np.zeros((T + 1, nout))

    h_prev = np_h0
    # `range` (rather than the Python-2-only `xrange`) keeps this function
    # usable on both Python 2 and Python 3.
    for t in range(1, T + 1):
        h_prev = np.dot(np_x[t - 1], W_xh) + np.dot(h_prev, W_hh)
        np_h[t, :] = h_prev
        np_yhat[t, :] = np.dot(h_prev, W_hy)
    return np_h, np_yhat
    
# Run the forward pass. Row 0 of np_yhat is padding (there is no output for the
# initial state), so it is left out of the loss.
np_h, np_yhat = forwardPropagation(
    np_x, np_h0, np_W_xh, np_W_hh, np_W_hy)
loss = 0.5 * np.sum((np_yhat[1:, :] - np_y) ** 2)

In [7]:
# Reference gradients: evaluate the compiled Theano function on the example.
th_dE_xh, th_dE_hh, th_dE_hy = th_fn(
    np_x, np_h0, np_y, np_W_xh, np_W_hh, np_W_hy)
for label, value in (("th_dE_xh:", th_dE_xh),
                     ("th_dE_hh:", th_dE_hh),
                     ("th_dE_hy:", th_dE_hy)):
    print(label)
    print(value)

th_dE_xh:
[[ 3843091.87195746  3628936.03503639  4728780.4133443   3696203.19943621]]
th_dE_hh:
[[ 15914946.88268464  17170350.24241574  20062981.09738154
   17643237.53417961]
 [ 41846309.03094722  45720706.27472171  52889715.52696946
   47015170.46867734]
 [ 19314667.46135995  20676929.01944304  24311933.07009304
   21236327.66783883]
 [ 32544741.54535294  35618185.19355418  41144126.16714437
   36630842.90496936]]
th_dE_hy:
[[  6956776.50330535]
 [ 20178556.44192761]
 [  7917117.01115057]
 [ 15875698.27320223]]


In [8]:
# Numpy gradient w.r.t. W_hy for the linear RNN.
# np_h : (T+1) x nh, np_yhat : (T+1) x nout -- row 0 holds the initial state /
# padding, so only rows 1..T enter the gradient.
np_dE_hy = np.dot(np_h[1:, :].T, np_yhat[1:, :] - np_y)
print("np_dE_hy:")
print(np_dE_hy)
# Compare against the Theano reference, as the other gradient cells do.
print("th_dE_hy:")
print(th_dE_hy)
print(np.allclose(np_dE_hy, th_dE_hy))

np_dE_hy:
[[  6956776.50330535]
 [ 20178556.44192761]
 [  7917117.01115057]
 [ 15875698.27320223]]


In [9]:
# Back-propagation through time for W_hh (linear RNN).
# dhh[t] accumulates dE/dh_t: the direct contribution through the output at
# step t plus whatever step t+1 pushed back during the previous iteration.
dE_hh = np.zeros((nh, nh))
dhh = np.zeros((T+1, nh))

for t in reversed(range(1, T + 1)):
    # dE_t/dyhat_t * dyhat_t/dh_t (np_y is 0-based, hence t-1)
    dhh[t] += np.dot(np_W_hy, np_yhat[t, :] - np_y[t - 1, :])
    # propagate to the previous hidden state through W_hh
    dhh[t - 1] += np.dot(np_W_hh, dhh[t])
    # h_{t-1} multiplies W_hh when producing h_t
    dE_hh += np.outer(np_h[t - 1, :], dhh[t])

print("dE_hh:")
print(dE_hh)
print("th_dE_hh:")
print(th_dE_hh)
print(np.allclose(dE_hh, th_dE_hh))

dE_hh:
[[ 15914946.88268464  17170350.24241574  20062981.09738154
   17643237.53417961]
 [ 41846309.03094722  45720706.27472171  52889715.52696946
   47015170.46867734]
 [ 19314667.46135995  20676929.01944304  24311933.07009304
   21236327.66783883]
 [ 32544741.54535295  35618185.19355417  41144126.16714437
   36630842.90496936]]
th_dE_hh:
[[ 15914946.88268464  17170350.24241574  20062981.09738154
   17643237.53417961]
 [ 41846309.03094722  45720706.27472171  52889715.52696946
   47015170.46867734]
 [ 19314667.46135995  20676929.01944304  24311933.07009304
   21236327.66783883]
 [ 32544741.54535294  35618185.19355418  41144126.16714437
   36630842.90496936]]
True


In [10]:
# Back-propagation through time for W_xh (linear RNN). The hidden-state
# gradients dhh are recomputed exactly as for W_hh; only the accumulation
# differs (x_t instead of h_{t-1} multiplies the weight matrix).
dE_xh = np.zeros((nin, nh))
dhh = np.zeros((T+1, nh))

for t in reversed(range(1, T + 1)):
    # dE_t/dyhat_t * dyhat_t/dh_t
    dhh[t] += np.dot(np_W_hy, np_yhat[t, :] - np_y[t - 1, :])
    dhh[t - 1] += np.dot(np_W_hh, dhh[t])
    # np_x is 0-based, so the input at step t lives in row t-1
    dE_xh += np.outer(np_x[t - 1, :], dhh[t])

print("dE_xh:")
print(dE_xh)
print("th_dE_xh:")
print(th_dE_xh)
print(np.allclose(dE_xh, th_dE_xh))

dE_xh:
[[ 3843091.87195746  3628936.03503639  4728780.4133443   3696203.19943621]]
th_dE_xh:
[[ 3843091.87195746  3628936.03503639  4728780.4133443   3696203.19943621]]
True


## Gradients with tanh non-linearity

In [11]:
# Compute gradients using Theano
# Hidden state
#th_h = TT.tanh(TT.dot(th_x, th_W_xh) + TT.dot(th_h0, th_W_hh))
#th_yhat = TT.tanh(TT.dot(th_h, th_W_hy))
def step(th_x_t, th_h_tm1, th_W_xh, th_W_hh, th_W_hy):
    """One time step of the tanh RNN, used as the theano.scan step function.

    Both the hidden state and the output go through tanh. Returns the new
    hidden state and the output computed from it.
    """
    pre_hidden = TT.dot(th_x_t, th_W_xh) + TT.dot(th_h_tm1, th_W_hh)
    th_h_t = TT.tanh(pre_hidden)
    pre_output = TT.dot(th_h_t, th_W_hy)
    return th_h_t, TT.tanh(pre_output)

# Unroll the tanh `step` over th_x; the hidden state is recurrent (initialised
# with th_h0) while the output is only collected (outputs_info entry is None).
th_weights = [th_W_xh, th_W_hh, th_W_hy]
(th_h, th_yhat), _ = theano.scan(step,
                                 sequences=th_x,
                                 outputs_info=[th_h0, None],
                                 non_sequences=th_weights)

# Symbolic gradients of the summed squared-error cost w.r.t. the weights.
th_gW_xh, th_gW_hh, th_gW_hy = TT.grad(
    0.5 * ((th_yhat - th_y)**2).sum(),
    th_weights)

# Compiled function returning the three weight gradients.
th_fn = theano.function(
    [th_x, th_h0, th_y] + th_weights,
    [th_gW_xh, th_gW_hh, th_gW_hy])

In [12]:
# Numerical example for the tanh RNN.
# A seeded generator is used so that "Restart & Run All" reproduces exactly the
# same inputs and weights (and therefore the same gradients) on every run.
rng = np.random.RandomState(1)

T = 10  # sequence length
np_h0 = rng.uniform(size=(nh,))
np_x = rng.uniform(size=(T, nin))
np_y = rng.uniform(size=(T, nout))

np_W_xh = rng.uniform(size=(nin, nh))
np_W_hh = rng.uniform(size=(nh, nh))
np_W_hy = rng.uniform(size=(nh, nout))

In [13]:
# Compute gradients using Numpy
#np_h = np.tanh(np.dot(np_x, np_W_xh) + np.dot(np_h0, np_W_hh))
#np_y = np.tanh(np.dot(np_h, np_W_hy))

# Forward propagation
def forwardPropagation(np_x, np_h0, W_xh, W_hh, W_hy):
    """Forward pass of the tanh RNN over a single sequence.

    Recurrence:
        h_t    = tanh(x_t . W_xh + h_{t-1} . W_hh)
        yhat_t = tanh(h_t . W_hy)

    Parameters
    ----------
    np_x : array, shape (T, nin) -- one input row per time step.
    np_h0 : array, shape (nh,) -- initial hidden state.
    W_xh, W_hh, W_hy : arrays of shape (nin, nh), (nh, nh), (nh, nout).

    Returns
    -------
    np_h : array, shape (T+1, nh)
        np_h[0] is np_h0 and np_h[t] is the state after consuming np_x[t-1].
    np_yhat : array, shape (T+1, nout)
        np_yhat[t] is the output at step t; row 0 is zero padding so that the
        indices line up with np_h.
    """
    T = np_x.shape[0]
    _, nh = W_xh.shape
    nout = W_hy.shape[1]

    np_h = np.zeros((T + 1, nh))
    np_h[0, :] = np_h0
    np_yhat = np.zeros((T + 1, nout))

    h_prev = np_h0
    # `range` (rather than the Python-2-only `xrange`) keeps this function
    # usable on both Python 2 and Python 3.
    for t in range(1, T + 1):
        h_prev = np.tanh(np.dot(np_x[t - 1], W_xh) + np.dot(h_prev, W_hh))
        np_h[t, :] = h_prev
        np_yhat[t, :] = np.tanh(np.dot(h_prev, W_hy))
    return np_h, np_yhat
    
# Forward pass of the tanh RNN; the padding row 0 of np_yhat is excluded from
# the squared-error loss.
np_h, np_yhat = forwardPropagation(
    np_x, np_h0, np_W_xh, np_W_hh, np_W_hy)
loss = 0.5 * np.sum((np_yhat[1:, :] - np_y) ** 2)

In [14]:
# Reference gradients for the tanh RNN from the compiled Theano function.
th_dE_xh, th_dE_hh, th_dE_hy = th_fn(
    np_x, np_h0, np_y, np_W_xh, np_W_hh, np_W_hy)
for label, value in (("th_dE_xh:", th_dE_xh),
                     ("th_dE_hh:", th_dE_hh),
                     ("th_dE_hy:", th_dE_hy)):
    print(label)
    print(value)

th_dE_xh:
[[ 0.00797597  0.01390864  0.04685298  0.04595759]]
th_dE_hh:
[[ 0.00888297  0.01074361  0.06103421  0.05704386]
 [ 0.01136745  0.01589703  0.07488689  0.07049657]
 [ 0.00655993  0.00644153  0.04772336  0.04418974]
 [ 0.01205341  0.01794788  0.07802399  0.07367795]]
th_dE_hy:
[[ 0.58336303]
 [ 0.579037  ]
 [ 0.52081292]
 [ 0.51277335]]


In [15]:
# Numpy gradient w.r.t. W_hy for the tanh RNN.
# np_h : (T+1) x nh, np_yhat : (T+1) x nout -- row 0 is the initial state /
# padding, so only rows 1..T are used.
yhat_steps = np_yhat[1:, :]
# output error times the tanh derivative (1 - yhat^2)
out_delta = (yhat_steps - np_y) * (1 - yhat_steps ** 2)
np_dE_hy = np.dot(np_h[1:, :].T, out_delta)

print("np_dE_hy:")
print(np_dE_hy)
print("th_dE_hy:")
print(th_dE_hy)
print(np.allclose(np_dE_hy, th_dE_hy))

np_dE_hy:
[[ 0.58336303]
 [ 0.579037  ]
 [ 0.52081292]
 [ 0.51277335]]
th_dE_hy:
[[ 0.58336303]
 [ 0.579037  ]
 [ 0.52081292]
 [ 0.51277335]]
True


In [16]:
# Back-propagation through time for W_hh (tanh RNN).
# dhh[t] accumulates dE/dh_t; `delta` is that gradient pushed through the
# hidden tanh, i.e. the gradient w.r.t. the pre-activation of h_t.
dE_hh = np.zeros((nh, nh))
dhh = np.zeros((T+1, nh))

for t in reversed(range(1, T + 1)):
    yhat_t = np_yhat[t, :]
    # dE_t/dyhat_t times the output tanh derivative, mapped back to h_t
    dhh[t] += np.dot(np_W_hy, (yhat_t - np_y[t - 1, :]) * (1 - yhat_t ** 2))
    delta = dhh[t] * (1 - np_h[t, :] ** 2)
    dhh[t - 1] += np.dot(np_W_hh, delta)
    dE_hh += np.outer(np_h[t - 1, :], delta)

print("dE_hh:")
print(dE_hh)
print("th_dE_hh:")
print(th_dE_hh)
print(np.allclose(dE_hh, th_dE_hh))

dE_hh:
[[ 0.00888297  0.01074361  0.06103421  0.05704386]
 [ 0.01136745  0.01589703  0.07488689  0.07049657]
 [ 0.00655993  0.00644153  0.04772336  0.04418974]
 [ 0.01205341  0.01794788  0.07802399  0.07367795]]
th_dE_hh:
[[ 0.00888297  0.01074361  0.06103421  0.05704386]
 [ 0.01136745  0.01589703  0.07488689  0.07049657]
 [ 0.00655993  0.00644153  0.04772336  0.04418974]
 [ 0.01205341  0.01794788  0.07802399  0.07367795]]
True


In [17]:
# Back-propagation through time for W_xh (tanh RNN). Same recursion as for
# W_hh; only the accumulated outer product uses the input x_t.
dE_xh = np.zeros((nin, nh))
dhh = np.zeros((T+1, nh))

for t in reversed(range(1, T + 1)):
    yhat_t = np_yhat[t, :]
    # dE_t/dyhat_t times the output tanh derivative, mapped back to h_t
    dhh[t] += np.dot(np_W_hy, (yhat_t - np_y[t - 1, :]) * (1 - yhat_t ** 2))
    delta = dhh[t] * (1 - np_h[t, :] ** 2)
    dhh[t - 1] += np.dot(np_W_hh, delta)
    # column vector of inputs times row vector of deltas == outer product
    dE_xh += np.outer(np_x[t - 1, :], delta)

print("dE_xh:")
print(dE_xh)
print("th_dE_xh:")
print(th_dE_xh)
print(np.allclose(dE_xh, th_dE_xh))


dE_xh:
[[ 0.00797597  0.01390864  0.04685298  0.04595759]]
th_dE_xh:
[[ 0.00797597  0.01390864  0.04685298  0.04595759]]
True


# Gradients for RNN with mini-batches

In [18]:
# Layer sizes for the mini-batch experiments:
#   nin  - number of input units
#   nh   - number of hidden units
#   nout - number of output units
nin, nh, nout = 3, 7, 1
# Mini-batch size: the number of sequences stacked along the second axis of
# the batched arrays.
nbatches = 10

In [19]:
# Symbolic weights: input->hidden, hidden->hidden and hidden->output.
th_W_xh, th_W_hh, th_W_hy = TT.matrix(), TT.matrix(), TT.matrix()

# Symbolic mini-batch data: inputs and targets are rank-3 tensors and the
# initial hidden state is a matrix (one row per sequence in the batch).
th_x, th_y = TT.tensor3(), TT.tensor3()
th_h0 = TT.matrix()

In [20]:
# Compute gradients using Theano
# Hidden state
#th_h = TT.tanh(TT.dot(th_x, th_W_xh) + TT.dot(th_h0, th_W_hh))
#th_yhat = TT.tanh(TT.dot(th_h, th_W_hy))
def step(th_x_t, th_h_tm1, th_W_xh, th_W_hh, th_W_hy):
    """One time step of the tanh RNN for the mini-batch scan.

    Same computation as the single-sequence tanh step; the dot products apply
    the weights to whatever th_x_t / th_h_tm1 slices scan passes in.
    Returns the new hidden state and the output computed from it.
    """
    pre_hidden = TT.dot(th_x_t, th_W_xh) + TT.dot(th_h_tm1, th_W_hh)
    th_h_t = TT.tanh(pre_hidden)
    pre_output = TT.dot(th_h_t, th_W_hy)
    return th_h_t, TT.tanh(pre_output)

# Unroll `step` over the leading axis of th_x. The hidden state is recurrent
# (initialised with th_h0); the output is only collected (outputs_info None).
[th_h, th_yhat], _ = theano.scan(step,
                                 sequences=th_x,
                                 outputs_info=[th_h0, None],
                                 non_sequences=[th_W_xh, th_W_hh,
                                                th_W_hy])

# Squared-error cost summed over every element. `.sum()` with no axis already
# reduces over all axes to a scalar, so the former second `.sum()` was a no-op.
th_error = 0.5 * ((th_yhat - th_y) ** 2).sum()

# Symbolic gradients of the cost w.r.t. the three weight matrices.
th_gW_xh, th_gW_hh, th_gW_hy = \
                    TT.grad(th_error, [th_W_xh, th_W_hh, th_W_hy])

# Compiled function: (x, h0, y, W_xh, W_hh, W_hy) -> the three gradients.
th_fn = theano.function(
        [th_x, th_h0, th_y, th_W_xh, th_W_hh, th_W_hy],
        [th_gW_xh, th_gW_hh, th_gW_hy])

In [21]:
# Numerical example for the mini-batch RNN.
# A seeded generator is used so that "Restart & Run All" reproduces exactly the
# same inputs and weights (and therefore the same gradients) on every run.
rng = np.random.RandomState(2)

T = 10  # sequence length
np_h0 = rng.uniform(size=(nbatches, nh))
np_x = rng.uniform(size=(T, nbatches, nin))
np_y = rng.uniform(size=(T, nbatches, nout))

np_W_xh = rng.uniform(size=(nin, nh))
np_W_hh = rng.uniform(size=(nh, nh))
np_W_hy = rng.uniform(size=(nh, nout))

In [22]:
# Reference gradients for the mini-batch RNN from the compiled Theano function.
th_dE_xh, th_dE_hh, th_dE_hy = th_fn(
    np_x, np_h0, np_y, np_W_xh, np_W_hh, np_W_hy)
for label, value in (("th_dE_xh:", th_dE_xh),
                     ("th_dE_hh:", th_dE_hh),
                     ("th_dE_hy:", th_dE_hy)):
    print(label)
    print(value)

th_dE_xh:
[[  9.46756716e-04   1.42526584e-03   6.11103857e-05   1.88734985e-03
    1.37105367e-03   1.58197431e-03   1.64599757e-03]
 [  1.36294685e-03   2.36832618e-03   1.05536765e-04   3.06072720e-03
    2.19073716e-03   2.04764692e-03   2.86222114e-03]
 [  8.74789750e-04   1.40926535e-03   6.10092933e-05   1.57079518e-03
    1.50081140e-03   1.39801721e-03   1.57214105e-03]]
th_dE_hh:
[[  1.67672259e-03   2.09713662e-03   8.28578133e-05   2.74616998e-03
    1.93149022e-03   2.42004893e-03   2.52409212e-03]
 [  1.71538998e-03   2.37860974e-03   1.04111007e-04   2.73262221e-03
    2.26754066e-03   2.73786290e-03   2.35931382e-03]
 [  1.54090492e-03   1.73827051e-03   8.01977375e-05   2.49854015e-03
    1.81878473e-03   2.26039066e-03   2.03737760e-03]
 [  1.56890346e-03   1.99554964e-03   8.25398330e-05   2.32308599e-03
    1.89028623e-03   2.42575345e-03   2.13608479e-03]
 [  2.17194907e-03   3.56360456e-03   1.41147209e-04   3.82959321e-03
    2.90491922e-03   3.50257073e-03   3.4

In [23]:
# Compute gradients using Numpy
# Forward propagation
def forwardPropagationBatch(np_x, np_h0, W_xh, W_hh, W_hy):
    """Forward pass of the tanh RNN over a mini-batch of sequences.

    Recurrence, applied to every sequence of the batch at once:
        h_t    = tanh(x_t . W_xh + h_{t-1} . W_hh)
        yhat_t = tanh(h_t . W_hy)

    Parameters
    ----------
    np_x : array, shape (T, batch, nin) -- time-major inputs.
    np_h0 : array, shape (batch, nh) -- initial hidden state per sequence.
    W_xh, W_hh, W_hy : arrays of shape (nin, nh), (nh, nh), (nh, nout).

    Returns
    -------
    np_h : array, shape (T+1, batch, nh)
        np_h[0] is np_h0 and np_h[t] is the state after consuming np_x[t-1].
    np_yhat : array, shape (T+1, batch, nout)
        np_yhat[t] is the output at step t; slice 0 is zero padding so that
        the indices line up with np_h.
    """
    # The batch size comes from the input itself, not from a notebook-level
    # global, so the function works for any batch size it is handed.
    T, batch = np_x.shape[0], np_x.shape[1]
    _, nh = W_xh.shape
    nout = W_hy.shape[1]

    np_h = np.zeros((T + 1, batch, nh))
    np_h[0, :, :] = np_h0
    np_yhat = np.zeros((T + 1, batch, nout))

    h_prev = np_h0
    # `range` (rather than the Python-2-only `xrange`) keeps this function
    # usable on both Python 2 and Python 3.
    for t in range(1, T + 1):
        h_prev = np.tanh(np.dot(np_x[t - 1], W_xh) + np.dot(h_prev, W_hh))
        np_h[t, :, :] = h_prev
        np_yhat[t, :, :] = np.tanh(np.dot(h_prev, W_hy))
    return np_h, np_yhat
    
# Batched forward pass over all sequences at once.
np_h, np_yhat = forwardPropagationBatch(np_x, np_h0,
                                        np_W_xh, np_W_hh, np_W_hy)
# `.sum()` with no axis already reduces over every axis, so a single call is
# enough. Slice 0 of np_yhat is padding and is excluded from the loss.
loss = 0.5 * ((np_yhat[1:, :, :] - np_y) ** 2).sum()

# Cross-check: run the single-sequence forward pass on each batch member and
# stack the results into the batched layout.
# NOTE: this relies on `forwardPropagation` being the tanh version (the most
# recently defined one in this notebook); the linear version from the first
# section would not match the batched tanh pass.
snp_h = np.zeros((T+1, nbatches, nh))
snp_yhat = np.zeros((T+1, nbatches, nout))
for nb in range(nbatches):
    snp_h[:, nb, :], snp_yhat[:, nb, :] = forwardPropagation(
        np_x[:, nb, :], np_h0[nb, :], np_W_xh, np_W_hh, np_W_hy)

print(np.allclose(np_h, snp_h))
print(np.allclose(np_yhat, snp_yhat))

True
True


In [24]:
# Numpy gradient w.r.t. W_hy for the mini-batch tanh RNN.
# np_h : (T+1) x nbatches x nh, np_yhat : (T+1) x nbatches x nout -- slice 0
# is the initial state / padding, so only steps 1..T are used.

h1 = np_h[1:, :, :]
print(h1.shape)
yhat_steps = np_yhat[1:, :, :]
# output error times the tanh derivative (1 - yhat^2)
y1 = (yhat_steps - np_y) * (1 - yhat_steps ** 2)
print(y1.shape)
# contract over both the time and the batch axes, leaving an nh x nout matrix
np_dE_hy = np.tensordot(h1, y1, axes=([0, 1], [0, 1]))

print("np_dE_hy:")
print(np_dE_hy)
print("th_dE_hy:")
print(th_dE_hy)
print(np.allclose(np_dE_hy, th_dE_hy))

(10, 10, 7)
(10, 10, 1)
np_dE_hy:
[[ 0.45825637]
 [ 0.46383301]
 [ 0.46621112]
 [ 0.4629324 ]
 [ 0.46461205]
 [ 0.46499326]
 [ 0.46342569]]
th_dE_hy:
[[ 0.45825637]
 [ 0.46383301]
 [ 0.46621112]
 [ 0.4629324 ]
 [ 0.46461205]
 [ 0.46499326]
 [ 0.46342569]]
True


In [25]:
# Variant 1: back-propagation through time one sequence at a time
# (batch loop outside, time loop inside). Each sequence gets its own dhh
# buffer and all contributions are summed into the shared dE_hh.
dE_hh = np.zeros((nh, nh))

for nb in range(nbatches):
    dhh = np.zeros((T+1, nh))
    for t in reversed(range(1, T + 1)):
        yhat_t = np_yhat[t, nb, :]
        # dE_t/dyhat_t times the output tanh derivative, mapped back to h_t
        dhh[t] += np.dot(np_W_hy,
                         (yhat_t - np_y[t - 1, nb, :]) * (1 - yhat_t ** 2))
        delta = dhh[t] * (1 - np_h[t, nb, :] ** 2)
        dhh[t - 1] += np.dot(np_W_hh, delta)
        # column vector h_{t-1} times row vector delta == outer product
        dE_hh += np.outer(np_h[t - 1, nb, :], delta)

print("dE_hh:")
print(dE_hh)
print("th_dE_hh:")
print(th_dE_hh)
print(np.allclose(dE_hh, th_dE_hh))

# Variant 2: same computation with the loops swapped (time outside, batch
# inside) and one dhh buffer holding the whole batch.
dE_hh = np.zeros((nh, nh))
dhh = np.zeros((T+1, nbatches, nh))
for t in reversed(range(1, T + 1)):
    for nb in range(nbatches):
        yhat_t = np_yhat[t, nb, :]
        dhh[t, nb, :] += np.dot(
            np_W_hy,
            (yhat_t - np_y[t - 1, nb, :]) * (1 - yhat_t ** 2))
        delta = dhh[t, nb, :] * (1 - np_h[t, nb, :] ** 2)
        dhh[t - 1, nb, :] += np.dot(np_W_hh, delta)
        dE_hh += np.outer(np_h[t - 1, nb, :], delta)

print("dE_hh:")
print(dE_hh)
print("th_dE_hh:")
print(th_dE_hh)
print(np.allclose(dE_hh, th_dE_hh))

dE_hh:
[[  1.67672259e-03   2.09713662e-03   8.28578133e-05   2.74616998e-03
    1.93149022e-03   2.42004893e-03   2.52409212e-03]
 [  1.71538998e-03   2.37860974e-03   1.04111007e-04   2.73262221e-03
    2.26754066e-03   2.73786290e-03   2.35931382e-03]
 [  1.54090492e-03   1.73827051e-03   8.01977375e-05   2.49854015e-03
    1.81878473e-03   2.26039066e-03   2.03737760e-03]
 [  1.56890346e-03   1.99554964e-03   8.25398330e-05   2.32308599e-03
    1.89028623e-03   2.42575345e-03   2.13608479e-03]
 [  2.17194907e-03   3.56360456e-03   1.41147209e-04   3.82959321e-03
    2.90491922e-03   3.50257073e-03   3.41217100e-03]
 [  1.91020079e-03   2.67223276e-03   1.24279642e-04   3.39490380e-03
    2.71056203e-03   2.90823355e-03   3.05830751e-03]
 [  2.28098681e-03   3.71619344e-03   1.48766562e-04   4.19009326e-03
    3.09754789e-03   3.57129055e-03   3.83318380e-03]]
th_dE_hh:
[[  1.67672259e-03   2.09713662e-03   8.28578133e-05   2.74616998e-03
    1.93149022e-03   2.42004893e-03   2.5240

In [26]:
# Vectorised back-propagation through time for W_hh: the per-sequence loop is
# replaced by matrix products over the whole batch at each time step.
dE_hh = np.zeros((nh, nh))
dhh = np.zeros((T+1, nbatches, nh))

for t in reversed(range(1, T + 1)):
    yhat_t = np_yhat[t, :, :]
    # dE_t/dyhat_t times the output tanh derivative, one row per sequence
    out_delta = (yhat_t - np_y[t - 1, :, :]) * (1 - yhat_t ** 2)
    dhh[t, :, :] += np.dot(out_delta, np_W_hy.T)
    # gradient w.r.t. the hidden pre-activation at step t
    delta = dhh[t, :, :] * (1 - np_h[t, :, :] ** 2)

    dhh[t - 1, :, :] += np.dot(delta, np_W_hh.T)
    # h_{t-1}^T . delta sums the per-sequence outer products over the batch
    dE_hh += np.dot(np_h[t - 1, :, :].T, delta)

print("dE_hh:")
print(dE_hh)
print("th_dE_hh:")
print(th_dE_hh)
print(np.allclose(dE_hh, th_dE_hh))

dE_hh:
[[  1.67672259e-03   2.09713662e-03   8.28578133e-05   2.74616998e-03
    1.93149022e-03   2.42004893e-03   2.52409212e-03]
 [  1.71538998e-03   2.37860974e-03   1.04111007e-04   2.73262221e-03
    2.26754066e-03   2.73786290e-03   2.35931382e-03]
 [  1.54090492e-03   1.73827051e-03   8.01977375e-05   2.49854015e-03
    1.81878473e-03   2.26039066e-03   2.03737760e-03]
 [  1.56890346e-03   1.99554964e-03   8.25398330e-05   2.32308599e-03
    1.89028623e-03   2.42575345e-03   2.13608479e-03]
 [  2.17194907e-03   3.56360456e-03   1.41147209e-04   3.82959321e-03
    2.90491922e-03   3.50257073e-03   3.41217100e-03]
 [  1.91020079e-03   2.67223276e-03   1.24279642e-04   3.39490380e-03
    2.71056203e-03   2.90823355e-03   3.05830751e-03]
 [  2.28098681e-03   3.71619344e-03   1.48766562e-04   4.19009326e-03
    3.09754789e-03   3.57129055e-03   3.83318380e-03]]
th_dE_hh:
[[  1.67672259e-03   2.09713662e-03   8.28578133e-05   2.74616998e-03
    1.93149022e-03   2.42004893e-03   2.5240

In [27]:
# Vectorised back-propagation through time for W_xh: same recursion as for
# W_hh, but the accumulated product uses the inputs x_t.
dE_xh = np.zeros((nin, nh))
dhh = np.zeros((T+1, nbatches, nh))

for t in reversed(range(1, T + 1)):
    yhat_t = np_yhat[t, :, :]
    # dE_t/dyhat_t times the output tanh derivative, one row per sequence
    out_delta = (yhat_t - np_y[t - 1, :, :]) * (1 - yhat_t ** 2)
    dhh[t, :, :] += np.dot(out_delta, np_W_hy.T)
    # gradient w.r.t. the hidden pre-activation at step t
    delta = dhh[t, :, :] * (1 - np_h[t, :, :] ** 2)

    dhh[t - 1, :, :] += np.dot(delta, np_W_hh.T)
    # np_x is 0-based in time, so the input of step t is np_x[t-1]
    dE_xh += np.dot(np_x[t - 1, :, :].T, delta)

print("dE_xh:")
print(dE_xh)
print("th_dE_xh:")
print(th_dE_xh)
print(np.allclose(dE_xh, th_dE_xh))

dE_xh:
[[  9.46756716e-04   1.42526584e-03   6.11103857e-05   1.88734985e-03
    1.37105367e-03   1.58197431e-03   1.64599757e-03]
 [  1.36294685e-03   2.36832618e-03   1.05536765e-04   3.06072720e-03
    2.19073716e-03   2.04764692e-03   2.86222114e-03]
 [  8.74789750e-04   1.40926535e-03   6.10092933e-05   1.57079518e-03
    1.50081140e-03   1.39801721e-03   1.57214105e-03]]
th_dE_xh:
[[  9.46756716e-04   1.42526584e-03   6.11103857e-05   1.88734985e-03
    1.37105367e-03   1.58197431e-03   1.64599757e-03]
 [  1.36294685e-03   2.36832618e-03   1.05536765e-04   3.06072720e-03
    2.19073716e-03   2.04764692e-03   2.86222114e-03]
 [  8.74789750e-04   1.40926535e-03   6.10092933e-05   1.57079518e-03
    1.50081140e-03   1.39801721e-03   1.57214105e-03]]
True
