# NLP and DeepLearning Assignment1
### Working progress

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import sklearn
import random

### Define the softmax function
#### q1_softmax.py

In [2]:
def softmax(x):
    """Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code. You might find numpy
    functions np.exp, np.sum, np.reshape, np.max, and numpy
    broadcasting useful for this task.

    Numpy broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html

    You should also make sure that your code works for a single
    N-dimensional vector (treat the vector as a single row) and
    for M x N matrices. This may be useful for testing later. Also,
    make sure that the dimensions of the output match the input.

    You must implement the optimization in problem 1(a) of the
    written assignment!

    Arguments:
    x -- A N dimensional vector or M x N dimensional numpy matrix.

    Return:
    x -- You are allowed to modify x in-place
    """
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix
        ### YOUR CODE HERE
        # raise NotImplementedError

        maxs = np.max(x, axis=1) # An array of max values of each line
        x = x - maxs.reshape(maxs.shape[0], 1) # for each line, subtract the max value
        sums = np.sum(np.exp(x), axis = 1)
        x = np.exp(x) / sums.reshape(sums.shape[0], 1)
        ### END YOUR CODE
    else:
        # Vector
        ### YOUR CODE HERE
        # raise NotImplementedError
        x = x - np.max(x)
        x = np.exp(x) / np.sum(np.exp(x))

        ### END YOUR CODE

    assert x.shape == orig_shape
    return x

### Test the numpy array broadcasting operations

In [3]:
x = np.array([
    [0.26894142, 0.73105858, 23, 1],
    [0.26894142, 0.73105858, 12, 2],
    [2, 3, 4, 5]])
maxs = np.max(x, axis=1) # An array of max values of each line
x = x - maxs.reshape(maxs.shape[0], 1)
x

array([[-22.73105858, -22.26894142,   0.        , -22.        ],
       [-11.73105858, -11.26894142,   0.        , -10.        ],
       [ -3.        ,  -2.        ,  -1.        ,   0.        ]])

In [4]:
sums = np.sum(np.exp(x), axis = 1)
x = np.exp(x) / sums.reshape(sums.shape[0], 1)
x

array([[  1.34284749e-10,   2.13167810e-10,   9.99999999e-01,
          2.78946809e-10],
       [  8.03965182e-06,   1.27623948e-05,   9.99933801e-01,
          4.53969243e-05],
       [  3.20586033e-02,   8.71443187e-02,   2.36882818e-01,
          6.43914260e-01]])

### Test the softmax function

In [5]:
def test_softmax_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print "Running basic tests..."
    test1 = softmax(np.array([1,2]))
    print test1
    ans1 = np.array([0.26894142,  0.73105858])
    assert np.allclose(test1, ans1, rtol=1e-05, atol=1e-06)

    test2 = softmax(np.array([[1001,1002],[3,4]]))
    print test2
    ans2 = np.array([
        [0.26894142, 0.73105858],
        [0.26894142, 0.73105858]])
    assert np.allclose(test2, ans2, rtol=1e-05, atol=1e-06)

    test3 = softmax(np.array([[-1001,-1002]]))
    print test3
    ans3 = np.array([0.73105858, 0.26894142])
    assert np.allclose(test3, ans3, rtol=1e-05, atol=1e-06)

    print "You should be able to verify these results by hand!\n"

In [6]:
test_softmax_basic()

Running basic tests...
[ 0.26894142  0.73105858]
[[ 0.26894142  0.73105858]
 [ 0.26894142  0.73105858]]
[[ 0.73105858  0.26894142]]
You should be able to verify these results by hand!



### Define the sigmoid and its gradient
#### q2_sigmoid.py

In [7]:
def sigmoid(x):
    """
    Compute the sigmoid function for the input here.

    Arguments:
    x -- A scalar or numpy array.

    Return:
    s -- sigmoid(x)
    """

    ### YOUR CODE HERE
    #raise NotImplementedError
    s = 1/ (1 + np.exp(-x))
    
    ### END YOUR CODE

    return s


def sigmoid_grad(s):
    """
    Compute the gradient for the sigmoid function here. Note that
    for this implementation, the input s should be the sigmoid
    function value of your original input x.

    Arguments:
    s -- A scalar or numpy array.

    Return:
    ds -- Your computed gradient.
    """

    ### YOUR CODE HERE
    #raise NotImplementedError
    
    ds = s * (1 - s)
    ### END YOUR CODE

    return ds

In [8]:
x = np.array([[1, 2], [-1, -2]])
sigmoid(x)

array([[ 0.73105858,  0.88079708],
       [ 0.26894142,  0.11920292]])

In [9]:
def test_sigmoid_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print "Running basic tests..."
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    print f
    f_ans = np.array([
        [0.73105858, 0.88079708],
        [0.26894142, 0.11920292]])
    assert np.allclose(f, f_ans, rtol=1e-05, atol=1e-06)
    print g
    g_ans = np.array([
        [0.19661193, 0.10499359],
        [0.19661193, 0.10499359]])
    assert np.allclose(g, g_ans, rtol=1e-05, atol=1e-06)
    print "You should verify these results by hand!\n"

In [10]:
test_sigmoid_basic()

Running basic tests...
[[ 0.73105858  0.88079708]
 [ 0.26894142  0.11920292]]
[[ 0.19661193  0.10499359]
 [ 0.19661193  0.10499359]]
You should verify these results by hand!



### Implement a gradient checker
#### q2_gradcheck.py

In [11]:
# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x):
    """ Gradient check for a function f.

    Arguments:
    f -- a function that takes a single argument and outputs the
         cost and its gradients
    x -- the point (numpy array) to check the gradient at
    """

    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x) # Evaluate function value at original point
    h = 1e-4        # Do not change this!

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    count = 0
    while not it.finished:
        ix = it.multi_index

        # Try modifying x[ix] with h defined above to compute
        # numerical gradients. Make sure you call random.setstate(rndstate)
        # before calling f(x) each time. This will make it possible
        # to test cost functions with built in randomness later.

        ### YOUR CODE HERE:
        #raise NotImplementedError
        random.setstate(rndstate)
        current_value = x[ix] 
        ### calculate outcomes when the input at ix point change -h
        x[ix] = current_value - h
        y1, _ = f(x)
        random.setstate(rndstate)
        ### calculate outcomes when the input at ix point change h
        x[ix] = current_value + h
        y2, _ = f(x)
        numgrad = (y2 - y1) / (2*h)
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))

        if reldiff > 1e-5:
            print "Gradient check failed."
            print "First gradient error found at index %s" % str(ix)
            print "Your gradient: %f \t Numerical gradient: %f" % (
                grad[ix], numgrad)
            count = count + 1

        it.iternext() # Step to next dimension

    print "Gradient check found: " + str(count) + " erors!"


def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print "Running sanity checks..."
    gradcheck_naive(quad, np.array(123.456))      # scalar test
    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(quad, np.random.randn(4,5))   # 2-D test
    print ""


In [12]:
sanity_check()

Running sanity checks...
Gradient check found: 0 erors!
Gradient check found: 0 erors!
Gradient check found: 0 erors!



### Implement one-layer NN forward and backward propagation
#### q2_neural.py

In [25]:
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    #raise NotImplementedError
    
    N = dimensions[0]
    
    z1 = data.dot(W1) + b1
    h = sigmoid(z1)
    z2 = h.dot(W2) + b2
    y = softmax(z2)
    cost = (np.sum(-labels * np.log(y))) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    #raise NotImplementedError
    
    delta1 = (y - labels) / N
    gradb2 = np.sum(delta1, axis = 0)
    gradW2_ind = delta1[:, np.newaxis, :] * h[:,:,np.newaxis]
    gradW2 = np.sum(gradW2_ind, axis = 0)
    deriv_h = sigmoid_grad(h)
    delta2 = np.dot(delta1, W2.T)
    gradb1_ind = delta2 * deriv_h
    gradb1 = np.sum(gradb1_ind, axis = 0)
    #gradW1 = data.T.dot(delta2 * deriv_h)
    gradW1_ind = gradb1_ind[:, np.newaxis, :] * data[:, :, np.newaxis]
    gradW1 = np.sum(gradW1_ind, axis = 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad


In [26]:
def sanity_check():
    """
    Set up fake data and parameters for the neural network, and test using
    gradcheck.
    """
    print "Running sanity check..."

    N = 20
    dimensions = [10, 5, 10]
    data = np.random.randn(N, dimensions[0])   # each row will be a datum
    labels = np.zeros((N, dimensions[2]))
    for i in xrange(N):
        labels[i, random.randint(0,dimensions[2]-1)] = 1

    params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
        dimensions[1] + 1) * dimensions[2], )

    gradcheck_naive(lambda params:
        forward_backward_prop(data, labels, params, dimensions), params)
sanity_check()

Running sanity check...
Gradient check failed.
First gradient error found at index (30,)
Your gradient: 0.058769 	 Numerical gradient: 0.058781
Gradient check failed.
First gradient error found at index (35,)
Your gradient: 0.081748 	 Numerical gradient: 0.081762
Gradient check failed.
First gradient error found at index (36,)
Your gradient: -0.070287 	 Numerical gradient: -0.070300
Gradient check failed.
First gradient error found at index (38,)
Your gradient: 0.122022 	 Numerical gradient: 0.122012
Gradient check failed.
First gradient error found at index (40,)
Your gradient: -0.185774 	 Numerical gradient: -0.185789
Gradient check failed.
First gradient error found at index (41,)
Your gradient: -0.041820 	 Numerical gradient: -0.041832
Gradient check failed.
First gradient error found at index (42,)
Your gradient: 0.042911 	 Numerical gradient: 0.042927
Gradient check failed.
First gradient error found at index (45,)
Your gradient: -0.024462 	 Numerical gradient: -0.024444
Gradient

### Step by step implementation and tests

In [27]:
N = 20
dimensions = [10, 5, 10]
data = np.random.randn(N, dimensions[0])   # each row will be a datum
labels = np.zeros((N, dimensions[2]))
ofs = 0
Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
        dimensions[1] + 1) * dimensions[2], )

W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
ofs += Dx * H
b1 = np.reshape(params[ofs:ofs + H], (1, H))
ofs += H
W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
ofs += H * Dy
b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

for i in xrange(N):
    labels[i, random.randint(0,dimensions[2]-1)] = 1

In [28]:
z1 = data.dot(W1) + b1
h = sigmoid(z1)
z2 = h.dot(W2) + b2
y = softmax(z2)
cost = np.sum(-labels * np.log(y))
cost

77.223613895975276

In [29]:
labels * np.log(y)

array([[-0.        , -0.        , -0.        , -0.        , -6.54623674,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -1.49248814, -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -3.60567446, -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -4.84693109],
       [-0.        , -2.17272416, -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -5.11648724, -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -2.89178453, -0.        , -0.        

In [30]:
np.sum(-1 * labels * np.log(y), axis = 1)

array([ 6.54623674,  1.49248814,  3.60567446,  4.84693109,  2.17272416,
        5.11648724,  2.89178453,  4.34249167,  1.91754824,  4.45988541,
        2.54296631,  3.80771505,  4.13441591,  1.68430598,  7.32977357,
        2.18438623,  3.54973582,  4.10209083,  6.97892486,  3.51704767])

In [31]:
np.sum(np.sum(-1 * labels * np.log(y), axis = 1))

77.223613895975262

In [32]:
delta1 = y - labels
delta1

array([[  1.95227235e-03,   2.41539850e-02,   4.66100392e-02,
          7.81184698e-01,  -9.98564492e-01,   8.63582395e-02,
          2.11298153e-02,   3.21054830e-02,   4.65937590e-03,
          4.10584416e-04],
       [  6.42955684e-03,  -7.75187406e-01,   1.12962395e-01,
          2.30967638e-01,   7.54557550e-03,   9.36620701e-02,
          1.97640806e-02,   2.34239639e-01,   6.14195028e-02,
          8.19694780e-03],
       [  8.54443236e-03,   4.47910330e-01,   3.18837075e-02,
          2.93933190e-01,   2.93937253e-03,   2.12834758e-02,
         -9.72830886e-01,   1.14247921e-01,   2.01350268e-02,
          3.19534298e-02],
       [  1.06336248e-02,   1.48799739e-01,   4.03259930e-02,
          6.56691784e-01,   1.65869962e-02,   1.08851182e-02,
          3.93190025e-02,   6.64489412e-02,   2.45636122e-03,
         -9.92147561e-01],
       [  4.89526055e-03,  -8.86132998e-01,   5.02453364e-02,
          6.61416050e-01,   1.58461726e-02,   2.36509165e-02,
          2.53542880e-02

In [33]:
gradb2 = np.sum(delta1, axis = 0)
gradb2

array([ -1.72232086,  -0.7875725 ,  -0.85270204,  10.31016395,
        -1.833639  ,  -1.15861095,  -2.16835752,   1.6714993 ,
        -1.72564111,  -1.73281925])

In [34]:
np.dot(h.T, delta1)

array([[-0.52613085, -0.62268624,  0.43884497,  4.09181847, -0.64995845,
        -0.16582412, -1.89031752,  0.99959773, -0.73772147, -0.93762252],
       [-0.66279067,  0.1031362 , -1.21751193,  6.4828994 , -0.74142512,
        -1.27442649, -1.02591716,  0.74413286, -1.78978353, -0.61831356],
       [-1.13782892,  0.01046933, -0.51991248,  4.5627367 , -1.70973455,
        -0.37292589, -0.64686929,  0.39405758, -0.03104718, -0.54894531],
       [-0.71363026, -0.58342972, -0.76193301,  5.80063587, -1.89332424,
        -0.54384718, -0.65094798,  0.3148916 ,  0.04329785, -1.01171294],
       [-0.96953153, -1.2300284 , -1.0593119 ,  6.34346645, -1.49802707,
         0.04916734, -0.83669751,  0.63822417, -0.31614489, -1.12111665]])

In [35]:
gradW2_ind = delta1[:, np.newaxis, :] * h[:,:,np.newaxis]
gradW2_ind.shape

(20, 5, 10)

In [36]:
gradW2 = np.sum(gradW2_ind, axis = 0)
gradW2.shape

(5, 10)

In [37]:
np.dot(h.T, delta1) == gradW2

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]], dtype=bool)

In [38]:
deriv_h = sigmoid_grad(h)
deriv_h

array([[ 0.22711488,  0.05319335,  0.06389117,  0.06051832,  0.22807427],
       [ 0.10285861,  0.00481792,  0.22634651,  0.14091671,  0.12333534],
       [ 0.04125389,  0.04795291,  0.00176305,  0.00038147,  0.08037629],
       [ 0.14647056,  0.16174192,  0.0313536 ,  0.05539273,  0.02950779],
       [ 0.24536392,  0.23605148,  0.09488275,  0.15090334,  0.07122657],
       [ 0.03745923,  0.01383145,  0.01621613,  0.05179103,  0.21890234],
       [ 0.22832997,  0.24441441,  0.19011564,  0.00611153,  0.16265998],
       [ 0.00095147,  0.05640323,  0.06240625,  0.01550247,  0.14005443],
       [ 0.03840941,  0.09243839,  0.03944413,  0.03851521,  0.00195944],
       [ 0.24775945,  0.247166  ,  0.24668324,  0.23597477,  0.095073  ],
       [ 0.0397947 ,  0.1854855 ,  0.12625071,  0.24126662,  0.23234088],
       [ 0.03341942,  0.2181262 ,  0.2228408 ,  0.11351771,  0.0120367 ],
       [ 0.08429161,  0.01981502,  0.05053335,  0.00563409,  0.03003383],
       [ 0.00381641,  0.01403985,  0.0

In [39]:
delta2 = np.dot(delta1, W2.T)
delta2

array([[ 1.7712793 ,  1.72181018,  1.04804574,  2.59539242,  0.30027081],
       [-0.38717734, -0.81858559, -0.29766813,  1.59464295,  0.2806725 ],
       [ 1.68982025, -0.32858798,  0.33436432, -1.62785206,  0.66346902],
       [ 0.79469565, -0.82851018, -0.70289134,  3.50054064,  2.35104312],
       [-0.43416908, -0.30244343, -0.76941363,  3.0312389 ,  1.06407829],
       [ 1.00241381,  1.58750073, -1.3832423 ,  2.77466755,  1.60756541],
       [-0.34115627,  1.51830239, -1.19037576,  0.98998395,  1.35322389],
       [-0.6053443 ,  1.40314916, -1.17862441,  1.7256331 ,  1.86613696],
       [-0.4908323 , -0.19306649, -0.76060604,  2.99763821,  1.03301475],
       [ 2.03786985, -0.39355355, -1.2723768 ,  2.35803246,  1.70461157],
       [-0.38542476, -0.13307579, -0.86816605,  3.52531337,  1.2778676 ],
       [ 1.95546041, -0.55010853, -1.16627262,  1.85682613,  1.53876558],
       [-0.58553889,  1.41203874, -0.66554446, -1.12582744,  0.30339072],
       [ 0.02526722, -0.8462359 ,  0.7

In [40]:
gradh = np.dot(delta1, W2.T)
gradh

array([[ 1.7712793 ,  1.72181018,  1.04804574,  2.59539242,  0.30027081],
       [-0.38717734, -0.81858559, -0.29766813,  1.59464295,  0.2806725 ],
       [ 1.68982025, -0.32858798,  0.33436432, -1.62785206,  0.66346902],
       [ 0.79469565, -0.82851018, -0.70289134,  3.50054064,  2.35104312],
       [-0.43416908, -0.30244343, -0.76941363,  3.0312389 ,  1.06407829],
       [ 1.00241381,  1.58750073, -1.3832423 ,  2.77466755,  1.60756541],
       [-0.34115627,  1.51830239, -1.19037576,  0.98998395,  1.35322389],
       [-0.6053443 ,  1.40314916, -1.17862441,  1.7256331 ,  1.86613696],
       [-0.4908323 , -0.19306649, -0.76060604,  2.99763821,  1.03301475],
       [ 2.03786985, -0.39355355, -1.2723768 ,  2.35803246,  1.70461157],
       [-0.38542476, -0.13307579, -0.86816605,  3.52531337,  1.2778676 ],
       [ 1.95546041, -0.55010853, -1.16627262,  1.85682613,  1.53876558],
       [-0.58553889,  1.41203874, -0.66554446, -1.12582744,  0.30339072],
       [ 0.02526722, -0.8462359 ,  0.7

In [41]:
delta2 == gradh

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]], dtype=bool)

In [383]:
gradh1 = gradh*h*(1-h)
gradh1

array([[  1.48071759e-01,   9.55729780e-02,  -2.66658595e-01,
          9.37784106e-02,   1.64516141e-01],
       [  1.62488419e-01,   1.66875971e-02,   1.49560832e-02,
         -1.56445360e-01,   1.99732956e-01],
       [ -3.29737337e-04,   5.37456001e-03,  -1.35904135e-02,
          6.64663203e-02,   2.91223710e-03],
       [  1.97160852e-02,  -3.78621301e-01,   1.83601544e-01,
          2.76843910e-02,  -3.14226368e-01],
       [  2.27513496e-03,   2.39200417e-01,  -1.05325366e-01,
         -5.14312370e-04,   7.16550931e-03],
       [  2.57738791e-03,  -2.48028907e-04,  -2.63148151e-01,
         -1.68596301e-03,   3.26676068e-01],
       [  4.83028688e-03,  -5.29988720e-01,   7.72780579e-02,
         -1.15345025e-01,   3.69435128e-01],
       [  4.40423364e-03,  -3.29075189e-01,   1.85867239e-01,
          2.09182895e-01,  -3.60189294e-03],
       [  2.87445030e-04,  -2.01897520e-02,   3.76773997e-01,
         -2.99468662e-01,   2.18053005e-01],
       [  5.04111025e-02,  -4.7531511

In [407]:
gradb1_ind = delta2 * deriv_h
gradb1_ind

array([[  1.48071759e-01,   9.55729780e-02,  -2.66658595e-01,
          9.37784106e-02,   1.64516141e-01],
       [  1.62488419e-01,   1.66875971e-02,   1.49560832e-02,
         -1.56445360e-01,   1.99732956e-01],
       [ -3.29737337e-04,   5.37456001e-03,  -1.35904135e-02,
          6.64663203e-02,   2.91223710e-03],
       [  1.97160852e-02,  -3.78621301e-01,   1.83601544e-01,
          2.76843910e-02,  -3.14226368e-01],
       [  2.27513496e-03,   2.39200417e-01,  -1.05325366e-01,
         -5.14312370e-04,   7.16550931e-03],
       [  2.57738791e-03,  -2.48028907e-04,  -2.63148151e-01,
         -1.68596301e-03,   3.26676068e-01],
       [  4.83028688e-03,  -5.29988720e-01,   7.72780579e-02,
         -1.15345025e-01,   3.69435128e-01],
       [  4.40423364e-03,  -3.29075189e-01,   1.85867239e-01,
          2.09182895e-01,  -3.60189294e-03],
       [  2.87445030e-04,  -2.01897520e-02,   3.76773997e-01,
         -2.99468662e-01,   2.18053005e-01],
       [  5.04111025e-02,  -4.7531511

In [414]:
gradb1_ind == gradh1

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [False, False,  True, False, False],
       [ True, False, False, False,  True],
       [ True, False,  True,  True, False],
       [False,  True,  True, False, False],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True, False,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True, False, False],
       [ True,  True,  True,  True,  True],
       [ True, False,  True,  True, False],
       [ True,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True, False,  True,  True],
       [False,  True,  True, False,  True],
       [ True,  True, False,  True, False],
       [ True,  True,  True,  True,  True],
       [ True, False,  True,  True,  True]], dtype=bool)

In [408]:
gradh1_b1 = np.sum(gradh1, axis = 0)
gradh1_b1

array([ 0.59509274, -2.2731381 ,  0.66052198, -0.24933812,  2.29484855])

In [409]:
gradb1 = np.sum(gradb1_ind, axis = 0)
gradb1

array([ 0.59509274, -2.2731381 ,  0.66052198, -0.24933812,  2.29484855])

In [415]:
gradb1 - gradh1_b1

array([  0.00000000e+00,  -4.44089210e-16,  -1.11022302e-16,
        -5.55111512e-17,   0.00000000e+00])

In [425]:
gradh1_W1 = data.T.dot(gradh1)
gradh1_W1

array([[ 0.12934467,  0.48020855, -1.22965084, -0.65266667,  1.11047267],
       [-0.19279542,  0.03195569,  0.13028115, -0.22112702,  2.20196808],
       [ 0.06666173,  0.99740924,  0.27814503, -1.0423232 ,  0.30003253],
       [-0.20766192,  0.8332635 ,  1.1494671 ,  0.85845471,  0.31393749],
       [-0.05583822, -2.02946745,  0.9331629 , -0.50124165,  0.16785525],
       [-0.19394407,  1.33556591,  0.07274817,  0.80388024, -3.14253137],
       [ 0.43135939,  0.81887804, -0.92100688,  0.15638053,  0.73948405],
       [ 0.24485145, -1.44851982,  0.63376201, -0.50476703,  0.65857226],
       [ 0.25952593, -0.76546294,  0.13977428, -1.03290992,  1.82769238],
       [-0.17550237, -1.01193104,  0.86607282, -0.3781074 ,  0.07650125]])

In [525]:
delta2*deriv_h - gradb1_ind

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [541]:
gradW1_ind = gradb1_ind[:, np.newaxis, :] * data[:, :, np.newaxis]
gradW1_ind

array([[[  1.46406038e-01,   9.44978378e-02,  -2.63658842e-01,
           9.27234583e-02,   1.62665430e-01],
        [ -1.90917431e-01,  -1.23227735e-01,   3.43818256e-01,
          -1.20913896e-01,  -2.12120118e-01],
        [  7.55359102e-02,   4.87546845e-02,  -1.36030664e-01,
           4.78392210e-02,   8.39246897e-02],
        [  7.90107950e-02,   5.09975503e-02,  -1.42288494e-01,
           5.00399727e-02,   8.77854842e-02],
        [ -1.43030187e-01,  -9.23188936e-02,   2.57579360e-01,
          -9.05854279e-02,  -1.58914668e-01],
        [ -1.10641750e-01,  -7.14137631e-02,   1.99251861e-01,
          -7.00728316e-02,  -1.22929273e-01],
        [  4.77327446e-02,   3.08091197e-02,  -8.59606634e-02,
           3.02306190e-02,   5.30337924e-02],
        [ -1.31486961e-02,  -8.48683135e-03,   2.36791463e-02,
          -8.32747468e-03,  -1.46089488e-02],
        [  1.04593374e-01,   6.75098365e-02,  -1.88359498e-01,
           6.62422088e-02,   1.16209184e-01],
        [  1.553177

In [542]:
gradW1 = np.sum(gradW1_ind, axis = 0)
gradW1.shape

(10, 5)

In [543]:
gradW1 - gradh1_W1

array([[ -2.77555756e-17,   0.00000000e+00,   2.22044605e-16,
         -1.11022302e-16,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,  -2.77555756e-17,
          0.00000000e+00,   0.00000000e+00],
       [  1.38777878e-17,   0.00000000e+00,  -1.11022302e-16,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   1.11022302e-16,  -4.44089210e-16,
          0.00000000e+00,   0.00000000e+00],
       [ -1.38777878e-17,   0.00000000e+00,   0.00000000e+00,
          2.22044605e-16,   5.55111512e-17],
       [  0.00000000e+00,  -4.44089210e-16,  -1.38777878e-16,
          0.00000000e+00,   4.44089210e-16],
       [  0.00000000e+00,   1.11022302e-16,   2.22044605e-16,
         -2.77555756e-17,   0.00000000e+00],
       [  2.77555756e-17,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   2.77555756e-17,
         -2.22044605e-16,   0.00000000e+00],
       [  0.00000000e+00,   0.0000000

### Test the forward backward propagation implementation