### q2 sigmoid

In [1]:
# q2_sigmoid
import numpy as np

In [13]:
def sigmoid(x):
    x = 1. / (1 + np.exp(-x))
    return x

In [14]:
def sigmoid_grad(f):
    f = f * (1 - f)
    return f

### q1 softmax

In [None]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x
    
    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.
    You might find numpy functions np.exp, np.sum, np.reshape, np.max,
    and numpy broadcasting usefull for this task
    
    You should also make sure that your code works for one dimensional inputs
    
    """
    if len(x.shape) > 1:
        tmp = np.max(x, axis = 1)
        x -= tmp.reshape((x.shape[0], 1))
        x = np.exp(x)
        tmp = np.sum(x, axis = 1)
        x /= tmp.reshape((x.shape[0], 1))
    else:
        tmp = np.max(x)
        x -= tmp
        x = np.exp(x)
        tmp = np.sum(x)
        x /= tmp
    return x

### q2 gradient

In [18]:
def gradcheck_naive(f, x):
    """
    Gradient check for a function f
    -f should be a function that takes a single argument and outputs the cost
    and its gradients
    -x is the point (numpy array) to check the gradient at"""
    
    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)
    h = 1e-4
    
    # Iterate over all indexed in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        
        ### try modifying x[ix] with h defined above to compute numerical gradients
        ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it
        ### possible to test cost functions with built in randomness later
        ### 
        old_xix = x[ix]
        x[ix] = old_xix + h
        random.setstate(rndstate)
        fp = f(x)[0]
        x[ix] = old_xix - h
        random.setstate(rndstate)
        fm = f(x)[0]
        x[ix] = old_xix
        
        numgrad = (fp - fm)/(2 * h)
        
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" %(grad[ix], numgrad))
            
            return
    it.iternext()
print("Gradient check passed!")

Gradient check passed!


In [19]:
def grad_numerical(f, x, h=1e-4):
    """
    Gradient check for a function f
    -f: should be a function that takes a single argument and outputs the cost
    and its gradients
    -x: is the point (numpy array) to check the gradient at 
    -h: is the size of the shift for all dimensions"""
    
    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)
    num_grad = np.zeros(x.shape)
    
    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        
        ### try modifying x[ix] with h defined above to compute numerical gradient
        ### make sure you callrandom.setstate(rndstate) before calling f(x) each time, this will make it
        ### possible to test cost functions with built in randomness later
        old_xix = x[ix]
        x[ix] += 0.5 * h
        random.setstate(rndstate)
        fp = f(x)[0]
        x[ix] -= h
        random.setstate(rndstate)
        fm = f(x)[0]
        x[ix] = old_xix
        
        num_grad += (fp - fm)/h
        it.iternext()
    return num_grad

In [20]:
def eval_numerical_gradient_array(f, x, df, h = 1e-5):
    # Evaluate a numeric for a function that accepts a numpy array and returns an numpy array
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        
        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval
        
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad



### q2 neural

In [4]:
import numpy as np
import random

In [5]:
def affine_forward(x, w, b):
    """
    Computer the forward pass for an affine (fully-connected) layer
    The input x has shape (N, d_1, , ... d_k) and contains a
    minibatch of N examples, where each example x[i] has shape (d_1, ..., d_k).
    We will reshape each input into a vector of dimension D = d_1 * ... * d_k and
    then transform it to an output vector of dimension M.
    
    Inputs:
    x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
    w: a numpy array of weights, of shape (D, M)
    b: a numpy array of biases, of shape (M, )
    
    Returns a tuple of:
    out: output, of shape (N, M)
    cache: (x, w, b)
    """
    out = None
    N = x.shape[0]
    D = np.prod(x.shape[1:])
    M = b.shape[1]
    out = np.dot(x.reshape(N, D), w.reshape(D, M)) + b.reshape(1, M)
    
    return out, (x, w, b)
    

In [9]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[8, 8, 8]])
out = a + b

In [10]:
print(out)

[[ 9 10 11]
 [12 13 14]]


In [12]:
def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer
    
    Inputs:
    -dout: Upstream derivative, of shape (N, M)
    -cache: Tuple of:
    -x: Input data, of shape (N, d_1, ..., d_k)
    -w: Weights, of shape (D, M)
    
    Returns a tuple of:
    -dx: Gradient with respect to x, of shape (N, d1, ... d_k)
    -dw: Gradient with respect to w, of shape (D, M)
    -db: Gradient with respect to b, of shape (M,)"""
    
    x, w, b = cache
    dx, dw, db = None, None, None
    N = x.shape[0]
    D = np.prod(x.shape[1:])
    M = b.shape[1]
    
    dx = np.dot(dout, w.reshape(D, M).T).reshape(x.shape) #DxN
    dw = np.dot(x.reshape(N, D).T, dout).reshape(w.shape) #MxD
    db = np.sum(dout, axis=0) #1xM
    
    return dx, dw, db

In [15]:
def sigmoid_forward(x):
    """
    Computes the forward pass for a sigmoid activation.
    Inputs:
    -x: Input data, numpy array of arbitary shape
    
    Returns a tuple (out, cache)
    -out: output of the same shape as x
    -cache: identical to out; required for backpropagation
    """
    
    return sigmoid(x), sigmoid(x)

In [16]:
def sigmoid_backward(dout, cache):
    """
    Computes the backward pass for an sigmoid layer.
    
    Inputs:
    -dout: Upstream derivative, same shape as the input to the 
    sigmoid layer (x)
    -cache: sigmoid(x)
    Returns a tuple of:
    -dx: back propagated gradient with respect to x
    """
    
    x = cache
    return sigmoid_grad(x) * dout

In [17]:
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network
    
    Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters.
    """
    
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    N = data.shape[0]
    
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))
    
    # forward propagation
    hidden = np.dot(data, W1) + b1
    layer1_a = sigmoid(hidden)
    layer2 = np.dot(layer1_a, W2) + b2
    
    # need to calculate the softmax loss
    probs = softmax(layer2)
    costs = -np.sum(np.log(probs[np.arange(N), np.argmax(labels, axis=1)]))
    
    # backward propagation
    # There is no regularization
    # dx -> sigmoid -> w2 * layer1_a+ b -> sigmoid -> W1 * data + b1 ->
    dx = probs.copy()
    dx -= labels
    
    dlayer2 = np.zeros_like(dx)
    gradW2 = np.zeros_like(W2)
    gradW1 = np.zeros_like(W1)
    gradb2 = np.zeros_like(b2)
    gradb1 = np.zeros_like(b1)
    
    gradw2 = np.dot(layer1_a.T, dx)
    gradb2 = np.sum(dx, axis=0)
    dlayer2 = np.dot(dw, W2.T)
    dlayer1 = sigmoid_grad(layer1_a) * dlayer2
    gradW1 = np.dot(data.T, dlayer1)
    gradb1 = np.sum(dlayer1, axis = 0)
    
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad

In [None]:
def sanity_check():
    """
    Set up fake data and parameters for the neural network, and test using
    gradcheck."""
    print("Running sanity check...")
    N = 300
    dimensions = [10, 5, 10]
    data = np.random