In [1]:
"""This is my first attempt to put together a ConvNet without using TensorFlow"""

import numpy as np
import h5py
import matplotlib.pyplot as plt
import matplotlib.image as Image
from random import randint
import json

#%matplotlib inline
#plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
#plt.rcParams['image.interpolation'] = 'nearest'
#plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(4)

In [2]:
def yToVector(y, C=6):
    # convert the Y value into a vector of dimension C x 1
    m = y.shape[0]
    y = np.eye(6)[y].T.reshape(C,m) 
    #each row of eye matrix corresponds to the row vector of Y and [Y} specify which row to take
    #and then transpose it to a column vector and reshape it
    return y

In [3]:
def loadData ():
    # Training data
    #read for h5 datafile and convert into dictionary
    train_dataset = h5py.File('convdata/train_signs.h5', 'r')
    #for keys in test_dataset: print(keys) #print all the keys from the dict var
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_x_orig = train_set_x_orig/255 #normalize X
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels
    train_set_y_orig = yToVector(train_set_y_orig) #vectorize y
    list_classes = np.array(train_dataset["list_classes"][:]) # your train set labels  
    
    # Test Data
    test_dataset = h5py.File('convdata/test_signs.h5', 'r')
    #for keys in test_dataset: print(keys) #print all the keys from the dict var
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your train set features
    test_set_x_orig = test_set_x_orig/255 #normalize X
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your train set labels
    test_set_y_orig = yToVector(test_set_y_orig) #vectorize y 
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, list_classes

In [4]:
def showFilters(params, key='CN_W1'):
    
    p = params[key]
    
    fig, axarr = plt.subplots(1, p.shape[3],  figsize=(16, 12))

    for i in range(p.shape[3]):
        pgray = np.zeros([p.shape[0], p.shape[1], p.shape[2]])
        pgray = rgb2gray(p[:,:,:,i])
        axarr[i].set_title(key)
        axarr[i].imshow(pgray, cmap = plt.get_cmap('gray'))
    plt.show()
    
    return

In [5]:
def rgb2gray(rgb):

    gray = np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

    return gray

In [6]:
def showImage(imageArr, title = ''):
    
    plt.title(title)
    plt.imshow(imageArr,)

    plt.show()
    return

In [7]:
def padToKeepDim(inDim, fDim, stride=1):
    outdim = inDim
    p = int((((outdim - 1) * stride) + fDim - inDim)/2)
    return p

In [8]:
def calOutDim(inDim, fDim, padding, stride):

    outDim = int((inDim + 2*padding - fDim)/stride) +  1
    
    return outDim

In [9]:
def addPadding(arr, p):
    m, h, w, c = arr.shape
    
    padded = np.zeros([m, (h + 2*p) , (w + 2*p), c])
    padded[ :, p:(p+h), p:(p+w), :] = arr[:,:,:,:]

    return padded

In [10]:
def conv_single_step(a_slice_prev, W, b):

    ### START CODE HERE ### (≈ 2 lines of code)
    # Element-wise product between a_slice and W. Do not add the bias yet.
    s = W * a_slice_prev
    # Sum over all entries of the volume s.
    Z = np.sum(s)
    # Add bias b to Z. Cast b to a float() so that Z results in a scalar value.
    Z = float(Z + b)
    ### END CODE HERE ###

    return Z

In [11]:
def conv_forward(A_prev, W, b, hparameters):

    ### START CODE HERE ###
    # Retrieve dimensions from A_prev's shape (≈1 line)  
    (m, nH_prev, nW_prev, nC_prev) = A_prev.shape
    
    # Retrieve dimensions from W's shape (≈1 line)
    (f, f, nC_prev, noOfFilters) = W.shape
    
    # b would be in shape (1,1,1,noOfFilters)
    
    # Retrieve information from "hparameters" 
    s = hparameters['stride']
    p = hparameters['pad']
    
    # Compute the dimensions of the CONV output volume 
    nH = calOutDim(nH_prev, f, p, s)
    nW = calOutDim(nW_prev, f, p, s)
    
    # Initialize the output volume Z with zeros. 
    Z = np.zeros([m, nH, nW, noOfFilters])
    
    # Create A_prev_pad by padding A_prev
    A_prev_pad = addPadding(A_prev, p)
    
    for i in range(m):                  # loop over the batch of training examples
        a_prev_pad = A_prev_pad[i,:,:,:]    # Select ith training example's padded activation
        #print("conv forward: " +  str(i+1) + "th training data, out of " + str(m), end='\r', flush=True)
        for h in range(nH):                 # loop over vertical axis of the output volume
            for w in range(nW):             # loop over horizontal axis of the output volume
                for c in range(noOfFilters):        # loop over channels (= #filters) of the output volume
                    
                    # Use the corners to define the (3D) slice of a_prev_pad 
                    a_slice_prev = a_prev_pad[h*s : h*s + f, w*s : w*s + f ,:]
                    
                    # For each slice, convolve the slice with the each filter W and b
                    # ie fill the output matrix across filter using each slice
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[:,:,:,c], b[:,:,:,c])
                                        
    
    # Making sure your output shape is correct
    assert(Z.shape == (m, nH, nW, noOfFilters))
    
    # Save information in "cache" for the backprop
    cache = (A_prev, W, b, hparameters)
    
    return Z, cache

In [12]:
def pool_forward(Z, hparameters, mode = "max"):

    
    # Retrieve dimensions from the input shape
    (m, nH_prev, nW_prev, nC_prev) = Z.shape
    
    # Retrieve hyperparameters from "hparameters"
    f = hparameters["f"]
    s = hparameters["stride"]
    
    # Define the dimensions of the output
    nH = calOutDim(nH_prev, f, 0, s)
    nW = calOutDim(nW_prev, f, 0, s)
    nC = nC_prev
    
    # Initialize output matrix A
    A = np.zeros((m, nH, nW, nC))              
    
    ### START CODE HERE ###
    for i in range(m):                         # loop over the training examples
        #print("pool forward: " +  str(i+1) + "th training data, out of " + str(m), end='\r', flush=True)
        for h in range(nH):                     # loop on the vertical axis of the output volume
            for w in range(nW):                 # loop on the horizontal axis of the output volume
                for c in range (nC):            # loop over the channels of the output volume
                    
                    # Use the corners to define the current slice on the ith training example of A_prev, channel c.
                    z_prev_slice = Z[i,h*s:h*s + f, w*s:w*s + f,c]
                    
                    # Compute the pooling operation on the slice. Use an if statment to differentiate the modes. 
                    # Use np.max/np.mean.
                    if mode == "max":
                        A[i, h, w, c] = np.max(z_prev_slice)
                    elif mode == "average":
                        A[i, h, w, c] = np.mean(z_prev_slice)
    
    # Store the input and hparameters in "cache" for pool_backward()
    cache = (Z, hparameters)
    
    # Making sure your output shape is correct
    assert(A.shape == (m, nH, nW, nC))
    
    return A, cache

In [13]:
def conv_backward(dZ, cache):
    """
    Take dZ which is m, nH, nW, nC_prev and calculate:
    dA (for the slice) m, f, f, nC_prev by adding W along nH, nW and for each corresponding slot, muliply by  
    corresponding right dZ
    dW same calculation except adding up the right slice of A_prev instead of W, then for each one 
    muliply by the corresponding dZ
    dB just add up all the dZ to form a scalar
    """
    
    # Retrieve information from "cache"
    (A_prev, W, b, hparameters) = cache
    
    # Retrieve dimensions from A_prev's shape
    (m, nH_prev, nW_prev, nC_prev) = A_prev.shape
    
    # Retrieve dimensions from W's shape
    (f, f, nC_prev, noOfFilters) = W.shape
    
    # Retrieve information from "hparameters"
    s = hparameters['stride']
    pad = hparameters['pad']
    
    # Retrieve dimensions from dZ's shape
    (m, n_H, n_W, n_C) = dZ.shape
    
    # Initialize dA_prev, dW, db with the correct shapes
    dA_prev = np.zeros(list(A_prev.shape))                     
    dW = np.zeros(list(W.shape))   
    db = np.zeros(list(b.shape) )  

    # Pad A_prev and dA_prev
    A_prev_pad = addPadding(A_prev, pad)
    dA_prev_pad = addPadding(dA_prev, pad)
    
    for i in range(m):                       # loop over the training examples
        
        # select ith training example from A_prev_pad and dA_prev_pad
        a_prev_pad = A_prev_pad[i,:,:,:]
        da_prev_pad = dA_prev_pad[i,:,:,:]
        
        #print("conv backward: " +  str(i+1) + "th training data, out of " + str(m), end='\r', flush=True)
        
        for h in range(n_H):                   # loop over vertical axis of the output volume
            for w in range(n_W):               # loop over horizontal axis of the output volume
                for c in range(n_C):           # loop over the channels of the output volume
                    
                    # Use the corners to define the slice from a_prev_pad
                    a_slice = a_prev_pad[h*s:h*s+f,w*s: w*s+f,:]

                    # Update gradients for the window and the filter's parameters using the code formulas given above
                    #scan out the da
                    da_prev_pad[h*s : h*s+f, w*s : w*s+f, :] += W[:,:,:,c]*dZ[i,h,w,c]
                    #repeatedly darken the dW
                    dW[:,:,:,c] += a_slice*dZ[i,h,w,c]  #note that DZ[i,h,w,c] is a scalar
                    #add all up
                    db[:,:,:,c] += dZ[i,h,w,c] #broadcast single value to 1,1,1
                    
        # Set the ith training example's dA_prev to the unpaded da_prev_pad (Hint: use X[pad:-pad, pad:-pad, :])
        dA_prev[i, :, :, :] = da_prev_pad[pad:-pad,pad:-pad,:] #unpad -pad is padth index from the row/col end

    
    # Making sure your output shape is correct
    assert(dA_prev.shape == A_prev.shape)
    
    return dA_prev, dW, db

In [14]:
def create_mask_from_window(x):
    #take the previous input layer to identify which "slot" the backward prop should be allocated to
    
    mask = (x == np.max(x))
    
    return mask

In [15]:
def distribute_value(da, shape):
    """
    distribute the value of da to the corresponding slice of Z and pass backward
    """
    
    ### START CODE HERE ###
    # Retrieve dimensions from shape (≈1 line)
    (nH, nW) = shape
    
    # Compute the value to distribute on the matrix (≈1 line)
    average = 1/(nH * nW)
    
    # Create a matrix where every entry is the "average" value (≈1 line)
    z = np.ones([nH, nW])*da*average
    
    return z

In [16]:
def pool_backward(dA, cache, mode = "max"):
    #backward prop for the pooling layer: max or average
    
    # Retrieve information from cache for identifying slots to pass the backward prop
    (Z, hparameters) = cache
    
    # Retrieve hyperparameters from "hparameters" to "locate" previous slice
    s = hparameters['stride']
    f = hparameters['f']
    
    # Retrieve dimensions from A_prev's shape and dA's shape (≈2 lines)
    m, nH_Z, nW_Z, nC_Z = Z.shape
    m, nH, nW, nC = dA.shape
    
    # Initialize dA_prev with zeros (≈1 line)
    dZ = np.zeros([m, nH_Z, nW_Z, nC_Z])
    
    for i in range(m):                       # loop over the training examples
        
        # select training example from A_prev
        z = Z[i,:,:,:]
        
        #print("pool backward: " +  str(i+1) + "th training data, out of " + str(m), end='\r', flush=True)
        
        for h in range(nH):                   # loop on the vertical axis of the current layer
            for w in range(nW):               # loop on the horizontal axis of the current layer
                for c in range(nC):           # loop over the channels of the current layer
                    
                    
                    # Compute the backward propagation in both modes.
                    if mode == "max":
                        
                        # Use the corners and "c" to define the current slice from a_prev (≈1 line)
                        z_slice = z[h*s : h*s+f, w*s : w*s+f, c]
                        # Create the mask from a_prev_slice (≈1 line)
                        mask = create_mask_from_window(z_slice)
                        #mask =  mask/np.sum(mask)  #just in case there are more than one maxima
                        # Set dA_prev to be dA_prev + (the mask multiplied by the correct entry of dA) (≈1 line)
                        dZ[i, h*s: h*s+f, w*s: w*s+f, c] += dA[i,h,w,c]*mask
                        
                    elif mode == "average":
                        
                        # Get the value a from dA (≈1 line)
                        da = dA[i,h,w,c] #get the current slow of dA
                        # Define the shape of the filter as fxf (≈1 line)
                        shape = (f, f)
                        # Distribute it to get the correct slice of dZ. i.e. Add the distributed value of da. (≈1 line)
                        dZ[i, h*s: h*s+f, w*s: w*s+f, c] += distribute_value(da, shape)
                        
    
    # Making sure your output shape is correct
    assert(dZ.shape == Z.shape)
    
    return dZ

In [17]:
def loadSavedParams():
    
    params = {}
    
    with open('convNet.json', 'r') as infile:
        param_data = json.load(infile)
    
    for key in param_data:
        if isinstance(param_data[key], list):
            params[key] = np.asarray(param_data[key])
        else:
            params[key] = param_data[key]
            
    return params

In [18]:
def init_params(inDim, FCLayers_dim, hparams):
    params = {}
    smooth_grad = hparams['smooth_grad']
    save2File = hparams['save2File']
    
    if save2File == True:
        params = loadSavedParams()
    else:
        if not ('CN_W1'in params):
            keepDims = True
            f1 = 5
            s1 = 1
            nc1 = 3
            noOfFilter1 = 6
            p1 = None
            if keepDims == True: #override the value of p1 if KeepDims == True
                p1 = padToKeepDim(inDim, f1, s1)
                z1Dim = inDim
            else:
                z1Dim = calOutDim(inDim, f1, p1, s1)
            params['CN_W1'] = np.random.randn(f1,f1,nc1,noOfFilter1)
            params['CN_b1'] = np.zeros([1,1,1,noOfFilter1])
            params['hparam_conv1'] = {'stride': s1, 'pad': p1}
            params['hparam_pool1'] = {'stride': 2, 'f': 2}
            a1Dim = calOutDim(z1Dim, params['hparam_pool1']['f'], 0, params['hparam_pool1']['stride'])
    
        if not ('CN_W2'in params):
            keepDims = True
            f2 = 3
            s2 = 1
            nc2 = noOfFilter1
            noOfFilter2 = 16
            p2 = None
            if keepDims == True:
                p2 = padToKeepDim(a1Dim, f2, s2)
                z2Dim = a1Dim
            else:
                z2Dim = calOutDim(a1Dim, f2, p2, s2)
            params['CN_W2'] = np.random.randn(f2,f2,nc2,noOfFilter2)
            params['CN_b2'] = np.zeros([1,1,1,noOfFilter2])
            params['hparam_conv2'] = {'stride': s2, 'pad': p2}
            params['hparam_pool2'] = {'stride': 2, 'f': 2}
            a2Dim = calOutDim(z2Dim, params['hparam_pool2']['f'], 0, params['hparam_pool2']['stride'])
        
            ### FC layers dimension ###
            FCLayers_dim[0] = a2Dim*a2Dim*noOfFilter2
        
        for l in range(1, len(FCLayers_dim)):
            if smooth_grad == True:
                smooth_gradient_adj = np.sqrt(2/FCLayers_dim[l-1])    # to avoid vanishing or exploding grads
            else:
                smooth_gradient_adj = 1
            params['FC_W' + str(l)] = np.random.randn(FCLayers_dim[l],FCLayers_dim[l-1]) * smooth_gradient_adj
            params['FC_b' + str(l)] = np.zeros([FCLayers_dim[l],1])
    
    return params, FCLayers_dim

In [19]:
def linear_forward(A, W, b):
    
    Z = np.dot(W,A) + b
    
    assert(Z.shape == (W.shape[0], A.shape[1]))
    
    cache = (A, W, b)           #linear_cache
    
    return Z, cache

In [20]:
def relu(Z):
    a = np.maximum(0,Z)
    
    return a, Z

In [21]:
def softmax_forward(Z):
    
    Zshift = Z - np.max(Z)
    t = np.exp(Zshift)
    a = np.divide(t, (np.sum(t, axis=0, keepdims=True)))
    
    return a, Z

In [22]:
def flattenArray(arr):
    m = arr.shape[0]
    n = arr.shape[1]*arr.shape[2]*arr.shape[3]
    a = np.zeros([m, n])
    for i in range(m):
        a[i,:] = arr[i,:,:,:].reshape(n,)
    return a

In [23]:
def L_model_forward(X, FCLayers_dim, params):
    """    
    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
                every cache of linear_relu_forward() (there are L-1 of them, indexed from 0 to L-2)
                the cache of linear_sigmoid_forward() (there is one, indexed L-1)
    """

    FC_caches = []
    CN_caches = ()
    L = len(FCLayers_dim)                  # number of layers in the neural network
    
    #Conv Forward
    Z1, cache_conv1 = conv_forward(X, params['CN_W1'], params['CN_b1'], params['hparam_conv1'])
    A1, cache_pool1 = pool_forward(Z1,params['hparam_pool1'], "max")
    Z2, cache_conv2 = conv_forward(A1, params['CN_W2'], params['CN_b2'], params['hparam_conv2'])
    A2, cache_pool2 = pool_forward(Z2, params['hparam_pool2'], "max")
    
    CN_caches = (cache_conv1, cache_pool1, cache_conv2, cache_pool2, A2.shape)
    
    A_unrolled = unrollMatrix(A2)
    
    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    A = A_unrolled
    for l in range(1, L - 1):
        A_prev = A
                
        A, FC_cache = linear_activation_forward(A_prev, params['FC_W' + str(l)], params['FC_b' + str(l)], "relu")
        
        FC_caches.append(FC_cache)
    
    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    
    AL, FC_cache = linear_activation_forward(A, params['FC_W' + str(L-1)], params['FC_b' + str(L-1)], "sigmoid")   
    
    FC_caches.append(FC_cache)          # (linear_cache, z_activation_cache) 
        
    assert(AL.shape == (params['FC_W' + str(L-1)].shape[0],X.shape[0]))
    
    return AL, FC_caches, CN_caches

In [24]:
def unrollMatrix(A):
    m = A.shape[0]
    unrolledA = A.transpose(1,2,3,0).reshape(-1,m)
    return unrolledA

In [25]:
def linear_activation_forward(A_prev, W, b, activation):

    ####def linear_activation_forward(A_prev, W, b, activation):    
    
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)    #linear cache : (A, W, b)   
        A, z_activation_cache = softmax_forward(Z)

    
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)    #linear cache : (A, W, b)
        A, z_activation_cache = relu(Z)
            

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    
    FC_cache = (linear_cache, z_activation_cache)         #linear_cache is A, W, b, activation_cache is Z

    return A, FC_cache
    

In [26]:
def compute_cost(AL, Y, FCLayers_dim, params, _lambda):
    """
    Implement the cost function defined by equation (7).

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    
    m = Y.shape[1]
    Y = np.array(Y, dtype=float)     # to avoid division by zero
    SumSqW = 0                       # for regularization
    
    # Compute loss from aL and y.

    #cost = (1/m)*np.sum(-(Y*np.log(AL)+(1-Y)*np.log(1-AL)))
    
    if np.sum(AL <= 0) > 0:                    #check if there is any instances, true = 1
        AL[AL <= 0] = 1e-7
        print("AL below zeros detected")
        
    if np.sum(AL >= 1.0) > 0:
        sub = 1.0 - 1e-7
        AL[AL >= 1.0] = sub      #make it just slightly smaller than 1
        print("(1 - AL) below zeros detected")
  
    logprobs = np.multiply(-np.log(AL),Y) + np.multiply(-np.log(1 - AL), 1 - Y)

    
    ### Regularization ###
    L = len(FCLayers_dim)
    
    for l in range(L-1): 
        SumSqW = SumSqW + np.sum(np.square(params["FC_W" + str(l + 1)]))
        L2_reg = (1./(2 * m)) * _lambda * SumSqW
    
    cost = 1./m * np.sum(logprobs) + L2_reg
        
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())
    
    return cost

In [27]:
def FC_model_backward(AL, Y, FCLayers_dim, FC_caches):

    grads = {}
    L = len(FCLayers_dim) - 1 # the number of layers
    
    m = AL.shape[1] # A or Z retains the dimension of number of training examples m
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    noOfClass = Y.shape[0]
    
    dAL = -1 * np.divide( Y, AL ) 
    
    current_cache = FC_caches[L-1]   # contains of linear cache (A, W, b,) and activation cache (Z)
    
    ### first backpropagation :-> sigmoid
    grads["FC_dA" + str(L-1)], grads["FC_dW" + str(L)], grads["FC_db" + str(L)] = linear_activation_backward(dAL, current_cache, 'sigmoid')
    
    for l in reversed(range(L-1)):

        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["FC_dA" + str(l+1)], FC_caches[l] , 'relu')
        
        grads["FC_dA" + str(l)] = dA_prev_temp
        grads["FC_dW" + str(l+1)] = dW_temp
        grads["FC_db" + str(l+1)] = db_temp
    
    return grads


In [28]:
def relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    #dJ_dZ = dJ_dA * dA_dZ
    #dA_dZ = 0 when Z <=0
    #dA_dZ = 1 when Z > 0
    #dJ_dZ = 0 when z <=0; = dJ_dA when Z > 0
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0 
    
    assert (dZ.shape == Z.shape)
    
    return dZ


In [29]:
def softmax_backward_extended(dA, cache):
    
    Z = cache    
    a, _ = softmax_forward(Z)
    needVerify = False
    useForLoop = False
    
    assert(Z.shape == dA.shape)
    
    noOfClass, m = Z.shape
    dJdZ = np.zeros([noOfClass, m])
    
    for k in range(m):
        dAdZMatrix = np.zeros((noOfClass, noOfClass))
        
        if useForLoop == False:
            dAdZMatrix = -np.outer(a[:, k], a[:, k]) + np.diag(a[:, k].flatten())
        else:
            dAdZ_forLoop = np.zeros((noOfClass, noOfClass))
        
            for i in range(noOfClass):
                for j in range(noOfClass):
                    dAdZ_forLoop[i, j] = a[i, k] * ((i == j) - a[j, k])
            
            dAdZMatrix = dAdZ_forLoop
        
        if needVerify == True and useForLoop == True:
            if (np.sum(dAdZ_forLoop) - np.sum(dAdZMatrix)) > 1e-15:
                print("difference between dAdZ_forLoop and Matrix is too big")
        
            assert(dAdZMatrix.shape ==  dAdZ_forLoop.shape)
        
        assert(dAdZMatrix.shape == (noOfClass,noOfClass))

        new_vector = np.sum ( (dA[:,k].reshape(noOfClass,1) * dAdZMatrix).T, axis=1, keepdims=True)
    
        if k == 0:
            dJdZMatrix = new_vector

        else:
            dJdZMatrix = np.concatenate((dJdZMatrix, new_vector), axis=1)
    
    #hardcoded answer
    dJdZa = a + dA*a
    
    #if np.sum(dJdZMatrix) - np.sum(dJdZa) > 1e-10:
        #print("difference between dJdZMatrix and hardcode calculation is too big")
    
    return dJdZMatrix

In [30]:
def linear_backward(dZ, cache):

    A_prev, W, b = cache
    m = A_prev.shape[1]
       
    dW = 1./m * np.dot(dZ, A_prev.T)   
    dA_prev = np.dot(W.T, dZ)    
        
    db = 1./m * np.sum(dZ, axis=1, keepdims=True)
    assert (db.shape == b.shape)
        
        
    assert (dW.shape == W.shape)
    assert (dA_prev.shape == A_prev.shape)
    
    return dA_prev, dW, db

In [31]:
def update_params(m, FCLayers_dim, params, grads, alpha, _lambda):
    
    for l in range(2 ):
        
        params["CN_W" + str(l+1)] = params["CN_W" + str(l+1)] - alpha * (grads["CN_dW" + str(l+1)]  
                                                 + (params["CN_W" + str(l+1)] * (_lambda/m))) 
        params["CN_b" + str(l+1)] = params["CN_b" + str(l+1)] - alpha * grads["CN_db" + str(l+1)]
    
    L = len(FCLayers_dim)
    
    for l in range(L-1): 

        params["FC_W" + str(l+1)] = params["FC_W" + str(l+1)] - alpha * (grads["FC_dW" + str(l+1)] 
                                                                         + (params["FC_W" + str(l+1)] * (_lambda/m)) )  
        params["FC_b" + str(l+1)] = params["FC_b" + str(l+1)] - alpha * grads["FC_db" + str(l+1)]
                
    return params


In [32]:
def linear_activation_backward(dA, cache, activation):
    
    linear_cache, activation_cache = cache

    if activation == "relu":

        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    elif activation == "sigmoid":
        
        dZ = softmax_backward_extended(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
                
    return dA_prev, dW, db
    

In [33]:
def dict2vector(_dict, skip_term, include_term = "_"):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.
    """
    len_no_term = sum(1 for i in _dict if skip_term not in i and include_term in i)  # find the length of vector without hparam*
    
    no_term_key_labels = np.array(range(len_no_term*2), dtype='U8').reshape(len_no_term,2)
    
    count = 0
    
    dict_cache = {}
    
    for key, value in sorted(_dict.items()):
        
        if skip_term not in key and include_term in key:
        
            #Storing key names and dimenson
            no_term_key_labels[count, 0] = key
            
            shape_str = ""
            for i in range(len(value.shape)):
                shape_str = shape_str + str(value.shape[i])
                if i < (len(value.shape) - 1):
                    shape_str = shape_str + "-"
            no_term_key_labels[count,1] = shape_str

        
            #storing a N x 1 dimensional value vector
            new_vector = np.reshape(_dict[key], (-1,1))
        
            if count == 0:
                dict_values = new_vector

            else:
                dict_values = np.concatenate((dict_values, new_vector), axis=0)
                
            count = count + 1
        else:
            dict_cache[key] = value
            
        
    return no_term_key_labels, dict_values, dict_cache


In [34]:
def vector2dict(keys_labels, param_values, params_cache):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.
    """
    last_index = 0
    params = {}
    
    for i in range(keys_labels.shape[0]):
        
        key = keys_labels[i][0]
        dimlist = keys_labels[i][1].split('-')
        
        index_length = 1
        this_shape = ()
        
        for i in range(len(dimlist)):
            index_length = index_length * int(dimlist[i])
            this_shape = this_shape + (int(dimlist[i]),)
        
        temp_array = param_values[last_index:last_index+index_length,0]
        
        temp_array = temp_array.reshape(*this_shape)
        params[key] = temp_array
        
        last_index = last_index + index_length
        
    for k in params_cache:
        params[k] = params_cache[k]


    return params

In [35]:
def gradient_check_n(params, grads, X, Y, FCLayers_dim):

    
    # Set-up variables
    epsilon = 1e-7
    printdiff = True
    
    no_hparam_key_labels, param_values, params_cache = dict2vector(params, "hparam", "CN_W2")
    no_dA_grad_labels, no_dA_grad_values, grads_cache = dict2vector(grads, "dA", "CN_dW2")
    #no_hparam_key_labels, param_values, params_cache = dict2vector(params, "hparam")
    #no_dA_grad_labels, no_dA_grad_values, grads_cache = dict2vector(grads, "dA")

    num_parameters = param_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))
    
    assert (no_dA_grad_values.shape == param_values.shape)
    
    # Compute gradapprox
    for i in range(num_parameters):
        #print("Testing " + str(i) + "th parameter...")
        
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".

        thetaplus = np.copy(param_values)                           # Step 1
        
        if True:
        
            thetaplus[i][0] = thetaplus[i][0] + epsilon                 # Step 2

            updated_params_plus = vector2dict(no_hparam_key_labels, thetaplus, params_cache)
        
            AL_plus, _ , _ = L_model_forward(X, FCLayers_dim, updated_params_plus)
            J_plus[i] = compute_cost(AL_plus, Y, FCLayers_dim, params, 0.0)     # Step 3


            # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".

            thetaminus = np.copy(param_values)                          # Step 1
            thetaminus[i][0] = thetaminus[i][0] - epsilon               # Step 2        

            updated_params_minus = vector2dict(no_hparam_key_labels, thetaminus, params_cache)
            
            AL_minus, _ , _ = L_model_forward(X, FCLayers_dim, updated_params_minus)            
            J_minus[i] = compute_cost(AL_minus, Y, FCLayers_dim, params, 0.0)   # Step 3

        
            # Compute gradapprox[i]
            gradapprox[i] = (J_plus[i] - J_minus[i])/(2 * epsilon)
    
            # Compare gradapprox to backward propagation gradients by computing difference.

            numerator = np.linalg.norm(no_dA_grad_values[i] - gradapprox[i])                          # Step 1'
            denominator = np.linalg.norm(no_dA_grad_values[i]) + np.linalg.norm(gradapprox[i])        # Step 2'
            difference = np.divide(numerator, denominator)                                            # Step 3'

            if printdiff == True:
                if difference > 1e-7:
                    print ("\033[93m" + "Gradient Check on " + str(i) 
                           + "th param: backward Prop error! difference = " + str(difference) + "\033[0m")
                    print("grad value: " + str(no_dA_grad_values[i]) + "; grad approx: " + str(gradapprox[i]))
                    #subprocess.call(["afplay", "beep-08b.wav"])
                else:
                    print ("\033[92m" + "Gradient Check on " + str(i) 
                           + "th param: Backward Prop OKAY! difference = " + str(difference) + "\033[0m")
    
    return difference

In [36]:
def saveParams(params):
    
    toSaveParam = {}
    
    for key in params:
        if isinstance(params[key], np.ndarray):
            toSaveParam[key] = params[key].tolist()
        else:
            toSaveParam[key] = params[key]
    
    with open('convNet.json', 'w') as outfile:
        json.dump(toSaveParam, outfile)
    return

In [37]:
def showOutcome(X, Y, outcome, noOfexamples = 5, showWrongOnly = True):

    wrongSlot = (np.argmax(outcome, axis=0) != np.argmax(Y, axis=0) )*1 # find the slot where prediction is wrong
    posWrongSlot = list(np.where(wrongSlot==1)[0])
         
    fig, axarr = plt.subplots(1, noOfexamples, figsize=(16, 12))
    
    for p in range(noOfexamples):
        pos = randint(0,len(posWrongSlot)-1)
        i = posWrongSlot[pos]
        t = "Y: " + str(np.argmax(Y[:,i])) + " ; predict: " + str(np.argmax(outcome[:,i]))

        axarr[p].set_title(t,  fontsize=14)
        axarr[p].imshow(X[i,:,:,:])
 
    plt.show()
    
    return

In [38]:
def calAccuracy(AL, Y):
    
    accuracy = (sum(np.argmax(Y, axis=0) == np.argmax(AL, axis=0))/Y.shape[1]) * 100
    
    return accuracy

In [39]:
def make_prediction(X, Y, params):
        
    #Conv and FC Forward
    AL, _ , _ = L_model_forward(X, FCLayers_dim, params)
    accuracy = calAccuracy(AL, Y)
    
    print("Training Accuracy on this data set is: " + str(accuracy) + "%.")
    
    return AL

In [40]:
def class_model_run(X, Y, params, hparams):

    _lambda = hparams['_lambda']
    alpha = hparams['alpha']
    noOfIterations = hparams['noOfIterations']
    checkGradient = hparams['checkGradient']

    for i in range(noOfIterations):
        #start of one epoch
        
        #Conv and FC Forward
        AL, FC_caches, CN_caches = L_model_forward(X, FCLayers_dim, params)

        #Cost compute    
        cost = compute_cost(AL, Y, FCLayers_dim, params, _lambda)
        print("Cost: " + str(cost) + " after " + str(i+1) + " iterations")
        
        cache_conv1, cache_pool1, cache_conv2, cache_pool2, A2shape = CN_caches
        
        # FC backward
        grads = FC_model_backward(AL, Y, FCLayers_dim, FC_caches)
        
        # Undo the unrolling
        grads['FC_dA0'] = rollMatrix(grads['FC_dA0'], A2shape)
        
        #Conv backward
        CN_dZ2 = pool_backward(grads['FC_dA0'], cache_pool2, mode = "max")
        #print(np.sum(CN_dZ2 == 0))
        grads['CN_dA1'], grads['CN_dW2'], grads['CN_db2'] = conv_backward(CN_dZ2, cache_conv2)
        
        CN_dZ1 = pool_backward(grads['CN_dA1'], cache_pool1, mode = "max")
        grads['CN_dA0'], grads['CN_dW1'], grads['CN_db1'] = conv_backward(CN_dZ1, cache_conv1)
        
        assert(grads['CN_dA0'].shape == X.shape)
        
        if checkGradient == True and (i+1)%1 == 0:
            diff = gradient_check_n(params, grads, X, Y, FCLayers_dim)
        
        #update params
        params = update_params(X.shape[0], FCLayers_dim, params, grads, alpha, _lambda)
    
        # end of one epoch
    return params, grads

In [41]:
def rollMatrix(theVector, A2shape):

    rolledMatrix = theVector.transpose(1,0).reshape(A2shape[0],A2shape[1],A2shape[2],A2shape[3])
    
    assert(rolledMatrix.shape == A2shape)
    
    return rolledMatrix

In [42]:
train_x_orig, train_y_orig, test_x_orig, test_y_orig, allClasses = loadData()
minibatch_end = 10
X = train_x_orig[0:minibatch_end,:]
Y = train_y_orig[:,0:minibatch_end]
#X = train_x_orig
#Y = train_y_orig
FCLayers_dim = [ None, 120, 84, len(allClasses)]

hparams = {}
hparams['_lambda'] = 0
hparams['alpha'] = 0.0001
hparams['noOfIterations'] = 2
hparams['checkGradient'] = True
hparams['save2File'] = True
hparams['smooth_grad'] = True
hparams['showImage'] = True

params, FCLayers_dim = init_params(X.shape[1], FCLayers_dim, hparams)

params, grads = class_model_run(X, Y, params, hparams)

if hparams['save2File'] == True:
    saveParams(params)

outcome = make_prediction(X, Y, params)

if hparams['showImage'] == True:
    showOutcome(X, Y, outcome, showWrongOnly = True)
    showFilters(params, 'CN_W1')
    showFilters(params, 'CN_W2')


Cost: 0.222235487949 after 1 iterations
[93mGradient Check on 0th param: backward Prop error! difference = 0.702464486309[0m
grad value: [ 0.18421144]; grad approx: [ 0.03219418]
[93mGradient Check on 1th param: backward Prop error! difference = 0.683889899905[0m
grad value: [-0.07126445]; grad approx: [-0.0133782]
[93mGradient Check on 2th param: backward Prop error! difference = 0.698566810156[0m
grad value: [-0.48326544]; grad approx: [-0.08576186]
[93mGradient Check on 3th param: backward Prop error! difference = 0.708369048035[0m
grad value: [ 0.12963974]; grad approx: [ 0.02213044]
[93mGradient Check on 4th param: backward Prop error! difference = 0.729037191503[0m
grad value: [-0.04511126]; grad approx: [-0.00706953]
[93mGradient Check on 5th param: backward Prop error! difference = 0.718123579567[0m
grad value: [ 0.3332183]; grad approx: [ 0.054668]
[93mGradient Check on 6th param: backward Prop error! difference = 0.696591092344[0m
grad value: [ 0.35442797]; grad

KeyboardInterrupt: 

In [None]:
#########################################

TypeError: 'float' object cannot be interpreted as an integer