In [1]:
#Background:
#I am a newbie at machine learning and I am using this notebook to try out the different methods/skills that I have learned.
# Most of the option and parameter changes can be made at the bottom of the notebook by changing the parameters in
# global_var.
# The basic purpose is to classify images of the digits 0-9 into 10 different classes.
# Here are some of the features available in this notebook: normalizing data, separating training and testing data,
# batch normalization, softmax, gradient checking, gradient descent / Adam / momentum methods
# for updating parameters, dropout, L2 regularization, plot_graph to show cost/gradient changes, etc.

#Key learnings:
# [X_train.shape[0],40,20,5,1] 
# batchnorm = True, grad = adams, iter=1000, alpha = 0.01, lambda = 0 train acc = 100% test = 92.6%


#- Gradient checks work better if other techniques like momentum/RMS are not enabled
#- "division by zero in log" is caught and worked around by adding an epsilon to (1 - AL) if it is <= 0.
#   By doing so, at least the cost function does not become nan
#- it seems that the cost function is more prone to exploding gradients if alpha is larger
#- if mini-batches are not used, the value (benefit) of Adam is not certain

In [2]:
# -*- coding: utf-8 -*-
import numpy as np
import scipy.io
import math
import matplotlib.pyplot as plt
import subprocess
import time
import json
from PIL import Image

In [3]:
def normal_array(a):
    """
    Standardize every row of a 2-D array to zero mean and (approximately) unit variance.

    Arguments:
    a -- 2-D array; statistics are taken along axis 1, i.e. one mean/variance per row

    Returns:
    a_norm -- array of the same shape as `a`: (a - row_mean) / sqrt(row_var + 1e-8)
    """
    rows, cols = a.shape
    eps = 1e-8                  # keeps the division finite for constant rows
    scale = 1. / cols

    # per-row mean, kept as a column vector so that it broadcasts over the columns
    row_mean = scale * np.sum(a, axis=1).reshape(rows, 1)
    centered = a - row_mean

    # per-row (population) variance of the centered data
    row_var = scale * np.sum(np.power(centered, 2), axis=1).reshape(rows, 1)

    inv_std = 1. / np.sqrt(row_var + eps)
    normalized = centered * inv_std

    assert normalized.shape == a.shape

    return normalized


In [4]:
def showrandomimage(X, Y, Ypredict, showWrongOnly = False):
    """
    Display one randomly chosen column of X as a square grayscale image and
    print its predicted and true label.

    Arguments:
    X -- image data, one flattened image per column; each column is reshaped to
         (dim, dim) with dim = int(sqrt(X.shape[0])) and transposed before display
    Y -- true labels, indexed as Y[:, i] (assumed shape (1, m) -- TODO confirm with callers)
    Ypredict -- predicted labels holding the same number of elements as Y
    showWrongOnly -- when True, only examples where Y != Ypredict are candidates

    Returns:
    None. When there is nothing to show (X has no columns, or nothing is
    misclassified while showWrongOnly is True) a message is printed and no
    figure is drawn, instead of crashing inside np.random.randint(0, 0).
    """
    dim = int(np.sqrt(X.shape[0]))

    if showWrongOnly == True:
        # bring the predictions to Y's layout once, so that the comparison and
        # the column selection below operate on the same shape
        Ypredict = Ypredict.reshape(Y.shape)
        wrong_cols = np.where(Y != Ypredict)[1]
        Y = Y[:, wrong_cols]
        X = X[:, wrong_cols]
        Ypredict = Ypredict[:, wrong_cols]

    # np.random.randint(0, 0) raises ValueError, so bail out early when empty
    if X.shape[1] == 0:
        print("No image to show")
        return

    i = np.random.randint(0, X.shape[1])

    print("Prediction: " + str(Ypredict[:,i]) + " ; Y: " + str(Y[:,i]))

    arr = X[:,i].reshape(dim,dim).T
    plt.imshow(arr, cmap='gray')
    plt.show()

    return

In [5]:
def load_data(train_data_ratio):
    """
    Load 'ex4data1.mat' from the working directory and split its examples
    randomly into a training set and a dev set.

    The 'X' and 'y' entries of the .mat file are transposed, so that examples
    are stored column-wise in everything this function returns.

    Arguments:
    train_data_ratio -- fraction of the examples that goes into the training set
                        (the count is round(m * train_data_ratio))

    Returns:
    X_train, Y_train, X_dev, Y_dev -- column subsets of the transposed data; the
    same random permutation is used for X and Y, so columns stay paired.

    Note: the split uses NumPy's global random state (np.random.permutation).
    """
    dataset = scipy.io.loadmat('ex4data1.mat')
    features = np.array(dataset['X']).T
    labels = np.array(dataset['y']).T

    m = labels.shape[1]
    train_size = round(m * train_data_ratio)

    # one shuffled ordering of all column indices: the head is used for
    # training and whatever is left over for dev
    order = np.random.permutation(m)
    train_cols = order[:train_size]
    dev_cols = order[train_size:]

    X_train = features[:, train_cols]
    Y_train = labels[:, train_cols]
    X_dev = features[:, dev_cols]
    Y_dev = labels[:, dev_cols]

    assert(X_train.shape[1] + X_dev.shape[1] == m)

    return X_train, Y_train, X_dev, Y_dev

In [6]:
# GRADED FUNCTION: random_mini_batches

def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Creates a list of random minibatches from (X, Y)

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- labels, of shape (number of label rows, number of examples); a single
         row (1, m) as well as a multi-row / one-hot layout (classes, m) is supported
    mini_batch_size -- size of the mini-batches, positive integer
    seed -- value passed to np.random.seed; note that this reseeds NumPy's
            GLOBAL random state as a side effect

    Returns:
    mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y) tuples; all
                    batches hold mini_batch_size columns except possibly the last
                    one, which holds the remainder

    Raises:
    ValueError -- if mini_batch_size is smaller than 1
    """
    if mini_batch_size < 1:
        raise ValueError("mini_batch_size must be a positive integer")

    np.random.seed(seed)
    m = X.shape[1]                  # number of training examples

    # Step 1: shuffle the columns of X and Y with the same permutation
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    # keep however many label rows Y has (the former reshape((1, m)) failed
    # for one-hot labels)
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0], m))

    # Step 2: partition. Slicing clamps at m, so the final (possibly shorter)
    # batch needs no special case.
    mini_batches = []
    for start in range(0, m, mini_batch_size):
        stop = start + mini_batch_size
        mini_batches.append((shuffled_X[:, start:stop], shuffled_Y[:, start:stop]))

    return mini_batches

In [7]:
def prepareSoftMaxY(Y_train, numberOfClasses): 
    """
    Build a one-hot label matrix from a row of integer class labels.

    Arguments:
    Y_train -- integer labels; row 0 is read, one label per column. The label 10
               is mapped to class 0.
    numberOfClasses -- number of rows of the one-hot matrix

    Returns:
    Y_all_class -- float array of shape (numberOfClasses, Y_train.shape[1]) with
                   a 1 at [label, column] and 0 elsewhere

    Side effect:
    Y_train itself is modified in place (every 10 becomes 0); the caller's array
    sees that change.
    """
    m = Y_train.shape[1]
    one_hot = np.zeros([numberOfClasses, m])

    # in-place relabel: this writes into the caller's array
    Y_train[Y_train == 10] = 0

    # one assignment per column: row index = class label, column index = example
    one_hot[Y_train[0, :], np.arange(m)] = 1

    return one_hot

In [8]:
def initialize_parameters_deep(layer_dims, global_var):
    """
    Create the trainable parameters of the network together with zero-filled
    accumulators of matching shapes for momentum and RMS style updates.

    Arguments:
    layer_dims -- list of layer sizes; entry 0 is the input size
    global_var -- settings dict; only global_var['batchNorm'] is read here

    Returns:
    parameters -- dict with 'W<l>' of shape (layer_dims[l], layer_dims[l-1]),
                  drawn from a standard normal and scaled by sqrt(2 / layer_dims[l-1]);
                  plus, with batch norm, 'G<l>' (filled with that same scale) and
                  'B<l>' (zeros), otherwise 'b<l>' (zeros) -- all of shape (layer_dims[l], 1)
    momentumGrad -- dict of zero arrays keyed 'dW<l>' and 'dG<l>'/'dB<l>' or 'db<l>'
    RMSGrad -- dict with the same keys and shapes as momentumGrad
    """
    use_batch_norm = global_var['batchNorm']
    parameters, momentumGrad, RMSGrad = {}, {}, {}

    for l in range(1, len(layer_dims)):
        idx = str(l)
        fan_out, fan_in = layer_dims[l], layer_dims[l-1]

        # scale that keeps the variance of the activations roughly constant
        # from layer to layer (guards against vanishing/exploding gradients)
        init_scale = np.sqrt(2/fan_in)

        parameters['W' + idx] = np.random.randn(fan_out, fan_in) * init_scale
        momentumGrad['dW' + idx] = np.zeros([fan_out, fan_in])
        RMSGrad['dW' + idx] = np.zeros([fan_out, fan_in])

        if use_batch_norm == True:
            # gamma / beta of the batch-norm layer replace the bias
            parameters['G' + idx] = np.ones([fan_out, 1]) * init_scale
            parameters['B' + idx] = np.zeros([fan_out, 1])
            column_grad_keys = ('dG' + idx, 'dB' + idx)
        else:
            parameters['b' + idx] = np.zeros([fan_out, 1])
            column_grad_keys = ('db' + idx,)

        # accumulators for the per-unit (column vector) parameters
        for accumulator in (momentumGrad, RMSGrad):
            for key in column_grad_keys:
                accumulator[key] = np.zeros([fan_out, 1])

        assert(parameters['W' + idx].shape == (fan_out, fan_in))

    return parameters, momentumGrad, RMSGrad

In [9]:
def plot_graph(cost_array, stitle):
    """
    Plot a 1-D series (e.g. the cost per iteration) as a line chart.

    Arguments:
    cost_array -- array of values to plot; cost_array.shape[0] points are drawn
                  against their index 0..n-1
    stitle -- title of the chart

    Returns:
    None (the figure is shown with plt.show()).
    """
    # The figure has to exist BEFORE anything is drawn. Calling plt.figure()
    # after plotting (as was done previously) opens a second, empty figure and
    # leaves the actual plot at the default size.
    fig = plt.figure(figsize=(5, 5), dpi=100)
    ax = fig.add_subplot(111)

    n_points = cost_array.shape[0]

    ax.plot(np.arange(0, n_points), cost_array, '-')
    ax.set_title(stitle)

    plt.show()
    return

In [10]:
def linear_forward(A, W, b):
    """
    Linear (affine) part of a layer's forward pass: Z = W . A + b.

    Arguments:
    A -- activations of the previous layer (or the input data); one example per column
    W -- weight matrix with W.shape[1] == A.shape[0]
    b -- bias; anything that broadcasts against W . A (callers in this notebook
         pass the scalar 0 when batch norm is used)

    Returns:
    Z -- pre-activation values, shape (W.shape[0], A.shape[1])
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    linear_cache = (A, W, b)

    Z = np.dot(W, A) + b
    expected_shape = (W.shape[0], A.shape[1])
    assert(Z.shape == expected_shape)

    return Z, linear_cache

In [11]:
def relu(Z):
    """
    Rectified linear unit, applied element-wise.

    Arguments:
    Z -- pre-activation values

    Returns:
    a -- max(0, Z) element-wise
    Z -- the input itself, returned as the activation cache for the backward pass
    """
    activated = np.maximum(0, Z)
    activation_cache = Z

    return activated, activation_cache

In [12]:
def softmax_forward(Z):
    """
    Column-wise softmax.

    Arguments:
    Z -- pre-activation values, shape (number of classes, number of examples)

    Returns:
    a -- softmax of every column of Z; each column sums to 1
    Z -- the input itself, returned as the activation cache for the backward pass
    """
    # Softmax is invariant to a per-column shift. Subtracting every column's OWN
    # maximum keeps the largest exponent at exp(0) = 1, so no column can
    # underflow to an all-zero row of exponentials (which produced 0/0 = nan
    # when a single global maximum was subtracted).
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exps = np.exp(shifted)
    a = np.divide(exps, np.sum(exps, axis=0, keepdims=True))

    return a, Z

In [13]:
def sigmoid(Z):
    """
    Logistic sigmoid, applied element-wise.

    Arguments:
    Z -- pre-activation values

    Returns:
    a -- 1 / (1 + exp(-Z)); for very large |Z| this can round to exactly 0 or 1
    Z -- the input itself, returned as the activation cache for the backward pass
    """
    denominator = 1 + np.exp(-Z)
    activated = 1. / denominator

    return activated, Z

In [14]:
def batchnorm_forward(Z, G, B):    
    """
    Obsolete batch-norm forward pass.

    Normalizes every row of Z over its columns and applies the scale G and the
    shift B:  Z_tda = G * (Z - mean) / sqrt(var + 1e-8) + B.

    Arguments:
    Z -- pre-activation values, shape (n, m); statistics are taken along axis 1
    G -- scale (gamma), broadcast against Z
    B -- shift (beta), broadcast against Z

    Returns:
    Z_tda -- scaled and shifted normalized values, same shape as Z
    norm_cache -- (Z_norm, G, B, Z_mean, Z_std_iver, Z_var); note that the fourth
                  entry is the row MEAN here, not the centered data
    """
    n_rows, _ = Z.shape
    eps = 1e-8                      # avoids a division by zero for constant rows

    row_mean = np.mean(Z, axis=1).reshape(n_rows, 1)
    centered = Z - row_mean

    row_var = np.mean(centered ** 2, axis=1).reshape(n_rows, 1)
    inv_std = 1. / np.sqrt(row_var + eps)

    normalized = centered * inv_std
    Z_tda = G * normalized + B

    norm_cache = (normalized, G, B, row_mean, inv_std, row_var)

    assert (Z_tda.shape == Z.shape)

    return Z_tda, norm_cache

In [15]:
def batchnorm_forward_computational_graph(Z, G, B):  
    """
    Batch-norm forward pass, written as a chain of elementary steps so that the
    backward pass can walk the same graph in reverse.

    Reference: 
    https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html

    Arguments:
    Z -- pre-activation values, shape (n, m); statistics are taken along axis 1
    G -- scale (gamma), broadcast against Z
    B -- shift (beta), broadcast against Z

    Returns:
    Z_tda -- G * Z_norm + B, same shape as Z
    norm_cache -- (Z_norm, G, B, Z_mean_diff, Z_std_iver, Z_var) where
                  Z_mean_diff = Z - row mean, Z_var is the row variance and
                  Z_std_iver = 1 / sqrt(Z_var + 1e-8)
    """
    _, m = Z.shape
    eps = 1e-8                      # avoids a division by zero for constant rows
    one_over_m = 1. / m

    # step 1: row mean, kept as a column so it broadcasts over the examples
    mean = one_over_m * np.sum(Z, axis=1, keepdims=True)

    # step 2: centered data
    centered = Z - mean

    # steps 3-4: variance = mean of the squared deviations
    variance = one_over_m * np.sum(centered ** 2, axis=1, keepdims=True)

    # steps 5-6: standard deviation and its inverse
    inv_std = 1. / np.sqrt(variance + eps)

    # step 7: normalized values
    normalized = centered * inv_std

    # steps 8-9: scale, then shift
    Z_tda = G * normalized + B

    norm_cache = (normalized, G, B, centered, inv_std, variance)

    assert (Z_tda.shape == Z.shape)

    return Z_tda, norm_cache

In [16]:
def batchnorm_backward_computational_graph(dZ_tda, activation_cache, norm_cache):
    """
    Batch-norm backward pass that walks the forward computational graph in
    reverse, one elementary step at a time.

    Reference: 
    https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html

    Fix: the step through  Z_std_iver = 1 / Z_std  previously multiplied by
    -1 / Z_std_iver**2 (i.e. -Z_std**2). The derivative of 1/x is -1/x**2, which
    expressed with the cached inverse is -(Z_std_iver**2). With the wrong factor
    the result only matched a numerical gradient check when the standard
    deviation happened to be 1.

    Arguments:
    dZ_tda -- gradient of the cost w.r.t. the batch-norm output, shape (n, m)
    activation_cache -- array used only for its shape (must equal dZ_tda.shape)
    norm_cache -- (Z_norm, G, B, Z_mean_diff, Z_std_iver, Z_var) as produced by
                  batchnorm_forward_computational_graph

    Returns:
    dZ -- gradient w.r.t. the batch-norm input, shape (n, m)
    dG -- gradient w.r.t. gamma, shape (n, 1)
    dB -- gradient w.r.t. beta, shape (n, 1)
    """

    Z = activation_cache
    Z_norm, G, B, Z_mean_diff, Z_std_iver, Z_var = norm_cache

    epsilon = 1e-8

    n, m = dZ_tda.shape

    # Z_tda = GammaZ + B
    dGammaZ = dZ_tda * 1
    dB = np.sum(dZ_tda, axis = 1).reshape(n, 1)                               #dJdB

    # GammaZ = G * Z_norm
    dG = np.sum((dGammaZ * Z_norm), axis = 1).reshape(n, 1)                   #dJdG
    dZ_norm = dGammaZ * G                                                     #dJdZ_norm

    # Z_norm = Z_mean_diff * Z_std_iver  (two inputs -> two gradient branches)
    dZ_mean_diff1 = dZ_norm * Z_std_iver                                      #dJdZ_mean_diff (branch 1)
    dZ_std_iver = np.sum((dZ_norm * Z_mean_diff), axis = 1).reshape(n, 1)     #dJdZ_std_iver

    # Z_std_iver = 1 / Z_std   =>   dZ_std_iver/dZ_std = -1 / Z_std**2 = -(Z_std_iver**2)
    dZ_std = -(Z_std_iver ** 2) * dZ_std_iver                                 #dJdZ_std

    # Z_std = sqrt(Z_var + epsilon)   =>   dZ_std/dZ_var = 0.5 / sqrt(Z_var + epsilon)
    dZ_var = (0.5 * (1./ np.sqrt(Z_var + epsilon))) * dZ_std                  #dJdZ_var

    # Z_var = 1/m * sum(Z_sq)
    dZ_sq = (1./m * np.ones(Z.shape)) * dZ_var                                #dJdZ_sq

    # Z_sq = Z_mean_diff ** 2
    dZ_mean_diff2 = (2 * Z_mean_diff) * dZ_sq                                 #dJdZ_mean_diff (branch 2)

    dZ_mean_diff = dZ_mean_diff1 + dZ_mean_diff2

    # Z_mean_diff = Z - Z_mean  (again two branches: directly into Z, and via the mean)
    dZ1 = 1 * dZ_mean_diff
    dZ_mean = np.sum(dZ_mean_diff, axis = 1).reshape(n, 1) * -1               #dJdZ_mean

    # Z_mean = 1/m * sum(Z)
    dZ2 = dZ_mean * 1./m * np.ones(dZ_tda.shape)

    dZ = dZ1 + dZ2

    assert (dZ_tda.shape == dZ.shape)
    assert (dZ.shape == Z.shape)
    assert (dG.shape == (Z.shape[0],1))
    assert (dB.shape == (Z.shape[0],1))

    return dZ, dG, dB

In [17]:
def batchnorm_backward_complicated(dout, activation_cache, norm_cache):
    """
    Batch-norm backward pass in the compact "three paths" form
    (through the variance, through the mean, and directly through the input).

    Reference:
    https://wiseodd.github.io/techblog/2016/07/04/batchnorm/
    Seems that this one works for gradient check
    Last update: 25 Sept 2016 20:00

    Arguments:
    dout -- gradient of the cost w.r.t. the batch-norm output, shape (n, m)
    activation_cache -- array used only for a shape check against dout
    norm_cache -- (X_norm, gamma, beta, Z_mean_diff, Z_std_iver, var); the inverse
                  standard deviation is recomputed from `var` rather than taken
                  from the cache

    Returns:
    dX -- gradient w.r.t. the batch-norm input, shape (n, m)
    dgamma -- gradient w.r.t. gamma, shape (n, 1)
    dbeta -- gradient w.r.t. beta, shape (n, 1)
    """
    X_norm, gamma, _beta, centered, _cached_inv_std, var = norm_cache

    assert(activation_cache.shape == dout.shape)

    n, m = dout.shape

    inv_std = 1. / np.sqrt(var + 1e-8)

    # gradient arriving at the normalized values
    dX_norm = dout * gamma

    # path through the variance
    dvar = np.sum(dX_norm * centered, axis=1).reshape(n, 1) * -.5 * inv_std**3

    # path through the mean (the second term vanishes analytically, because the
    # centered data averages to zero, but it is kept as in the reference)
    dmu = (np.sum(dX_norm * -inv_std, axis=1, keepdims=True)
           + dvar * np.mean(-2. * centered, axis=1, keepdims=True))

    # total gradient w.r.t. the input: direct path + variance path + mean path
    dX = (dX_norm * inv_std) + (dvar * 2 * centered / m) + (dmu / m)

    dgamma = np.sum(dout * X_norm, axis=1, keepdims=True)
    dbeta = np.sum(dout, axis=1, keepdims=True)

    return dX, dgamma, dbeta

In [18]:
def batchnorm_backward_thankGod(dJ_dZtda, activation_cache, norm_cache):
    """
    Batch-norm backward pass, derived by splitting dJ/dZ into three paths.

    With  Ztda = Gamma * Znorm + Beta  and  Znorm = (Z - Zmu) / sqrt(Zvar + eps):
      1) the path through the variance  (dJ_dZvar)
      2) the path through the mean      (dJ_dZmu)
      3) the direct path, treating Zmu and Zvar as constants
    and, using  dZvar/dZ = 2 * (Z - Zmu) / m  and  dZmu/dZ = 1 / m :
      dJ_dZ = dJ_dZnorm / std + dJ_dZvar * 2 * (Z - Zmu) / m + dJ_dZmu / m

    Compared with the earlier version, two intermediate arrays that were
    computed but never used have been removed and 1 / sqrt(Zvar + eps) is
    evaluated once; the returned values are unchanged.

    Arguments:
    dJ_dZtda -- gradient of the cost w.r.t. the batch-norm output, shape (n, m)
    activation_cache -- array used only for a shape check against dJ_dZtda
    norm_cache -- (Znorm, Gamma, Beta, Zmeandiff, Z_std_iver, Zvar) as produced by
                  batchnorm_forward_computational_graph

    Returns:
    dJ_dZ -- gradient w.r.t. the batch-norm input, shape (n, m)
    dJ_dGamma -- gradient w.r.t. Gamma, shape (n, 1)
    dJ_dBeta -- gradient w.r.t. Beta, shape (n, 1)
    """
    Z = activation_cache
    Znorm, Gamma, Beta, Zmeandiff, Z_std_iver, Zvar = norm_cache

    n, m = dJ_dZtda.shape
    eps = 1e-8

    assert(Z.shape == dJ_dZtda.shape)

    inv_std = 1. / np.sqrt(Zvar + eps)                  # (n, 1)

    # Ztda = Gamma * Znorm + Beta
    dJ_dZnorm = dJ_dZtda * Gamma                        # (n, m)

    # 1) path through the variance. The product with Zmeandiff has to be formed
    #    BEFORE summing over the examples: the sum cannot be factored.
    dJ_dZvar = np.sum(dJ_dZnorm * Zmeandiff, axis = 1, keepdims = True) * (-0.5) * (inv_std ** 3)   # (n, 1)

    # 2) path through the mean, including the mean's influence on the variance
    dJ_dZmu = (np.sum(dJ_dZnorm * (-inv_std), axis = 1, keepdims = True)
               + dJ_dZvar * (-2./m) * np.sum(Zmeandiff, axis = 1, keepdims = True))                 # (n, 1)

    # 3) direct path plus the two indirect ones
    dJ_dZ = (dJ_dZnorm * inv_std) + (dJ_dZvar * 2 * Zmeandiff/m) + (dJ_dZmu / m)                    # (n, m)

    dJ_dGamma = np.sum(dJ_dZtda * Znorm, axis=1).reshape(n, 1)
    dJ_dBeta = np.sum(dJ_dZtda, axis=1).reshape(n, 1)

    return dJ_dZ, dJ_dGamma, dJ_dBeta

In [19]:
def linear_activation_forward(A_prev, W, b, activation, global_var, G = 0, B = 0):
    """
    Forward pass of one layer: LINEAR -> (optional batch norm) -> ACTIVATION.

    Arguments:
    A_prev -- activations of the previous layer, one example per column
    W, b -- weights and bias handed to linear_forward
    activation -- "relu" for a hidden layer, "sigmoid" for the output layer
                  (the output layer uses softmax instead of the sigmoid when
                  global_var['useSoftMax'] is not False)
    global_var -- settings dict; reads 'batchNorm', 'dropOut' and, for the
                  "sigmoid" layer, 'useSoftMax'
    G, B -- batch-norm scale and shift; only used when batch norm is on

    Returns:
    A -- the layer's activations, shape (W.shape[0], A_prev.shape[1])
    cache -- (linear_cache, activation_cache); the activation cache is the value
             that was fed into the activation function, i.e. the batch-normalized
             values when batch norm is on
    norm_cache -- returned as a third value only when batch norm is on

    NOTE(review): dropout (rate 0.15, hidden layers with at least 10 units) is
    applied whenever global_var['dropOut'] is True; this function does not look
    at 'isPredict', does not rescale the surviving activations and does not
    store the mask in the cache -- confirm that this is intended.
    """
    use_batch_norm = global_var['batchNorm']
    use_dropout = global_var['dropOut']
    drop_rate = 0.15

    # The linear step and the optional batch norm are shared by both layer kinds
    if activation == "sigmoid" or activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        if use_batch_norm == True:
            pre_activation, norm_cache = batchnorm_forward_computational_graph(Z, G, B)
        else:
            pre_activation = Z

    if activation == "sigmoid":
        # output layer
        if global_var['useSoftMax'] == False:
            A, z_activation_cache = sigmoid(pre_activation)
        else:
            A, z_activation_cache = softmax_forward(pre_activation)

    elif activation == "relu":
        # hidden layer
        A, z_activation_cache = relu(pre_activation)

        if use_dropout == True and A.shape[0] >= 10:
            keep_mask = np.random.rand(A.shape[0], A.shape[1]) > drop_rate
            A = A * keep_mask

    assert (A.shape == (W.shape[0], A_prev.shape[1]))

    cache = (linear_cache, z_activation_cache)

    if use_batch_norm == True:
        return A, cache, norm_cache
    return A, cache
    

In [20]:
def L_model_forward(X, layer_dims, parameters, global_var, i = -1):
    """
    Forward pass through the whole network: [LINEAR -> RELU] for every hidden
    layer, followed by one LINEAR -> "sigmoid" output layer (which
    linear_activation_forward turns into softmax depending on the settings).

    Arguments:
    X -- input data, one example per column
    layer_dims -- list of layer sizes; only its length is used here
    parameters -- dict holding 'W<l>' and either 'G<l>'/'B<l>' (batch norm) or 'b<l>'
    global_var -- settings dict; 'batchNorm' and 'isPredict' are looked up here
                  ('isPredict' has to be present but is not used in this function)
    i -- not used in this function

    Returns:
    AL -- last post-activation value
    caches -- list with one (linear_cache, activation_cache) entry per layer,
              hidden layers first, output layer last
    norm_caches -- returned as a third value only when batch norm is on: one
                   batch-norm cache per layer, in the same order as `caches`
    """
    batchNorm = global_var['batchNorm']
    isPredict = global_var['isPredict']     # looked up only; unused below

    n_layers = len(layer_dims)              # includes the input layer
    caches, norm_caches = [], []
    A = X

    for l in range(1, n_layers):
        # every layer except the last one is a RELU layer
        kind = "relu" if l < n_layers - 1 else "sigmoid"
        W = parameters['W' + str(l)]

        if batchNorm == True:
            # batch norm's beta takes over the role of the bias, hence b = 0
            A, cache, norm_cache = linear_activation_forward(
                A, W, 0, kind, global_var,
                parameters['G' + str(l)], parameters['B' + str(l)])
            norm_caches.append(norm_cache)
        else:
            A, cache = linear_activation_forward(
                A, W, parameters['b' + str(l)], kind, global_var)

        caches.append(cache)

    AL = A

    assert(AL.shape == (parameters['W' + str(n_layers-1)].shape[0],X.shape[1]))

    if batchNorm == True:
        return AL, caches, norm_caches
    return AL, caches

In [21]:
def compute_cost(AL, Y, layer_dims, parameters, lambd, global_var):
    """
    Cross-entropy cost with L2 regularization.

    Arguments:
    AL -- output of the last layer (probabilities), same shape as Y.
          NOTE: AL is clipped IN PLACE to [1e-7, 1 - 1e-7] whenever it contains
          values <= 0 or >= 1, so the caller's array is modified.
    Y -- true labels; (1, m) for the sigmoid output, one-hot (classes, m) for softmax
    layer_dims -- list of layer sizes; weights 'W1' .. 'W<len(layer_dims)-1>' are regularized
    parameters -- dict holding the weight matrices 'W<l>'
    lambd -- L2 regularization strength (0 switches regularization off)
    global_var -- settings dict; global_var['useSoftMax'] selects the loss:
                  False -> binary cross-entropy, anything else -> categorical cross-entropy

    Returns:
    cost -- scalar (0-d array): mean loss over the m examples plus
            lambd / (2 * m) * sum of all squared weights
    """
    m = Y.shape[1]
    Y = np.array(Y, dtype=float)
    tiny = 1e-7

    # Keep both log(AL) and log(1 - AL) finite.
    if np.any(AL <= 0):
        AL[AL <= 0] = tiny
        print("AL below zeros detected")

    if np.any(AL >= 1):
        # ">=" on purpose: AL == 1 gives log(1 - AL) = -inf and, multiplied by
        # (1 - Y) == 0, a nan cost. The previous "AL > 1" left that case unclipped.
        AL[AL >= 1] = 1 - tiny
        print("(1 - AL) below zeros detected")

    if global_var['useSoftMax'] == False:
        logprobs = np.multiply(-np.log(AL), Y) + np.multiply(-np.log(1 - AL), 1 - Y)
    else:
        logprobs = -1 * np.sum(Y * (np.log(AL)), axis=0, keepdims=True)

    ### Regularization ###
    sum_sq_weights = 0
    for l in range(1, len(layer_dims)):
        sum_sq_weights = sum_sq_weights + np.sum(np.square(parameters["W" + str(l)]))

    # computed after the loop, so it is also defined when there is no weight layer
    L2_reg = (1./(2 * m)) * lambd * sum_sq_weights

    cost = 1./m * np.sum(logprobs) + L2_reg

    cost = np.squeeze(cost)      # e.g. turns [[17]] into 17
    assert(cost.shape == ())

    return cost

In [22]:
def relu_backward(dA, cache):
    """
    Backward pass through a RELU unit.

    Since dA/dZ is 1 where Z > 0 and 0 elsewhere, the incoming gradient is
    passed through unchanged for positive Z and zeroed for Z <= 0.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z', the value that was fed into the RELU in the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z (a new array; dA is not modified)
    """
    Z = cache
    inactive = Z <= 0                       # units the RELU switched off

    dZ = np.array(dA, copy=True)            # work on a copy of the incoming gradient
    dZ[inactive] = 0

    assert (dZ.shape == Z.shape)

    return dZ


In [23]:
def sigmoid_backward(dA, cache):
    """
    Backward pass through a SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z', the value that was fed into the sigmoid in the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z: dA * s * (1 - s), s = sigmoid(Z)
    """
    Z = cache

    # the activation is recomputed from Z; its derivative is s * (1 - s)
    s = 1/(1+np.exp(-Z))
    dZ = (dA * s) * (1-s)

    assert (dZ.shape == Z.shape)

    return dZ

In [24]:
def softmax_backward_extended(dA, cache):
    """
    Backward pass through a column-wise softmax for an arbitrary upstream gradient.

    For one example with softmax output a, the Jacobian is
        dA_i/dZ_j = a_i * ((i == j) - a_j)
    so that
        dJ/dZ_j = sum_i dA_i * a_i * ((i == j) - a_j) = a_j * (dA_j - sum_i dA_i * a_i).
    This closed form is evaluated for all examples at once; it replaces the former
    per-example construction of the full Jacobian followed by a column-by-column
    np.concatenate, and also returns an empty result for zero examples instead
    of failing.

    Arguments:
    dA -- gradient of the cost w.r.t. the softmax output, shape (classes, m)
    cache -- 'Z', the value that was fed into the softmax, same shape as dA

    Returns:
    dJdZ -- gradient of the cost w.r.t. Z, shape (classes, m)

    Side effect:
    Prints a warning when the summed result differs from the summed shortcut
    a + dA * a; the shortcut equals the true gradient when dA = -Y / a with
    one-hot Y (the cross-entropy case).
    """
    Z = cache
    assert(Z.shape == dA.shape)

    a, _ = softmax_forward(Z)

    # Jacobian-vector product for every column at once
    weighted_sum = np.sum(dA * a, axis=0, keepdims=True)       # (1, m)
    dJdZ = a * (dA - weighted_sum)                             # (classes, m)

    # sanity check against the cross-entropy shortcut
    dJdZa = a + dA*a

    if np.sum(dJdZ) - np.sum(dJdZa) > 1e-10:
        print("difference between dJdZMatrix and hardcode calculation is too big")

    return dJdZ

In [25]:
def softmax_backward(dA, cache):
    """
    Shortcut backward pass through a column-wise softmax.

    Returns a + dA * a with a = softmax(Z). When dA = -Y / a and every column of
    Y sums to 1 (one-hot labels with the cross-entropy loss) this equals a - Y,
    the gradient of the cost w.r.t. Z. It is NOT a general softmax derivative
    for other upstream gradients.

    Arguments:
    dA -- gradient of the cost w.r.t. the softmax output, shape (classes, m)
    cache -- 'Z', the value that was fed into the softmax, same shape as dA

    Returns:
    dZ -- a + dA * a, same shape as dA
    """
    Z = cache

    # Subtract each column's maximum before exponentiating: the softmax value is
    # unchanged, but exp() can no longer overflow to inf (inf / inf = nan).
    t = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    a = t / np.sum(t, axis=0, keepdims=True)

    dZ = a + dA * a

    assert (dZ.shape == dA.shape)

    return dZ

In [26]:
def linear_backward(dZ, cache, batchNorm):
    """
    Backward pass through the linear step Z = W . A_prev + b of one layer.

    Arguments:
    dZ -- gradient of the cost w.r.t. Z, shape (layer size, m)
    cache -- (A_prev, W, b) as stored by linear_forward
    batchNorm -- True: no bias gradient is computed (batch norm's beta replaces
                 the bias); False: the bias gradient db is computed as well

    Returns:
    dA_prev -- gradient w.r.t. the previous layer's activations, same shape as A_prev
    dW -- gradient w.r.t. W, averaged over the m examples, same shape as W
    db -- only when batchNorm is False: gradient w.r.t. b, averaged over the examples
    """
    A_prev, W, b = cache
    inv_m = 1. / A_prev.shape[1]

    dW = inv_m * np.dot(dZ, A_prev.T)
    dA_prev = np.dot(W.T, dZ)

    if batchNorm == False:
        db = inv_m * np.sum(dZ, axis=1, keepdims=True)
        assert (db.shape == b.shape)

    assert (dW.shape == W.shape)
    assert (dA_prev.shape == A_prev.shape)

    if batchNorm == True:
        return dA_prev, dW
    return dA_prev, dW, db

In [27]:
def linear_activation_backward(dA, cache, norm_cache, activation, batchNorm):
    """
    Backward pass of one layer: ACTIVATION -> (optional batch norm) -> LINEAR.

    Arguments:
    dA -- gradient of the cost w.r.t. this layer's activations
    cache -- (linear_cache, activation_cache) stored by the forward pass
    norm_cache -- batch-norm cache of this layer (ignored without batch norm)
    activation -- "relu" or "sigmoid" (the latter is treated as softmax when the
                  'useSoftMax' setting is not False)
    batchNorm -- the settings dict with the keys 'batchNorm', 'batchNormBackMethod'
                 and 'useSoftMax'. The callers in this notebook pass the settings
                 dict in this position, and it is now actually used. For backward
                 compatibility any non-dict value makes the function fall back to
                 the notebook-level `global_var`, which is what was always read
                 before, regardless of this argument.

    Returns:
    with batch norm:    dA_prev, dW, dG, dB
    without batch norm: dA_prev, dW, db

    Raises:
    ValueError -- for an unknown `activation` (previously a NameError surfaced at
                  the return statement)
    """
    settings = batchNorm if isinstance(batchNorm, dict) else global_var

    use_batch_norm = settings['batchNorm']
    batchNormBackMethod = settings['batchNormBackMethod']
    useSoftMax = settings['useSoftMax']

    if activation != "relu" and activation != "sigmoid":
        raise ValueError("activation must be 'relu' or 'sigmoid', got " + repr(activation))

    linear_cache, activation_cache = cache

    # 1) back through the activation function
    if activation == "relu":
        d_pre_activation = relu_backward(dA, activation_cache)
    elif useSoftMax == False:
        d_pre_activation = sigmoid_backward(dA, activation_cache)
    else:
        d_pre_activation = softmax_backward_extended(dA, activation_cache)

    # 2) back through batch norm (if any), 3) back through the linear step
    if use_batch_norm == True:
        if batchNormBackMethod == "abstract":
            dZ, dG, dB = batchnorm_backward_thankGod(d_pre_activation, activation_cache, norm_cache)
        else:
            dZ, dG, dB = batchnorm_backward_computational_graph(d_pre_activation, activation_cache, norm_cache)

        dA_prev, dW = linear_backward(dZ, linear_cache, use_batch_norm)
        return dA_prev, dW, dG, dB

    dA_prev, dW, db = linear_backward(d_pre_activation, linear_cache, use_batch_norm)
    return dA_prev, dW, db
    

In [28]:
def dictionary_to_vector_custom(parameters):
    """
    Roll a dictionary of 2-D parameter arrays into a single column vector.

    Arguments:
    parameters -- dict mapping a name to a 2-D array

    Returns:
    keys_labels -- string array (dtype 'U8') of shape (number of entries, 3); each
                   row holds [name, number of rows, number of columns] of one entry,
                   in sorted name order. Being 'U8', names longer than 8 characters
                   are truncated.
    param_values -- (N, 1) array with every entry flattened row by row and stacked
                    in the same sorted order; shape (0, 1) for an empty dictionary
                    (which previously raised an UnboundLocalError)
    """
    names = sorted(parameters)

    keys_labels = np.empty((len(names), 3), dtype='U8')
    columns = []

    for row, key in enumerate(names):
        value = parameters[key]

        # name and dimensions, needed to unroll the vector again
        keys_labels[row, 0] = key
        keys_labels[row, 1] = value.shape[0]
        keys_labels[row, 2] = value.shape[1]

        columns.append(np.reshape(value, (-1, 1)))

    # a single concatenate instead of growing the vector inside the loop
    if columns:
        param_values = np.concatenate(columns, axis=0)
    else:
        param_values = np.zeros((0, 1))

    return keys_labels, param_values


In [29]:
def vector_to_dictionary_custom(keys_labels, param_values):
    """
    Unroll a column vector back into a dictionary of 2-D arrays.

    Arguments:
    keys_labels -- array whose rows are [name, number of rows, number of columns];
                   the two dimensions are converted with int()
    param_values -- array of shape (N, 1); its first column is consumed from top
                    to bottom, rows * columns values per entry

    Returns:
    parameters -- dict mapping every name to its (rows, columns) array
    """
    parameters = {}
    cursor = 0                      # position of the next unread value

    for label_row in keys_labels:
        key = label_row[0]
        n_rows = int(label_row[1])
        n_cols = int(label_row[2])

        stop = cursor + n_rows * n_cols
        parameters[key] = param_values[cursor:stop, 0].reshape(n_rows, n_cols)
        cursor = stop

    return parameters

In [30]:
def gradients_to_vector_custom(gradients):
    """
    Roll the parameter gradients of a gradients dictionary into a single column
    vector, skipping every entry whose name contains 'dA' (activation gradients).

    Arguments:
    gradients -- dict mapping a name to a 2-D array

    Returns:
    no_dA_grad_labels -- string array (dtype 'U8') of shape (number of kept entries, 3);
                         each row holds [name, number of rows, number of columns],
                         in sorted name order. Being 'U8', names longer than 8
                         characters are truncated.
    no_dA_grad_values -- (N, 1) array with every kept entry flattened row by row and
                         stacked in the same order; shape (0, 1) when nothing is
                         kept (which previously raised an UnboundLocalError)
    """
    kept_names = [key for key in sorted(gradients) if 'dA' not in key]

    no_dA_grad_labels = np.empty((len(kept_names), 3), dtype='U8')
    columns = []

    for row, key in enumerate(kept_names):
        value = gradients[key]

        # name and dimensions of the entry
        no_dA_grad_labels[row, 0] = key
        no_dA_grad_labels[row, 1] = value.shape[0]
        no_dA_grad_labels[row, 2] = value.shape[1]

        columns.append(np.reshape(value, (-1, 1)))

    # a single concatenate instead of growing the vector inside the loop
    if columns:
        no_dA_grad_values = np.concatenate(columns, axis=0)
    else:
        no_dA_grad_values = np.zeros((0, 1))

    return no_dA_grad_labels, no_dA_grad_values


In [31]:
def L_model_backward(AL, Y, caches, batchNorm, norm_caches=None):
    """
    Run back-propagation through every layer, from the output layer down to layer 1.

    Arguments:
    AL -- output of the forward pass
    Y -- labels; reshaped to AL.shape before use
    caches -- per-layer caches from the forward pass; caches[l] is handed to
              linear_activation_backward for layer l+1
    batchNorm -- the callers in this notebook pass the settings dictionary in this
                 position, so a dict is used as the settings ('batchNorm' and
                 'useSoftMax' are read from it). Any other value is ignored and the
                 module-level global_var is used instead, as before.
    norm_caches -- per-layer batch-norm caches; indexed per layer only when batch
                   norm is on, otherwise passed through as-is (defaults to [])

    Returns:
    grads -- dictionary with "dA{l-1}" and "dW{l}" for every layer l, plus
             "dG{l}"/"dB{l}" when batch norm is on or "db{l}" when it is off
    """
    settings = batchNorm if isinstance(batchNorm, dict) else global_var
    use_batch_norm = settings['batchNorm']
    use_softmax = settings['useSoftMax']

    if norm_caches is None:
        norm_caches = []

    grads = {}
    L = len(caches)  # the number of layers
    Y = Y.reshape(AL.shape)  # after this line, Y is the same shape as AL

    # Derivative of the cost with respect to AL.
    if not use_softmax:
        dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    else:
        dAL = -1 * np.divide(Y, AL)

    def backward_step(dA, layer_index, activation):
        """Back-propagate through one layer (0-based index into caches) and store its gradients."""
        norm_cache = norm_caches[layer_index] if use_batch_norm else norm_caches
        result = linear_activation_backward(dA, caches[layer_index], norm_cache, activation, settings)

        if use_batch_norm:
            dA_prev, dW, dG, dB = result
            grads["dG" + str(layer_index + 1)] = dG
            grads["dB" + str(layer_index + 1)] = dB
        else:
            dA_prev, dW, db = result
            grads["db" + str(layer_index + 1)] = db

        grads["dA" + str(layer_index)] = dA_prev
        grads["dW" + str(layer_index + 1)] = dW

    # Output layer. The 'sigmoid' tag is passed in both cost modes; how the softmax
    # case is treated is decided inside linear_activation_backward.
    backward_step(dAL, L - 1, 'sigmoid')

    # Hidden layers, last to first; each consumes the dA produced by the layer above it.
    for l in reversed(range(L - 1)):
        backward_step(grads["dA" + str(l + 1)], l, 'relu')

    return grads


In [32]:
def gradient_check_n(parameters, gradients, X, Y, layer_dims, global_var):
    """
    Compare back-propagated gradients with centred finite-difference estimates.

    Every parameter is nudged by +epsilon and -epsilon in turn, the cost is
    recomputed with a forward pass, and (J_plus - J_minus) / (2 * epsilon) is
    compared with the matching back-propagated gradient.

    Arguments:
    parameters -- parameter dictionary the gradients were computed for
    gradients -- output of back-propagation; keys containing 'dA' are ignored
    X, Y, layer_dims -- forwarded unchanged to L_model_forward / compute_cost
    global_var -- settings dictionary; reads 'checkGradientPrintDiff' and 'dropOut'

    Returns:
    difference -- relative error ||grad - gradapprox|| / (||grad|| + ||gradapprox||)
                  taken over the whole parameter vector; 0.0 when both are all zero
    """
    epsilon = 1e-7
    tolerance = 1e-7

    printdiff = global_var['checkGradientPrintDiff']
    dropOut = global_var['dropOut']
    keys_labels, param_values = dictionary_to_vector_custom(parameters)
    _, no_dA_grad_values = gradients_to_vector_custom(gradients)

    num_parameters = param_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    assert (no_dA_grad_values.shape == param_values.shape)

    if dropOut == True:
        print("Warning Dropout is ON!")

    def relative_difference(analytic, numeric):
        """Relative error between two gradients; two exact zeros count as a perfect match."""
        numerator = np.linalg.norm(analytic - numeric)
        denominator = np.linalg.norm(analytic) + np.linalg.norm(numeric)
        if denominator == 0:
            # 0/0 used to give nan, which was then reported as "OKAY".
            return 0.0
        return numerator / denominator

    def cost_at(theta):
        """Cost of the network whose flattened parameters are theta."""
        # The forward pass returns a tuple whose first element is AL,
        # with or without batch norm.
        AL = L_model_forward(X, layer_dims, vector_to_dictionary_custom(keys_labels, theta), global_var)[0]
        # lambd is passed as 0.0 -- presumably this switches the L2 term off so that
        # only the data cost is differentiated; confirm in compute_cost.
        return compute_cost(AL, Y, layer_dims, parameters, 0.0, global_var)

    for i in range(num_parameters):

        thetaplus = np.copy(param_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i] = cost_at(thetaplus)

        thetaminus = np.copy(param_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i] = cost_at(thetaminus)

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

        if printdiff == True:
            param_difference = relative_difference(no_dA_grad_values[i], gradapprox[i])

            # Written as "<= tolerance" so that a nan difference is reported as an error.
            if param_difference <= tolerance:
                print ("\033[92m" + "Gradient Check on " + str(i) + "th param: Backward Prop OKAY! difference = " + str(param_difference) + "\033[0m")
            else:
                print ("\033[93m" + "Gradient Check on " + str(i) + "th param: backward Prop error! difference = " + str(param_difference) + "\033[0m")

    # Report the error over the full vector rather than only the last parameter's.
    difference = relative_difference(no_dA_grad_values, gradapprox)

    return difference

In [33]:
def update_parameters(m, layer_dims, parameters, grads, momentumGrad, RMSGrad, alpha, lambd, i, global_var):
    """
    Apply one optimisation step to every layer's parameters.

    The first-moment (momentumGrad) and second-moment (RMSGrad) running averages
    are refreshed on every call, whichever update rule is selected. L2 weight
    decay (lambd / m) is applied to W and G, never to the B / b terms.

    Arguments:
    m -- divisor of the L2 term (the caller passes the number of examples)
    layer_dims -- layer sizes; layers 1 .. len(layer_dims)-1 are updated
    parameters -- dict with "W{l}" plus "G{l}"/"B{l}" (batch norm) or "b{l}"; updated in place
    grads -- dict with the matching "dW{l}", "dG{l}"/"dB{l}" or "db{l}" entries
    momentumGrad, RMSGrad -- running averages keyed like grads; updated in place
    alpha -- learning rate
    lambd -- L2 regularisation strength
    i -- 1-based step counter used for the bias correction
    global_var -- settings dict; reads 'batchNorm' and 'update_method'
                  ("grads", "momentum" or "adams")

    Returns:
    parameters, momentumGrad, RMSGrad -- the same (updated) dictionaries

    Raises:
    ValueError -- for an unknown 'update_method', or for i < 1 with "momentum"/"adams"
                  (the bias correction would divide by zero)
    """
    B1 = 0.9
    B2 = 0.98
    epsilon = 1e-8

    batchNorm = global_var['batchNorm']
    update_method = global_var['update_method']

    if update_method not in ("grads", "momentum", "adams"):
        # Previously an unknown name silently left the parameters untouched.
        raise ValueError("Unknown update_method: " + repr(update_method))
    if update_method != "grads" and i < 1:
        raise ValueError("i must be >= 1 for bias correction, got " + repr(i))

    # (parameter prefix, whether the L2 term applies to it)
    if batchNorm == True:
        families = (("W", True), ("G", True), ("B", False))
    else:
        families = (("W", True), ("b", False))

    for l in range(1, len(layer_dims)):
        for prefix, use_weight_decay in families:
            p_key = prefix + str(l)
            g_key = "d" + p_key
            grad = grads[g_key]

            # Exponentially weighted averages of the gradient and of its square.
            momentumGrad[g_key] = B1 * momentumGrad[g_key] + ((1 - B1) * grad)
            RMSGrad[g_key] = B2 * RMSGrad[g_key] + np.multiply((1 - B2), np.power(grad, 2))

            if update_method == "grads":
                step = grad
            else:
                # Bias-corrected first moment.
                m_hat = np.divide(momentumGrad[g_key], (1 - B1**i))
                if update_method == "momentum":
                    step = m_hat
                else:
                    # Bias-corrected second moment, then the Adam-style ratio.
                    v_hat = np.divide(RMSGrad[g_key], (1 - B2**i))
                    step = np.divide(m_hat, (np.sqrt(v_hat) + epsilon))

            if use_weight_decay:
                step = step + (parameters[p_key] * (lambd/m))

            parameters[p_key] = parameters[p_key] - alpha * step

    return parameters, momentumGrad, RMSGrad


In [34]:
def make_predictions(X, Y, numberOfClasses, layer_dims, final_params, global_var, dset):
    """
    Predict a class for every example in X and print the accuracy against Y.

    Arguments:
    X -- inputs, forwarded to L_model_forward
    Y -- 2-D label array with one column per example (Y.shape[1] examples);
         label 10 is treated as class 0. The caller's array is left untouched.
    numberOfClasses -- number of one-vs-all models to evaluate (non-softmax mode)
    layer_dims -- layer sizes, forwarded to L_model_forward
    final_params -- trained parameters: "param{k}" per class in one-vs-all mode,
                    "finalparam" in softmax mode
    global_var -- settings dictionary; reads 'useSoftMax' and is forwarded to L_model_forward
    dset -- name of the data set, used only in the printed message

    Returns:
    predictions -- index of the highest-scoring class for every example
    probability -- the per-class scores the predictions were taken from
    """
    useSoftMax = global_var['useSoftMax']

    # Relabel on a copy: writing into Y would silently change the caller's labels.
    labels = np.array(Y, copy=True)
    labels[labels == 10] = 0

    if useSoftMax == False:
        # One-vs-all: one network per class, each contributing one row of scores.
        probability = np.zeros([numberOfClasses, labels.shape[1]])
        for p in range(numberOfClasses):
            # The forward pass returns a tuple whose first element is the network output.
            probability[p, :] = L_model_forward(X, layer_dims, final_params["param" + str(p)], global_var)[0]
    else:
        # Use the parameters that were passed in, not the notebook-level global.
        probability = L_model_forward(X, layer_dims, final_params["finalparam"], global_var)[0]

    predictions = np.argmax(probability, axis=0)

    print(dset + " accruracy: is " + str(np.sum(predictions == labels)/labels.shape[1]*100) + "%")

    return predictions, probability

In [35]:
def single_class_model_run(X, Y, k, layer_dims, iterations, alpha, lambd, global_var):
    """
    Train one network for `iterations` full passes over (X, Y) and record its history.

    Arguments:
    X, Y -- inputs and labels, handed unchanged to forward prop, cost and backward prop
    k -- class index; only used in the progress message of the non-softmax mode
    layer_dims -- layer sizes, forwarded to the helper functions
    iterations -- number of passes to run
    alpha -- learning rate, forwarded to update_parameters
    lambd -- L2 strength, forwarded to compute_cost and update_parameters
    global_var -- settings dictionary; reads 'print_cost', 'checkGradient',
                  'batchNorm', 'useSoftMax' and 'checkTime'

    Returns:
    parameters -- parameters after the last update
    grads -- gradients of the last pass
    cost_array -- (iterations x 1) cost of every pass
    gradient_mean_array -- (iterations x 3): std of dW1 and, with batch norm on,
                           mean of dG1 and mean of dB1 (left at zero otherwise)
    """
    parameters, momentumGrad, RMSGrad = initialize_parameters_deep(layer_dims, global_var)

    # Read the switches once.
    show_cost = global_var['print_cost'] == True
    run_grad_check = global_var['checkGradient'] == True
    batch_norm_on = global_var['batchNorm'] == True
    useSoftMax = global_var['useSoftMax']
    time_it = global_var['checkTime'] == True

    # Only the pass one tenth of the way in is timed.
    timed_pass = round(iterations/10)
    pass_durations = []
    started_at = None

    cost_array = np.zeros([iterations, 1])
    gradient_mean_array = np.zeros([iterations, 3])

    for i in range(iterations):

        if time_it and i == timed_pass:
            started_at = time.time()

        # --- forward propagation ---
        forward_result = L_model_forward(X, layer_dims, parameters, global_var, i)
        if batch_norm_on:
            AL, caches, norm_caches = forward_result
        else:
            AL, caches = forward_result

        # --- cost ---
        cost = compute_cost(AL, Y, layer_dims, parameters, lambd, global_var)
        cost_array[i, 0] = cost

        if show_cost and i % 100 == 0:
            if useSoftMax == False:
                print("Cost for class " + str(k) + " on the " + str(i+1) + "th iterations: " + str(cost))
            else:
                print("SoftMax cost on " +  str(i+1) + "th iterations: " + str(cost))

        # --- backward propagation ---
        if batch_norm_on:
            grads = L_model_backward(AL, Y, caches, global_var, norm_caches)
        else:
            grads = L_model_backward(AL, Y, caches, global_var)

        # First-layer gradient statistics, kept for plotting.
        gradient_mean_array[i, 0] = np.std(grads['dW1'])
        if batch_norm_on:
            gradient_mean_array[i, 1] = np.mean(grads['dG1'])
            gradient_mean_array[i, 2] = np.mean(grads['dB1'])

        # --- optional gradient check, every 500th pass ---
        if run_grad_check and i % 500 == 0:
            gradient_check_n(parameters, grads, X, Y, layer_dims, global_var)

        # --- parameter update (the step counter is 1-based) ---
        parameters, momentumGrad, RMSGrad = update_parameters(
            Y.shape[1], layer_dims, parameters, grads,
            momentumGrad, RMSGrad, alpha, lambd, i + 1, global_var)

        if started_at is not None:
            pass_durations.append(time.time() - started_at)
            started_at = None

    if time_it:
        at = sum(pass_durations)/len(pass_durations)
        print("Average time per EPOCH is: " + str(at))

    return parameters, grads, cost_array, gradient_mean_array

In [36]:
def start_training(numberOfClasses, X_train, Y_train, layer_dims, iterations, alpha, lambd, global_var):
    """
    Train either one softmax network or one binary network per class.

    Arguments:
    numberOfClasses -- number of classes
    X_train, Y_train -- training inputs and labels; in one-vs-all mode label 10
                        is used as the positive label for class 0
    layer_dims -- layer sizes. In softmax mode the last entry is overwritten in
                  place with numberOfClasses, so the caller's list changes too.
    iterations, alpha, lambd -- forwarded to single_class_model_run
    global_var -- settings dictionary; reads 'useSoftMax', 'plotGraph' and
                  'batchNorm'; 'isPredict' is set to False on entry and True on exit

    Returns:
    final_param_all_class -- "param{k}" per class (one-vs-all) or "finalparam" (softmax)
    global_grads -- last-pass gradients, keyed "grad{k}" or "finalparam"
    """

    def plot_history(cost_array, grad_mean_array, cost_title):
        """Plot the recorded training history when plotting is switched on."""
        if global_var['plotGraph'] == True:
            if global_var['batchNorm'] == True:
                # Columns as filled by single_class_model_run:
                # 0 = std of dW1, 1 = mean of dG1, 2 = mean of dB1.
                plot_graph(grad_mean_array[:, 0], 'Std of dW1 per iteration')
                plot_graph(grad_mean_array[:, 1], 'Mean of dG1 per iteration')
                plot_graph(grad_mean_array[:, 2], 'Mean of dB1 per iteration')
            else:
                plot_graph(cost_array, cost_title)

    final_param_all_class = {}
    global_grads = {}
    global_var['isPredict'] = False

    if global_var['useSoftMax'] == False:

        for k in range(numberOfClasses):

            # Binary targets for class k; class 0 is stored under label 10.
            target = 10 if k == 0 else k
            Y_class = (Y_train == target) * Y_train
            Y_class[Y_class > 0] = 1

            parameters, grads, cost_array, grad_mean_array = single_class_model_run(
                X_train, Y_class, k, layer_dims, iterations, alpha, lambd, global_var)

            plot_history(cost_array, grad_mean_array,
                         ("Cost function change per iteration for class " + str(k)))

            final_param_all_class["param" + str(k)] = parameters
            global_grads["grad" + str(k)] = grads

    else:
        # numberOfClasses x m
        Y_all_class = prepareSoftMaxY(Y_train, numberOfClasses)

        # change the last layer to numberOfClasses nodes (instead of one)
        layer_dims[-1] = numberOfClasses

        parameters, grads, cost_array, grad_mean_array = single_class_model_run(
            X_train, Y_all_class, 0, layer_dims, iterations, alpha, lambd, global_var)

        plot_history(cost_array, grad_mean_array, "SoftMax cost function change per iteration")

        final_param_all_class["finalparam"] = parameters
        global_grads["finalparam"] = grads

    global_var['isPredict'] = True

    return final_param_all_class, global_grads

In [37]:
# Initialization: seed NumPy's global random generator
np.random.seed(2)

### Data Preparation Starts ###
train_data_ratio = 0.90
X_train, Y_train, X_dev, Y_dev = load_data(train_data_ratio)
#m = X_train.shape[1]
#mini_batches = random_mini_batches(X_assess, Y_assess, mini_batch_size)
### Data Preparation Ends ###

### Model Hyperparameters Start ###
# Other layouts that were tried:
#layer_dims = [X_train.shape[0],40,20,5,1]
#layer_dims = [X_train.shape[0],80,20,1]
# In softmax mode start_training overwrites the last entry with numberOfClasses.
layer_dims = [X_train.shape[0], 25, 16, 8, 1]
iterations, alpha, lambd = 500, 0.002, 0.1
numberOfClasses = 10
### Model Hyperparameters End ###

cost_history_for_all_class = {}

### On/Off Switches Start ###
global_var = {
    #'checkActivation': False,
    'useSoftMax': True,
    'dropOut': False,
    'checkGradient': False,
    'checkGradientPrintDiff': True,
    'checkTime': False,
    'print_cost': True,
    'batchNorm': True,
    'batchNormBackMethod': "abstract",     #abstract -> thankGod function; or use "computational"
    'plotGraph': False,
    'update_method': "adams",    # one of "grads", "momentum", "adams"
}
### On/Off Switches End ###

# Train, then report accuracy on the training and dev sets.
final_param_all_class, global_grads = start_training(
    numberOfClasses, X_train, Y_train, layer_dims, iterations, alpha, lambd, global_var)

train_predict, train_prob = make_predictions(
    X_train, Y_train, numberOfClasses, layer_dims, final_param_all_class, global_var, dset = "Training")
dev_predict, dev_prob = make_predictions(
    X_dev, Y_dev, numberOfClasses, layer_dims, final_param_all_class, global_var, dset = "Dev")

#showrandomimage(X_dev, Y_dev, dev_predict.reshape(Y_dev.shape), True)

SoftMax cost on 1th iterations: 2.39856816838
[93mGradient Check on 0th param: backward Prop error! difference = 0.999555654299[0m
[93mGradient Check on 1th param: backward Prop error! difference = 0.999555654293[0m
[93mGradient Check on 2th param: backward Prop error! difference = 0.999555653437[0m
[93mGradient Check on 3th param: backward Prop error! difference = 0.999555654294[0m
[93mGradient Check on 4th param: backward Prop error! difference = 0.999555654308[0m
[93mGradient Check on 5th param: backward Prop error! difference = 0.999555654315[0m
[93mGradient Check on 6th param: backward Prop error! difference = 0.999555654297[0m
[93mGradient Check on 7th param: backward Prop error! difference = 0.999555654291[0m
[93mGradient Check on 8th param: backward Prop error! difference = 0.999555654282[0m
[93mGradient Check on 9th param: backward Prop error! difference = 0.99955565429[0m
[93mGradient Check on 10th param: backward Prop error! difference = 0.999555654297[

[93mGradient Check on 98th param: backward Prop error! difference = 0.9995556543[0m
[93mGradient Check on 99th param: backward Prop error! difference = 0.999555654274[0m
[93mGradient Check on 100th param: backward Prop error! difference = 0.999555654265[0m
[93mGradient Check on 101th param: backward Prop error! difference = 0.99955565428[0m
[93mGradient Check on 102th param: backward Prop error! difference = 0.999555654135[0m
[93mGradient Check on 103th param: backward Prop error! difference = 0.999555654307[0m
[93mGradient Check on 104th param: backward Prop error! difference = 0.999555652677[0m
[93mGradient Check on 105th param: backward Prop error! difference = 0.999555654299[0m
[93mGradient Check on 106th param: backward Prop error! difference = 0.999555654307[0m
[93mGradient Check on 107th param: backward Prop error! difference = 0.999555654307[0m
[93mGradient Check on 108th param: backward Prop error! difference = 0.999555654313[0m
[93mGradient Check on 109



[93mGradient Check on 121th param: backward Prop error! difference = 0.0100598767366[0m
[93mGradient Check on 122th param: backward Prop error! difference = 0.00270511811663[0m
[93mGradient Check on 123th param: backward Prop error! difference = 0.00086226381054[0m
[93mGradient Check on 124th param: backward Prop error! difference = 0.000127634417808[0m
[93mGradient Check on 125th param: backward Prop error! difference = 9.24799853232e-05[0m
[93mGradient Check on 126th param: backward Prop error! difference = 3.22114282578e-05[0m
[93mGradient Check on 127th param: backward Prop error! difference = 0.000511056587669[0m
[93mGradient Check on 128th param: backward Prop error! difference = 0.00382633405734[0m
[93mGradient Check on 129th param: backward Prop error! difference = 0.00524571049616[0m
[93mGradient Check on 130th param: backward Prop error! difference = 0.00138431586772[0m
[93mGradient Check on 131th param: backward Prop error! difference = 0.000109005375097

[92mGradient Check on 211th param: Backward Prop OKAY! difference = 6.91263503361e-09[0m
[92mGradient Check on 212th param: Backward Prop OKAY! difference = 9.62714603888e-09[0m
[92mGradient Check on 213th param: Backward Prop OKAY! difference = 1.90961196388e-08[0m
[93mGradient Check on 214th param: backward Prop error! difference = 1.40035369193e-07[0m
[93mGradient Check on 215th param: backward Prop error! difference = 3.8007516915e-07[0m
[93mGradient Check on 216th param: backward Prop error! difference = 2.4645238203e-06[0m
[93mGradient Check on 217th param: backward Prop error! difference = 5.43123870258e-07[0m
[93mGradient Check on 218th param: backward Prop error! difference = 4.89185124963e-05[0m
[93mGradient Check on 219th param: backward Prop error! difference = 4.08302194349e-05[0m
[93mGradient Check on 220th param: backward Prop error! difference = 8.15950514437e-07[0m
[93mGradient Check on 221th param: backward Prop error! difference = 2.30788517433e-

[92mGradient Check on 306th param: Backward Prop OKAY! difference = 2.0785900073e-08[0m
[92mGradient Check on 307th param: Backward Prop OKAY! difference = 5.46906905177e-09[0m
[92mGradient Check on 308th param: Backward Prop OKAY! difference = 2.72786686881e-09[0m
[93mGradient Check on 309th param: backward Prop error! difference = 5.50116197839e-07[0m
[92mGradient Check on 310th param: Backward Prop OKAY! difference = 9.73011566695e-08[0m
[92mGradient Check on 311th param: Backward Prop OKAY! difference = 8.20932206145e-09[0m
[92mGradient Check on 312th param: Backward Prop OKAY! difference = 8.69650092397e-08[0m
[92mGradient Check on 313th param: Backward Prop OKAY! difference = 8.42288807847e-08[0m
[92mGradient Check on 314th param: Backward Prop OKAY! difference = 1.42742566313e-08[0m
[92mGradient Check on 315th param: Backward Prop OKAY! difference = 2.18939833309e-08[0m
[92mGradient Check on 316th param: Backward Prop OKAY! difference = 9.78235153586e-08[0m

[93mGradient Check on 396th param: backward Prop error! difference = 7.47431984109e-06[0m
[93mGradient Check on 397th param: backward Prop error! difference = 0.000106330938359[0m
[93mGradient Check on 398th param: backward Prop error! difference = 1.98099376484e-05[0m
[93mGradient Check on 399th param: backward Prop error! difference = 3.21647731346e-07[0m
[93mGradient Check on 400th param: backward Prop error! difference = 1.26659196943e-07[0m
[92mGradient Check on 401th param: Backward Prop OKAY! difference = 6.77323620993e-10[0m
[92mGradient Check on 402th param: Backward Prop OKAY! difference = 6.81827375563e-08[0m
[93mGradient Check on 403th param: backward Prop error! difference = 2.42876792389e-07[0m
[93mGradient Check on 404th param: backward Prop error! difference = 1.00397448667e-06[0m
[92mGradient Check on 405th param: Backward Prop OKAY! difference = 7.88998289192e-09[0m
[92mGradient Check on 406th param: Backward Prop OKAY! difference = 1.87239220249e

[93mGradient Check on 491th param: backward Prop error! difference = 1.73536921774e-07[0m
[93mGradient Check on 492th param: backward Prop error! difference = 1.29773904632e-06[0m
[93mGradient Check on 493th param: backward Prop error! difference = 1.22233326905e-06[0m
[93mGradient Check on 494th param: backward Prop error! difference = 2.27600443522e-06[0m
[93mGradient Check on 495th param: backward Prop error! difference = 2.23420340031e-05[0m
[93mGradient Check on 496th param: backward Prop error! difference = 8.4670643905e-05[0m
[93mGradient Check on 497th param: backward Prop error! difference = 0.00307341303305[0m
[92mGradient Check on 498th param: Backward Prop OKAY! difference = nan[0m
[93mGradient Check on 499th param: backward Prop error! difference = 0.0122118376709[0m
[93mGradient Check on 500th param: backward Prop error! difference = 0.000390644195448[0m
[93mGradient Check on 501th param: backward Prop error! difference = 6.80054431029e-05[0m
[93mGr

KeyboardInterrupt: 

In [None]:
#showrandomimage(X_dev, Y_dev, dev_predict.reshape(Y_dev.shape), True)