In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pdb # Python Debugger
# pdb.set_trace() sets a breakpoint
# The %debug magic can also be used. q to quit, c to continue, n for next line

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest' # show images without smoothing between pixels
plt.rcParams['image.cmap'] = 'gray' # default colormap used when displaying images

# Load the autoreload extension and reload all modules before each cell is
# executed, so edits to imported .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2


In [1]:
def initialize_parameters_standard_deep(layer_dims):
    """Create small random weights and zero biases for every layer.

    layer_dims lists the number of units in each layer, input layer first.
    Ex. layer_dims = [5,3,2]: 5 nodes in the input layer, 3 nodes in the
    hidden layer, 2 nodes in the output layer.

    Returns a dict holding, for l = 1 .. len(layer_dims) - 1:
    "W<l>" -- standard-normal samples scaled by 0.01,
              shape (layer_dims[l], layer_dims[l-1])
    "b<l>" -- zeros, shape (layer_dims[l], 1)
    """
    parameters = {}

    # Walk consecutive (previous width, current width) pairs; numbering starts at 1.
    width_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (n_prev, n_curr) in enumerate(width_pairs, start=1):
        weights = np.random.randn(n_curr, n_prev) * 0.01
        biases = np.zeros((n_curr, 1))

        assert weights.shape == (n_curr, n_prev)
        assert biases.shape == (n_curr, 1)

        parameters["W" + str(idx)] = weights
        parameters["b" + str(idx)] = biases

    return parameters

In [2]:
def initialize_parameters_he_deep(layer_dims, activation="relu"):
    """Create variance-scaled random weights and zero biases for every layer.

    layer_dims lists the number of units in each layer, input layer first.
    Ex. layer_dims = [5,3,2]: 5 nodes in the input layer, 3 nodes in the
    hidden layer, 2 nodes in the output layer.

    Arguments:
    layer_dims -- sequence of layer widths
    activation -- "relu": weights scaled by sqrt(2 / fan_in)
                  "tanh": weights scaled by sqrt(1 / fan_in)

    Returns:
    parameters -- dict with "W<l>" of shape (layer_dims[l], layer_dims[l-1])
                  and "b<l>" (zeros) of shape (layer_dims[l], 1)
                  for l = 1 .. len(layer_dims) - 1

    Raises:
    ValueError -- if activation is neither "relu" nor "tanh"
    """
    # Numerator of the weight variance; fan_in is the previous layer's width.
    if activation == "relu":
        variance_numerator = 2
    elif activation == "tanh":
        variance_numerator = 1
    else:
        raise ValueError(
            'activation must be "relu" or "tanh", got %r' % (activation,))

    parameters = {}
    L = len(layer_dims) # number of layers in network

    for l in range(1, L):
        scale = np.sqrt(variance_numerator/layer_dims[l-1])
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])*scale
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))

        assert(parameters["W" + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters["b" + str(l)].shape == (layer_dims[l], 1))

    return parameters

In [7]:
def linear_activation_forward(A_prev, W, b, keep_prob, activation):
    """Forward pass of one layer: LINEAR -> ACTIVATION (-> inverted dropout).

    Arguments:
    A_prev -- activations of the previous layer, shape (n_prev, m)
    W -- weight matrix, shape (n, n_prev)
    b -- bias, added to np.dot(W, A_prev) by broadcasting
    keep_prob -- probability of keeping a unit, in (0, 1]; 1 disables dropout
    activation -- "sigmoid" or "relu"

    Returns:
    A -- layer output, shape (n, m)
    cache -- (linear_cache, activation_cache) where
             linear_cache = (A_prev, W, b) and
             activation_cache = Z          when no dropout mask was applied,
             activation_cache = (Z, D)     when mask D was applied to A.

    Dropout is only applied to "relu" layers; a "sigmoid" (output) layer is
    never masked, whatever keep_prob is.

    Raises:
    ValueError -- if keep_prob is outside (0, 1] or activation is unknown
    """
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob must be in (0, 1], got %r" % (keep_prob,))

    Z = np.dot(W, A_prev) + b
    if activation == "sigmoid":
        A = 1/(1+np.exp(-Z))
    elif activation == "relu":
        A = np.maximum(0, Z)
    else:
        raise ValueError(
            'activation must be "sigmoid" or "relu", got %r' % (activation,))

    linear_cache = (A_prev, W, b)
    activation_cache = Z

    if keep_prob < 1 and activation == "relu":
        # apply dropout to all layers except last(output) layer
        D = np.random.rand(A.shape[0], A.shape[1])   # Step 1: uniform samples, same shape as A
        D = (D < keep_prob).astype(int)              # Step 2: threshold to a 0/1 mask
        A = np.multiply(A, D)                        # Step 3: shut down the masked neurons
        A = A/keep_prob                              # Step 4: rescale the surviving neurons
        activation_cache = (Z, D)                    # the mask is needed again in the backward pass

    cache = (linear_cache, activation_cache)

    assert(Z.shape == (W.shape[0], A_prev.shape[1]))
    assert(A.shape == (W.shape[0], A_prev.shape[1]))

    return A, cache

In [8]:
def L_model_forward(X, parameters, keep_prob=1):
    """Forward pass of the whole network: [LINEAR->RELU]*(L-1) -> LINEAR->SIGMOID.

    Arguments:
    X -- input data, shape (n_x, m)
    parameters -- dict with "W1", "b1", ..., "WL", "bL"
    keep_prob -- dropout keep probability for the hidden (ReLU) layers;
                 the default of 1 disables dropout

    Returns:
    AL -- output of the sigmoid layer, shape (1, m)
    caches -- list of the L per-layer caches, in layer order
    """
    caches = []
    A = X
    L = len(parameters)//2       # number of layers in the neural network

    # Implement [Linear->ReLU]*(L-1). Add "cache" to the "caches" list
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], keep_prob, activation = "relu")
        caches.append(cache)

    # Implement [Linear->SIGMOID]. Add "cache" to the "caches" list.
    # The output layer is never dropped out, so it always runs with keep_prob = 1.
    AL, cache = linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], 1, activation = "sigmoid")
    caches.append(cache)

    assert(AL.shape == (1,X.shape[1]))

    return AL, caches

In [9]:
def compute_cost(AL, Y, parameters, lambd):
    """Cross-entropy cost, optionally with an L2 (Frobenius) weight penalty.

    Arguments:
    AL -- predicted probabilities, shape (1, m)
    Y -- true labels, shape (1, m)
    parameters -- dict with "W1", "b1", ..., "WL", "bL"
    lambd -- L2 regularization strength; 0 means no penalty term

    Returns:
    cost -- 0-dimensional array holding the scalar cost
    """
    n_examples = Y.shape[1]
    n_layers = len(parameters)//2

    # Log-likelihood of the labels under the predicted probabilities.
    log_likelihood = np.dot(np.log(AL), Y.T) + np.dot(np.log(1-AL), (1-Y).T)
    cost = -1/n_examples*log_likelihood

    if lambd != 0:
        # Sum of squared entries over every weight matrix W1..WL.
        squared_weights = sum(
            np.sum(np.square(parameters["W" + str(idx)]))
            for idx in range(1, n_layers+1)
        )
        cost = cost + lambd/(2*n_examples)*squared_weights

    cost = np.squeeze(cost)
    assert cost.shape == ()

    return cost

In [3]:
def linear_activation_backward(dA, cache, lambd, activation, keep_prob=None):
    """Backward pass of one layer: (dropout ->) ACTIVATION -> LINEAR.

    Arguments:
    dA -- gradient of the cost with respect to this layer's output
    cache -- (linear_cache, activation_cache) as stored by the forward pass;
             linear_cache = (A_prev, W, b), activation_cache is either Z or,
             when dropout was applied to this layer, (Z, D)
    lambd -- L2 regularization strength; 0 means no penalty term
    activation -- "relu" or "sigmoid"
    keep_prob -- keep probability used in the forward pass; only needed when
                 the cache carries a dropout mask D

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations
    dW -- gradient with respect to W (same shape as W)
    db -- gradient with respect to b (same shape as b)

    Raises:
    ValueError -- if activation is unknown, or if the cache carries a dropout
                  mask but keep_prob was not supplied
    """
    linear_cache, activation_cache = cache
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]

    # The forward pass stores (Z, D) for layers that were dropped out and a
    # plain Z otherwise; accept both.
    if isinstance(activation_cache, tuple):
        Z, D = activation_cache
    else:
        Z, D = activation_cache, None

    if D is not None:
        if keep_prob is None:
            raise ValueError(
                "cache contains a dropout mask; pass the keep_prob used in the forward pass")
        # Replay inverted dropout: same mask, same 1/keep_prob scaling.
        dA = np.multiply(dA, D)/keep_prob

    if activation == "relu":
        # Implement the backward propagation for a single RELU unit.
        dZ = np.array(dA, copy=True) # work on a copy so the caller's dA is untouched
        # When z <= 0, you should set dz to 0 as well.
        dZ[Z <= 0] = 0
    elif activation == "sigmoid":
        # Implement the backward propagation for a single SIGMOID unit.
        s = 1/(1+np.exp(-Z))
        dZ = dA*s*(1-s)
    else:
        raise ValueError(
            'activation must be "relu" or "sigmoid", got %r' % (activation,))
    assert (dZ.shape == Z.shape)

    # Implement the linear portion of backward propagation for a single layer (layer l)
    dA_prev = np.dot(W.T, dZ)
    dW = 1/m*np.dot(dZ, A_prev.T)
    if lambd != 0:
        # with L2,F regularization
        dW = dW + lambd/m*W
    db = 1/m*np.sum(dZ, axis=1, keepdims=True)

    assert(dA_prev.shape == A_prev.shape)
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)

    return dA_prev, dW, db

In [None]:
def _unwrap_dropout_cache(dA, cache, keep_prob):
    # Replay a layer's dropout mask on dA and return (dA, cache) with a plain-Z activation cache.
    linear_cache, activation_cache = cache
    if not isinstance(activation_cache, tuple):
        return dA, cache   # layer was not dropped out; nothing to undo

    Z, D = activation_cache
    if D is not None:
        if keep_prob is None:
            raise ValueError(
                "caches contain dropout masks; pass the keep_prob used in the forward pass")
        # Same mask and same 1/keep_prob scaling as in the forward pass.
        dA = np.multiply(dA, D)/keep_prob
    return dA, (linear_cache, Z)


def L_model_backward(AL, Y, caches, lambd, keep_prob=None):
    """Backward pass for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID network.

    Arguments:
    AL -- output of the forward pass (predicted probabilities)
    Y -- true labels; reshaped to AL's shape
    caches -- list of per-layer caches from the forward pass, in layer order
    lambd -- L2 regularization strength; 0 means no penalty term
    keep_prob -- keep probability used in the forward pass; only needed when
                 the caches carry dropout masks

    Returns:
    grads -- dict with "dA<l-1>", "dW<l>", "db<l>" for l = 1 .. L

    Raises:
    ValueError -- if a cache carries a dropout mask but keep_prob is None
    """
    grads = {}
    L = len(caches) # the number of layers
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL since we are not sure about the dimension of Y

    # Initializing the backpropagation: derivative of the cross-entropy loss w.r.t. AL
    dAL = -(np.divide(Y, AL) - np.divide(1-Y, 1- AL))

    # Lth layer (SIGMOID -> LINEAR) gradients.
    # Dropout masks are unwrapped here so the per-layer helper always
    # receives the plain-Z activation cache.
    dA_in, current_cache = _unwrap_dropout_cache(dAL, caches[L-1], keep_prob)
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dA_in, current_cache, lambd, activation="sigmoid")

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        dA_in, current_cache = _unwrap_dropout_cache(grads["dA" + str(l+1)], caches[l], keep_prob)
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(dA_in, current_cache, lambd, activation="relu")

    return grads

In [11]:
def update_parameters(parameters, grads, learning_rate):
    """Take one gradient-descent step on every weight matrix and bias vector.

    Arguments:
    parameters -- dict with "W1", "b1", ..., "WL", "bL"; its entries are
                  rebound to the updated arrays
    grads -- dict with the matching "dW<l>" and "db<l>" gradients
    learning_rate -- step size

    Returns:
    parameters -- the same dict object, now holding the updated values
    """
    n_layers = len(parameters)//2 # each layer contributes one W and one b

    for idx in range(1, n_layers+1):
        for prefix in ("W", "b"):
            key = prefix + str(idx)
            # Build a new array instead of updating in place, so the
            # previous array object is left untouched.
            parameters[key] = parameters[key] - learning_rate*grads["d" + key]

    return parameters

In [None]:
def L_layer_model(X, Y, layer_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost = False, lambd = 0, keep_prob = 1):
    """Train an L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    Arguments:
    X -- training inputs, shape (n_x, m)
    Y -- training labels, shape (1, m)
    layer_dims -- layer widths, input layer first
    learning_rate -- gradient descent step size
    num_iterations -- number of gradient descent iterations
    print_cost -- if True, print the cost every 1000 iterations
    lambd -- L2 regularization strength; 0 disables it
    keep_prob -- dropout keep probability; 1 disables dropout

    Returns:
    parameters -- the learned parameters, usable with predict

    Side effects:
    Plots the cost recorded every 1000 iterations.
    """
    # it is possible to use both L2 regularization and dropout,
    # but for now I will only explore one at a time
    assert (lambd == 0 or keep_prob == 1), "use either L2 regularization or dropout, not both"

    costs = []   # Keep track of cost

    # Parameters initialization.
    parameters = initialize_parameters_standard_deep(layer_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID.
        AL, caches = L_model_forward(X, parameters, keep_prob)

        # Compute cost
        cost = compute_cost(AL, Y, parameters, lambd)

        # Backward propagation
        if keep_prob < 1:
            # NOTE(review): L_model_backward_with_droupout is not defined in the
            # visible cells of this notebook -- confirm it exists before
            # training with dropout.
            grads = L_model_backward_with_droupout(AL, Y, caches, keep_prob)
        else:
            # Handles both the unregularized (lambd == 0) and the L2 case.
            grads = L_model_backward(AL, Y, caches, lambd)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Record the cost every 1000 iterations (and print it when asked to)
        if i % 1000 == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration %i: %f" %(i, cost))

    # Plot the cost once training has finished
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per thousands)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

In [None]:
def predict(X, y, parameters):
    """Predict binary labels with a trained L-layer network and report accuracy.

    Arguments:
    X -- inputs, shape (n_x, m)
    y -- true labels, compared element-wise against the predictions
    parameters -- trained parameters ("W1", "b1", ..., "WL", "bL")

    Returns:
    prediction -- boolean array, True where the network output exceeds 0.5

    Side effects:
    Prints the fraction of predictions that equal y.
    """
    m = X.shape[1]

    # Forward propagation; keep_prob = 1 so dropout is off at prediction time.
    AL, caches = L_model_forward(X, parameters, 1)
    prediction = (AL > 0.5)

    print("Accuracy: " + str(np.sum((prediction == y)/m)))

    return prediction

In [5]:
# Scratch cell: draw a (50, 1) array of random samples, then open the
# IPython help page for np.random.rand.
a = np.random.rand(50,1)
?np.random.rand