# Build a L-layer Neural Network.ipynb

## Table of Contents

* [Recap](#chapter0)
* [1. L-layers Neural Network Model](#chapter1)
    * [1.1 Functions of our L-layers Neural Network ](#section_1_1)
        * [1.1.1 Initialize parameters](#section_1_1_1)
        * [1.1.2 Forward propagation](#section_1_1_2)
        * [1.1.3 Cost function](#section_1_1_3)
        * [1.1.4 Backward Propagation](#section_1_1_4)
        * [1.1.5 Update parameters](#section_1_1_5) 
    * [1.2 L-layer Model](#section_1_2)
* [2. Load the Dataset ](#chapter2)
    * [2.1 Load the Dataset](#section_2_1)
    * [2.2 Display the Data](#section_2_2)
    * [2.3 Flatten the data](#section_2_3)
    * [2.4 Normalize the data](#section_2_4)

# Recap  <a class="anchor" id="chapter0"></a>

> Forward Propagation :

<center><img src="images/05-Deep Neural network/forward-prop.png" width = "600px"></center>

$$
\begin{cases}
    Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}, \quad A^{[0]} = X \\
    A^{[l]} = g^{[l]}(Z^{[l]}) 
\end{cases}
$$


> Backward Propagation :

<center><img src="images/05-Deep Neural network/backward-prop.png" width = "600px"></center>

$$
\begin{cases}
    dZ^{[l]} =  dA^{[l]}  * g^{[l]'}(Z^{[l]}) \\
    dW^{[l]} = \frac{1}{m} dZ^{[l]}A^{[l-1]T} \\
    db^{[l]} = \frac{1}{m} \sum dZ^{[l]}    \\
    dA^{[l-1]} =W^{[l]T}dZ^{[l]} \\
    dZ^{[l-1]} = W^{[l]T}dZ^{[l]} * g^{[l-1]'}(Z^{[l-1]}) \\
\end{cases}
$$

$$
\begin{cases}
    dZ^{[1]} = W^{[2]T}dZ^{[2]} * g^{[1]'}(Z^{[1]}) \\
    dW^{[1]} = \frac{1}{m} dZ^{[1]} X^T \\
    db^{[1]} = \frac{1}{m} \sum  dZ^{[1]}
\end{cases}
$$


> Dimension :
- m : number of examples

$$\begin{cases}
    Z^{[l]},A^{[l]} : (n^{[l]},m) \\
    W^{[l]}: (n^{[l]},n^{[l-1]})   \\
    b^{[l]}: (n^{[l]},1)  \\
    dZ^{[l]},dA^{[l]} : (n^{[l]},m) \\
    dW^{[l]} : (n^{[l]},n^{[l-1]})   \\
    db^{[l]}: (n^{[l]},1)
\end{cases}
$$


# 1. L-layers Neural Network model <a class="anchor" id="chapter1"></a>

In [3]:
# Packages
import copy
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss


%matplotlib inline

## 1.1 Functions of our L-layers Neural Network  <a class="anchor" id="section_1_1"></a>

### 1.1.1 Initialize parameters  <a class="anchor" id="section_1_1_1"></a>

In [24]:
def initialize_parameters(hidden_layers_dim,n_input,n_output,seed=3):
    """
    Initialize the weights and biases of every layer of the L-layer neural network.

    Weights are drawn from a standard normal distribution scaled by 0.01 and
    biases are set to zero.

    Arguments:
    hidden_layers_dim -- sequence with the number of units of each hidden layer
                         (may be empty: the network is then a single output layer)
    n_input -- number of features of the input matrix X
    n_output -- number of units in the output layer
    seed -- value used to reseed NumPy's global random generator before the
            weights are drawn (default 3, the value this function always used);
            pass None to leave the global random state untouched

    Returns:
    parameters -- dictionary {"W1": ..., "b1": ..., "W2": ..., "b2": ..., ...} where
                  Wl has shape (n_l, n_{l-1}) and bl has shape (n_l, 1)
    """

    # NOTE: this reseeds the *global* NumPy generator, so random numbers drawn
    # after this call are affected as well.
    if seed is not None:
        np.random.seed(seed)

    # sizes of all the layers, input first and output last; building this list
    # removes the special cases for the first and the output layer and also
    # supports a network without any hidden layer
    layer_sizes = [n_input, *hidden_layers_dim, n_output]

    parameters = {}
    for l in range(1, len(layer_sizes)):
        n_prev, n_curr = layer_sizes[l-1], layer_sizes[l]

        # small random weights break the symmetry between units
        parameters[f"W{l}"] = np.random.randn(n_curr, n_prev) * 0.01
        parameters[f"b{l}"] = np.zeros((n_curr, 1))

    return parameters


In [36]:
# Smoke test for initialize_parameters: three hidden layers (5, 5 and 3 units),
# 4 input features and 1 output unit.
layers_dim = [5,5,3]

params = initialize_parameters(layers_dim,4,1)
# Bare last expression: the notebook displays the whole parameters dictionary.
params

{'W1': array([[ 0.01788628,  0.0043651 ,  0.00096497, -0.01863493],
        [-0.00277388, -0.00354759, -0.00082741, -0.00627001],
        [-0.00043818, -0.00477218, -0.01313865,  0.00884622],
        [ 0.00881318,  0.01709573,  0.00050034, -0.00404677],
        [-0.0054536 , -0.01546477,  0.00982367, -0.01101068]]),
 'b1': array([[0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 'W2': array([[-0.01185047, -0.0020565 ,  0.01486148,  0.00236716, -0.01023785],
        [-0.00712993,  0.00625245, -0.00160513, -0.00768836, -0.00230031],
        [ 0.00745056,  0.01976111, -0.01244123, -0.00626417, -0.00803766],
        [-0.02419083, -0.00923792, -0.01023876,  0.01123978, -0.00131914],
        [-0.01623285,  0.00646675, -0.00356271, -0.01743141, -0.0059665 ]]),
 'b2': array([[0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 'W3': array([[-0.00588594, -0.00873882,  0.00029714, -0.02248258, -0.00267762],
        [ 0.01013183,  0.00852798,  0.01108187,  0.01119391,  

### 1.1.2 Forward propagation  <a class="anchor" id="section_1_1_2"></a>

In [30]:
def activation_function(Z,activation_name):

    """
    Apply an activation function element-wise.

    Arguments:
    Z -- numpy array of pre-activation (linear) values
    activation_name -- name of the activation function chosen, case-insensitive:
                       "sigmoid", "relu" or "tanh"; any other name falls back to ReLU

    Returns:
    activation -- activation values, same shape as Z
    """

    name = activation_name.lower()

    if name == "sigmoid":

        # sigmoid(z) = 1 / (1 + exp(-z)) = exp(-log(1 + exp(-z))).
        # np.logaddexp(0, -Z) evaluates log(1 + exp(-Z)) without overflowing,
        # so large negative inputs no longer trigger an overflow warning.
        activation = np.exp(-np.logaddexp(0, -Z))

    elif name == "tanh":

        activation = np.tanh(Z)

    else:
        # "relu", which is also the default for unknown names
        activation = np.maximum(0,Z)


    assert(activation.shape == Z.shape)

    return activation

In [33]:
# Test the activation: apply the sigmoid to a random (10, 100) matrix.
# The draw is not seeded in this cell, so Z depends on the current RNG state.
Z = np.random.randn(10,100)
A = activation_function(Z,"sigmoid")

# The activation is element-wise, so the shape printed must be (10, 100).
print(A.shape)

(10, 100)


In [190]:
def Forward_propagation(X,parameters):

    """
    Run the forward pass through the L layers of the network.

    Hidden layers use a ReLU activation, the last layer uses a sigmoid.

    Arguments:
    X -- input matrix, used as the activation fed to the first layer
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"

    Returns:
    A -- activation of the last layer
    caches -- list with one dictionary per layer, with keys
              "W", "b", "Z", "A" and "A_prev" (the input of that layer)
    """

    # every layer contributes one W and one b entry
    n_layers = len(parameters)//2
    caches = []

    # activation fed to the layer being processed
    layer_input = X

    for layer in range(1, n_layers + 1):

        W = parameters[f"W{layer}"]
        b = parameters[f"b{layer}"]

        # linear step
        Z = np.dot(W,layer_input) + b

        # sigmoid on the output layer, relu everywhere else
        activation_name = "sigmoid" if layer == n_layers else "relu"
        A = activation_function(Z,activation_name)

        # keep everything the backward pass needs for this layer
        caches.append({"W":W,"b":b,"Z":Z,"A":A,"A_prev":layer_input})

        # the output of this layer is the input of the next one
        layer_input = A

    return A, caches

In [189]:
# Test the forward pass on random data: 2 input features, 100 examples,
# hidden layers of 5, 5 and 3 units and a single output unit.
layers_dim = [5,5,3]
X = np.random.randn(2,100)
params = initialize_parameters(layers_dim,X.shape[0],1)

AL,caches = Forward_propagation(X,params)

# One cache per layer: print the weight shape stored for each of them.
for val in caches:
    print(val["W"].shape)

(5, 2)
(5, 5)
(3, 5)
(1, 3)


### 1.1.3 Cost function  <a class="anchor" id="section_1_1_3"></a>

In [181]:
def cost_function(AL,y):

    """
    Compute the binary cross-entropy cost after the forward propagation

    Arguments:
    AL -- activation of the last layer (predicted probabilities), same shape as y
    y -- true labels of the dataset dim = (n_y,m) | m examples, n_y nodes of the output layer

    Returns:
    cost -- cost value (0-d numpy array): the cross-entropy summed over the
            output nodes and averaged over the m examples
    """
    # m  examples
    m = y.shape[1]

    # keep the probabilities strictly inside (0, 1): a saturated prediction
    # (exactly 0 or 1) would otherwise produce log(0) = -inf and a nan/inf cost
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    # element-wise log-likelihood; summing it (instead of a matrix product)
    # yields a single scalar for any number of output nodes
    log_likelihood = y * np.log(AL_safe) + (1 - y) * np.log(1 - AL_safe)

    cost = -(1/m) * np.sum(log_likelihood)
    cost = np.squeeze(cost)

    return cost


In [182]:
# Test the cost on random binary labels and random predicted probabilities
# (1 output node, 100 examples). Neither draw is seeded in this cell.
y_true = np.random.randint(0,2,(1,100))
y_pred = np.random.random((1,100))

# Cross-check against sklearn's log_loss, which expects one row per example,
# hence the transposes. Both printed values should agree.
cost = cost_function(y_pred,y_true)
l_cost = log_loss(y_true.T,y_pred.T)
print(cost)
print(l_cost)

1.0045892596933899
1.0045892596933899


### 1.1.4 Backward Propagation  <a class="anchor" id="section_1_1_4"></a>

In [201]:
def backward_activation_function(dA,Z, function_name):

    """
    Compute dZ = dA * g'(Z), the gradient of the cost with respect to the
    linear output Z of a layer.

    Arguments:
    dA -- gradient of the cost with respect to the activation of the layer
    Z -- linear output of the layer, same shape as dA
    function_name -- name of the activation g of the layer, case-insensitive:
                     "sigmoid", "relu" or "tanh"; any other name falls back to ReLU

    Returns:
    dZ -- gradient of the cost with respect to Z, same shape as dA
    """

    name = function_name.lower()

    if name == "sigmoid":

        # overflow-free sigmoid: exp(-log(1 + exp(-Z)))
        s = np.exp(-np.logaddexp(0, -Z))

        # g'(Z) = s * (1 - s)
        dZ = dA * s * (1-s)

    elif name == "tanh":

        s = np.tanh(Z)

        # g'(Z) = 1 - tanh(Z)^2
        dZ = dA * (1-np.power(s,2))

    else:
        # "relu", which is also the default for unknown names.
        # Work on a copy so the caller's dA is left untouched.
        dZ = np.array(dA, copy=True)

        # g'(Z) is 1 where Z > 0 and 0 elsewhere
        dZ[Z <= 0] = 0


    return dZ



In [219]:
def _linear_backward(dZ, cache, m):
    """Gradients of one linear step Z = W.A_prev + b given dZ: returns (dW, db, dA_prev)."""
    dW = (1/m) * np.dot(dZ, cache["A_prev"].T)
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(cache["W"].T, dZ)
    return dW, db, dA_prev


def backward_propagation(AL,y,caches):
    """
    Compute the gradients of the cross-entropy cost with respect to the
    parameters of every layer.

    The output layer is assumed to use a sigmoid activation and the other
    layers a ReLU activation, as in the forward pass.

    Arguments:
    AL -- activation of the last layer (sigmoid output of the forward pass)
    y -- true labels, same shape as AL, with one column per example
    caches -- list with one dictionary per layer holding at least the keys
              "W", "Z" and "A_prev"

    Returns:
    gradients -- dictionary {"dW1": ..., "db1": ..., ..., "dWL": ..., "dbL": ...}
    """

    # gradients
    gradients = {}

    # L layers
    L = len(caches)

    # m examples
    m = y.shape[1]

    # Output layer. With a sigmoid output and the cross-entropy cost,
    #   dAL = -(y/AL - (1-y)/(1-AL))   and   g'(ZL) = AL * (1-AL)
    # so their product simplifies to AL - y. Using the closed form avoids the
    # division by zero (and the resulting nan gradients) when AL saturates to
    # exactly 0 or 1.
    dZ = AL - y

    dW_temp, db_temp, dA_prev_temp = _linear_backward(dZ, caches[-1], m)
    gradients["dW" + str(L)] = dW_temp
    gradients["db" + str(L)] = db_temp

    # hidden layers, from the last one back to the first one
    for i in reversed(range(L-1)):
        current_cache = caches[i]

        dZ = backward_activation_function(dA_prev_temp,current_cache["Z"],"relu")

        dW_temp, db_temp, dA_prev_temp = _linear_backward(dZ, current_cache, m)
        gradients["dW" + str(i+1)] = dW_temp
        gradients["db" + str(i+1)] = db_temp


    return gradients

In [221]:
# Test the backward pass end to end on random data: 10 input features,
# 100 examples, hidden layers of 5, 5 and 4 units and a single output unit.

layers_dim = [5,5,4]
X = np.random.randn(10,100)
y_true = np.random.randint(0,2,(1,100))
params = initialize_parameters(layers_dim,X.shape[0],1)

AL,caches = Forward_propagation(X,params)
    
gradients = backward_propagation(AL,y_true,caches)

# One dW per layer: print the shape of each weight gradient.
for i in range(len(caches)):
    print(f"dW{i+1}",gradients[f"dW{i+1}"].shape)

dW1 (5, 10)
dW2 (5, 5)
dW3 (4, 5)
dW4 (1, 4)


### 1.1.5 Update parameters  <a class="anchor" id="section_1_1_5"></a>

## 1.2 L-layer Model <a class="anchor" id="section_1_2"></a>