# Section 1: Loading the dataset and normalizing
    

In [1]:
# Importing Useful Packages

import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
from PIL import Image
from scipy import ndimage
import glob
from sklearn.model_selection import train_test_split

%matplotlib inline

## 1.1: Loading the Car Dataset

In [2]:
def load_dataset(database_path):
    """Load the train/dev/test splits from an HDF5 file into memory.

    Arguments:
    database_path -- path of the HDF5 file; it must contain the datasets
                     "<split>_img" and "<split>_labels" for each split in
                     "train", "dev" and "test"

    Returns:
    datasets -- dict keyed by split name; each value is a dict with
                'X' (the "<split>_img" data) and 'Y' (the "<split>_labels" data),
                both as NumPy arrays
    """
    datasets = {}

    # The context manager closes the file handle even if a dataset is missing.
    # Slicing with [:] reads the data into memory, so the arrays stay valid
    # after the file has been closed.
    with h5py.File(database_path, "r") as dataset_db:
        for split in ("train", "dev", "test"):
            datasets[split] = {
                'X': np.array(dataset_db[split + "_img"][:]),     # features (pictures)
                'Y': np.array(dataset_db[split + "_labels"][:]),  # labels
            }

    return datasets
    

## 1.2: Flattening and Standardizing the Dataset

In [3]:
datasets = load_dataset('car_dataset.hdf5')

X_Train = np.array(datasets["train"]["X"])
Y_Train = np.array(datasets["train"]["Y"])

X_Dev = np.array(datasets["dev"]["X"])
Y_Dev = np.array(datasets["dev"]["Y"])

X_Test = np.array(datasets["test"]["X"])
Y_Test = np.array(datasets["test"]["Y"])

# Flatten every image into one column, giving arrays of shape
# (features per image, number of examples).
# Each split gets its own variable: reusing a single name here would make all
# three normalized sets below come from whichever split was flattened last.
train_flatten_x = X_Train.reshape(X_Train.shape[0], -1).T
dev_flatten_x = X_Dev.reshape(X_Dev.shape[0], -1).T
test_flatten_x = X_Test.reshape(X_Test.shape[0], -1).T

# Scale the values by 1/255 (presumably 8-bit pixel intensities -- confirm against the dataset)
train_set_x = train_flatten_x / 255.
dev_set_x = dev_flatten_x / 255.
test_set_x = test_flatten_x / 255.

# Section 2: Logistic Regression

## 2.1: Sigmoid Function

In [4]:
def sigmoid(input):
    """Apply the logistic function 1 / (1 + exp(-x)) element-wise.

    Arguments:
    input -- scalar or NumPy array of pre-activation values

    Returns:
    s -- sigmoid of `input`
    cache -- `input` itself, returned so the backward pass can reuse it
    """
    cache = input
    denominator = 1 + np.exp(-input)
    s = 1 / denominator
    return s, cache

## 2.2: Implementing the Initialize with Zeros function

In [5]:
# Function that initializes w as a vector of zeros and set b to zero. 

def initialize_with_zeros(dim):
    """Create zero-initialized logistic-regression parameters.

    Arguments:
    dim -- number of weights (rows of the weight column vector)

    Returns:
    w -- NumPy array of zeros with shape (dim, 1)
    b -- the float 0.0
    """
    b = 0.0
    w = np.zeros((dim, 1))

    assert w.shape == (dim, 1)
    assert isinstance(b, (float, int))

    return w, b

## 2.3 - Forward propagation

$$A = \sigma(w^T X + b) = (a^{(1)}, a^{(2)}, ..., a^{(m-1)}, a^{(m)})$$

$$J = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log(a^{(i)})+(1-y^{(i)})\log(1-a^{(i)})\right]$$


In [6]:
# Implementing Forward Propagation to Calculate A and Cost

def forward(w,b,X,Y):
    """Compute the cross-entropy cost of logistic regression.

    Arguments:
    w -- weight vector; w.T is multiplied with X
    b -- bias added to every pre-activation
    X -- data whose second axis (X.shape[1]) is the number of examples
    Y -- labels, broadcast against the activations

    Returns:
    cost -- zero-dimensional value of
            -(1/m) * sum(Y * log(A) + (1 - Y) * log(1 - A))
    """
    n_examples = X.shape[1]

    # Only the activations are needed; the cache returned by sigmoid is ignored.
    activations, _ = sigmoid(np.dot(w.T, X) + b)

    log_likelihood = Y * np.log(activations) + (1 - Y) * np.log(1 - activations)
    cost = np.squeeze((-1. / n_examples) * np.sum(log_likelihood))
    assert cost.shape == ()

    return cost

## 2.4 - Backward propagation

$$ dw = \frac{\partial J}{\partial w} = \frac{1}{m}X(A-Y)^T\tag{7}$$
$$ db = \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (a^{(i)}-y^{(i)})\tag{8}$$

In [7]:
# Implementing Backward Propagation to Compute Gradients dW and db

def backward(w,b,X,Y):
    """Compute the gradients of the logistic-regression cost.

    The activations are recomputed here from w, b and X.

    Arguments:
    w -- weight vector; w.T is multiplied with X
    b -- bias
    X -- data whose second axis (X.shape[1]) is the number of examples
    Y -- labels, broadcast against the activations

    Returns:
    grads -- dict with "dw" (same shape as w) and "db" (a float scalar)
    """
    n_examples = X.shape[1]
    activations, _ = sigmoid(np.dot(w.T, X) + b)

    residual = activations - Y
    inv_m = 1 / n_examples
    dw = inv_m * np.dot(X, residual.T)
    db = inv_m * np.sum(residual)

    assert dw.shape == w.shape
    assert db.dtype == float

    return {"dw": dw, "db": db}

## 2.5 - Optimization

In [8]:
# Implementing the Optimization Function to learn W and b by Minimizing the Cost function J

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """Learn w and b by batch gradient descent on the logistic-regression cost.

    Arguments:
    w -- initial weight vector
    b -- initial bias
    X -- training data, passed unchanged to forward() and backward()
    Y -- training labels, passed unchanged to forward() and backward()
    num_iterations -- number of gradient-descent steps
    learning_rate -- step size of the update rule
    print_cost -- if True, print the cost every 100 iterations

    Returns:
    params -- dict with the final "w" and "b"
    grads -- dict with the last computed "dw" and "db"
             (both None when num_iterations is 0)
    costs -- list of the costs recorded every 100 iterations
    """
    costs = []

    # Defined up front so the return value is well-formed even when the loop
    # body never runs.
    dw, db = None, None

    for i in range(num_iterations):
        # The cost is evaluated before this iteration's update is applied.
        cost = forward(w, b, X, Y)
        grads = backward(w, b, X, Y)

        dw = grads["dw"]
        db = grads["db"]

        w = w - learning_rate * dw
        b = b - learning_rate * db

        if i % 100 == 0:
            costs.append(cost)
            # Printing is optional; the cost history is always recorded.
            if print_cost:
                print('Cost after iteration %i: %f'%(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

## 2.6 - Prediction

$$\hat{Y} = A = \sigma(w^T X + b)$$

$$\hat{y}^{(i)} = \begin{cases} 1 & \text{if } a^{(i)} > 0.5 \\ 0 & \text{if } a^{(i)} \le 0.5 \end{cases}$$

In [9]:
# Implementing the Predict Function

def predict(w,b,X):
    """Predict 0/1 labels with learned logistic-regression parameters.

    Arguments:
    w -- weights, reshaped internally to (X.shape[0], 1)
    b -- bias
    X -- data whose second axis (X.shape[1]) is the number of examples

    Returns:
    Y_prediction -- array of shape (1, m) holding the rounded activations
    """
    m = X.shape[1]
    weights = w.reshape(X.shape[0], 1)

    # Activations: the model's probability of the positive class for each example
    A, _ = sigmoid(np.dot(weights.T, X) + b)

    # Round every probability in the first row to the nearest label in one step
    Y_prediction = np.zeros((1, m))
    Y_prediction[0, :] = np.around(A[0, :])

    assert Y_prediction.shape == (1, m)

    return Y_prediction

## 2.7 - Regression Model


In [10]:
# Implementing the Model function

def model(X_train, Y_train, X_test, Y_test, X_dev,Y_dev,num_iterations ,learning_rate, print_cost = False):
    """Train logistic regression and report accuracy on the train, test and dev sets.

    Arguments:
    X_train, Y_train -- training data and labels; X_train.shape[0] sets the
                        number of weights
    X_test, Y_test -- test data and labels
    X_dev, Y_dev -- dev data and labels
    num_iterations -- number of gradient-descent steps
    learning_rate -- step size of the update rule
    print_cost -- forwarded to optimize()

    Returns:
    d -- dict with the recorded costs, the predictions for each split, the
         learned "w" and "b", and the hyperparameters that were used
    """

    def _accuracy_percent(predicted, actual):
        # Mean absolute difference between predictions and labels, as a percentage
        return 100 - np.mean(np.abs(predicted - actual)) * 100

    w, b = initialize_with_zeros(dim=X_train.shape[0])

    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    w, b = parameters['w'], parameters['b']

    Y_prediction_train = predict(w, b, X_train)
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_dev = predict(w, b, X_dev)

    print("train accuracy: {} %".format(_accuracy_percent(Y_prediction_train, Y_train)))
    print("test accuracy: {} %".format(_accuracy_percent(Y_prediction_test, Y_test)))
    print("dev accuracy: {} %".format(_accuracy_percent(Y_prediction_dev, Y_dev)))

    d = {
        "costs": costs,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "Y_prediction_dev": Y_prediction_dev,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

    return d

## 2.8 Running the Regression Model function to Train the Model

In [11]:
# Testing with the Dataset

# model() takes the splits in the order train, test, dev
d = model(
    train_set_x, Y_Train,
    test_set_x, Y_Test,
    dev_set_x, Y_Dev,
    num_iterations=2000,
    learning_rate=0.005,
    print_cost=True,
)

Cost after iteration 0: 0.693147
Cost after iteration 100: 0.348325
Cost after iteration 200: 0.293235
Cost after iteration 300: 0.263346
Cost after iteration 400: 0.242531
Cost after iteration 500: 0.226353
Cost after iteration 600: 0.213014
Cost after iteration 700: 0.201614
Cost after iteration 800: 0.191640
Cost after iteration 900: 0.182771
Cost after iteration 1000: 0.174791
Cost after iteration 1100: 0.167547
Cost after iteration 1200: 0.160925
Cost after iteration 1300: 0.154839
Cost after iteration 1400: 0.149219
Cost after iteration 1500: 0.144009
Cost after iteration 1600: 0.139162
Cost after iteration 1700: 0.134640
Cost after iteration 1800: 0.130409
Cost after iteration 1900: 0.126441
train accuracy: 97.48549323017409 %
test accuracy: 89.59537572254335 %
dev accuracy: 91.86046511627907 %


# Section 3: Deep Neural Network

## 3.1: Initialization L-layer Neural Network

In [12]:
# Initialize the weights and biases of every layer of the network

def initialize_parameters_deep(layer_dims):
    """Create the initial weights and biases of an L-layer network.

    The global NumPy random generator is re-seeded with 1 on every call, so
    the returned weights are reproducible.

    Arguments:
    layer_dims -- python list with the size of every layer, input layer first

    Returns:
    parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1]),
                          standard-normal samples scaled by sqrt(1 / layer_dims[l-1])
                    bl -- bias vector of zeros with shape (layer_dims[l], 1)
    """
    np.random.seed(1)
    parameters = {}

    # Walk over consecutive (previous layer size, current layer size) pairs;
    # layers are numbered from 1 because index 0 is the input layer.
    layer_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        weights = np.random.randn(fan_out, fan_in) * np.sqrt(1/(fan_in))
        biases = np.zeros((fan_out, 1))

        assert weights.shape == (fan_out, fan_in)
        assert biases.shape == (fan_out, 1)

        parameters['W' + str(idx)] = weights
        parameters['b' + str(idx)] = biases

    return parameters


## 3.2: Implementing the RELU Function

In [13]:
def relu(Z):
    """
    Apply the rectified linear unit max(0, z) element-wise.

    Arguments:
    Z -- NumPy array with the output of the linear layer, of any shape

    Returns:
    A -- post-activation values, of the same shape as Z
    cache -- Z itself, kept for the backward pass
    """
    cache = Z
    A = np.maximum(0, Z)

    assert A.shape == Z.shape

    return A, cache

## 3.3: Implementing the Linear Forward and Linear Activation Forward Function

In [14]:
def linear_forward(A, W, b):
    """
    Compute the linear part Z = W.A + b of one layer's forward pass.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- pre-activation values, of shape (W.shape[0], A.shape[1])
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    cache = (A, W, b)
    Z = W.dot(A) + b

    expected_shape = (W.shape[0], A.shape[1])
    assert Z.shape == expected_shape

    return Z, cache

def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- the tuple (linear_cache, activation_cache), kept for the backward pass

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Reject unsupported activations before doing any work; otherwise neither
    # branch below would run and A would be undefined.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            'activation must be "sigmoid" or "relu", got %r' % (activation,)
        )

    # The linear step is the same for both activations.
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache


## 3.4: Implementing the L Model Forward Function

In [15]:
def L_model_forward(X, parameters):
    """
    Run the forward pass [LINEAR->RELU]*(L-1) -> LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- dict with "W1", "b1", ..., "WL", "bL"
                  (output of initialize_parameters_deep())

    Returns:
    AL -- post-activation value of the last (sigmoid) layer
    caches -- list with one cache per layer, in layer order:
                the relu layers at indices 0 to L-2
                the sigmoid layer at index L-1
    """
    # Every layer contributes one W and one b entry
    n_layers = len(parameters) // 2
    caches = []
    activation = X

    # Hidden layers 1 .. L-1 use relu
    for layer in range(1, n_layers):
        activation, layer_cache = linear_activation_forward(
            activation,
            parameters["W" + str(layer)],
            parameters["b" + str(layer)],
            "relu",
        )
        caches.append(layer_cache)

    # Output layer L uses sigmoid
    AL, output_cache = linear_activation_forward(
        activation,
        parameters["W" + str(n_layers)],
        parameters["b" + str(n_layers)],
        "sigmoid",
    )
    caches.append(output_cache)

    return AL, caches

## 3.5: Implementing the L Model Backward Function

In [16]:
def L_model_backward(AL, Y, caches):
    """
    Run the backward pass for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group.

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector; it is reshaped to the shape of AL
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (indices 0 to L-2)
                the cache of linear_activation_forward() with "sigmoid" (index L-1)

    Returns:
    grads -- A dictionary with the gradients
             grads["dW" + str(l)] and grads["db" + str(l)] for l = 1..L
             grads["dA" + str(l)] for l = 1..L; note that the value stored under
             "dA" + str(l) is the dA_prev returned by linear_activation_backward
             for layer l, i.e. the gradient with respect to that layer's input
    """
    grads = {}
    n_layers = len(caches)
    labels = Y.reshape(AL.shape)  # align the labels with the shape of AL

    # Derivative of the cross-entropy cost with respect to AL
    dAL = - (np.divide(labels, AL) - np.divide(1 - labels, 1 - AL))

    # Output layer (sigmoid)
    dA_prev, dW, db = linear_activation_backward(dAL, caches[n_layers - 1], "sigmoid")
    grads["dA" + str(n_layers)] = dA_prev
    grads["dW" + str(n_layers)] = dW
    grads["db" + str(n_layers)] = db

    # Hidden layers (relu), from the last hidden layer back to the first
    for idx in range(n_layers - 2, -1, -1):
        dA_prev, dW, db = linear_activation_backward(dA_prev, caches[idx], "relu")
        layer = str(idx + 1)
        grads["dA" + layer] = dA_prev
        grads["dW" + layer] = dW
        grads["db" + layer] = db

    return grads

## 3.6: Updating Parameters using Gradient Descent

In [17]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every layer's W and b.

    The `parameters` dictionary is updated in place and also returned.

    Arguments:
    parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary containing "dW1", "db1", ..., "dWL", "dbL"
             (output of L_model_backward)
    learning_rate -- step size of the update rule

    Returns:
    parameters -- the same dictionary, with
                  parameters["W" + str(l)] = W_l - learning_rate * dW_l
                  parameters["b" + str(l)] = b_l - learning_rate * db_l
    """
    # Every layer contributes one W and one b entry
    n_layers = len(parameters) // 2

    for layer in range(1, n_layers + 1):
        w_key = "W" + str(layer)
        b_key = "b" + str(layer)
        parameters[w_key] = parameters[w_key] - learning_rate * grads["d" + w_key]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["d" + b_key]

    return parameters


## 3.7: Implementing the L-layer Neural Network

In [18]:
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):#lr was 0.009
    """
    Train an L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    Arguments:
    X -- data, passed to L_model_forward(), which documents it as
         (input size, number of examples)
    Y -- true "label" vector, passed to compute_cost() and L_model_backward()
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 100 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    np.random.seed(1)
    cost_history = []  # filled only when print_cost is True; not returned

    parameters = initialize_parameters_deep(layers_dims)

    # Batch gradient descent
    for step in range(num_iterations):
        # Forward pass, cost, backward pass, then parameter update
        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Report and record the cost every 100 iterations
        if print_cost and step % 100 == 0:
            print ("Cost after iteration %i: %f" %(step, cost))
            cost_history.append(cost)

    return parameters

## 3.8: Computing the Cost

In [19]:
def compute_cost(AL, Y):
    """
    Compute the cross-entropy cost averaged over the examples.

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector, with as many entries as AL; it is reshaped to
         the shape of AL, so both (1, number of examples) and
         (number of examples,) are accepted

    Returns:
    cost -- cross-entropy cost, as a zero-dimensional value
    """
    # The number of examples is the second axis of AL. Y.shape[0] would be 1
    # for a (1, m) label vector and would leave the cost un-averaged.
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    # Compute loss from AL and Y.
    cost = (1./m) * (-np.dot(Y,np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T))

    cost = np.squeeze(cost)      # turns e.g. [[17]] into 17
    assert(cost.shape == ())

    return cost

## 3.9: Implementing the Linear Backward and Linear Activation Backward Function

In [20]:
def linear_backward(dZ, cache):
    """
    Compute the linear portion of the backward pass for a single layer (layer l).

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    inv_m = 1. / A_prev.shape[1]  # one over the number of examples

    dA_prev = np.dot(W.T, dZ)
    dW = inv_m * np.dot(dZ, A_prev.T)
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)

    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db

def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    linear_cache, activation_cache = cache

    # Turn the post-activation gradient into the pre-activation gradient dZ
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        # Without this branch dZ would be undefined for an unknown activation.
        raise ValueError(
            'activation must be "sigmoid" or "relu", got %r' % (activation,)
        )

    # The linear step is the same for both activations.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

## 3.10: Implementing the Sigmoid Backward Function

In [21]:
def sigmoid_backward(dA, cache):
    """
    Compute the backward pass of a single SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- the pre-activation 'Z' saved during the forward pass

    Returns:
    dZ -- Gradient of the cost with respect to Z, equal to dA * s * (1 - s)
          where s is the sigmoid of Z
    """
    Z = cache

    # Recompute the sigmoid of Z; its derivative is s * (1 - s)
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)

    assert dZ.shape == Z.shape

    return dZ

## 3.11: Implementing the RELU Backward Function

In [22]:
def relu_backward(dA, cache):
    """
    Compute the backward pass of a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape; it is not modified
    cache -- the pre-activation 'Z' saved during the forward pass

    Returns:
    dZ -- Gradient of the cost with respect to Z: a copy of dA with the
          entries where Z <= 0 set to 0
    """
    Z = cache

    # Work on a copy so the caller's dA stays untouched
    dZ = np.copy(dA)

    # The gradient does not flow through units whose input was not positive
    inactive = Z <= 0
    dZ[inactive] = 0

    assert dZ.shape == Z.shape

    return dZ

## 3.12: Training the Model

In [23]:
# Input size first, then five hidden layers, then a single output unit
layers_dims = [12288, 12, 7, 5, 9, 16, 1]

In [24]:
# learning_rate is left at the default of L_layer_model
model_parameters = L_layer_model(
    train_set_x,
    Y_Train,
    layers_dims,
    num_iterations=2500,
    print_cost=False,
)


## 3.13: Predicting the Results

In [25]:
# This function predicts the 0/1 labels of a data set with the trained L-layer network

def predict(X,y, parameters):
    """
    Predict 0/1 labels with a trained L-layer neural network.

    Note: this definition replaces the earlier logistic-regression
    predict(w, b, X) from the point where this cell is run.

    Arguments:
    X -- data set of examples you would like to label
    y -- labels of X; not used by this function
    parameters -- parameters of the trained model

    Returns:
    p -- predictions for the given dataset X, of shape (1, X.shape[1]):
         1 where the output probability is above 0.5, otherwise 0
    """
    m = X.shape[1]

    # Forward propagation; the caches are not needed for prediction
    probas, _ = L_model_forward(X, parameters)

    # Threshold the first row of probabilities at 0.5 in one step
    p = np.zeros((1, m))
    p[0, :] = probas[0, :] > 0.5

    return p

## 3.14: Predicting the Accuracy

In [26]:
# Label every split with the trained deep network
predictions_train = predict(train_set_x, Y_Train, model_parameters)
predictions_test = predict(test_set_x, Y_Test, model_parameters)
predictions_dev = predict(dev_set_x, Y_Dev, model_parameters)

# Length of the first axis of each label array, used as the divisor below
m_train = Y_Train.shape[0]
m_test = Y_Test.shape[0]
m_dev = Y_Dev.shape[0]

# Accuracy: each matching prediction contributes 1/m to the sum
train_accuracy = np.sum((predictions_train == Y_Train)/m_train)
print("Train Accuracy: " + str(train_accuracy))

test_accuracy = np.sum((predictions_test == Y_Test)/m_test)
print("Test Accuracy: " + str(test_accuracy))

dev_accuracy = np.sum((predictions_dev == Y_Dev)/m_dev)
print("Dev Accuracy: " + str(dev_accuracy))

Train Accuracy: 0.998065764023211
Test Accuracy: 0.8554913294797686
Dev Accuracy: 0.9011627906976744


# Section 4: Results

## 4.1: Logistic Regression

The last logged cost (at iteration 1900 of the 2000 training iterations) was 0.126441.

The Training Accuracy obtained was 97.49%, the Testing Accuracy obtained was 89.60% and the Dev Accuracy obtained was 91.86%.

## 4.2: Deep NN

The Training Accuracy obtained was 99.81%, the Testing Accuracy obtained was 85.55% and the Dev Accuracy obtained was 90.12%.