# Contents
[Forward Propagation](#Forward-Propagation)  
[Padding](#Padding)  
[Conv_Forward](#Conv_Forward)  
[Pooling_Forward](#Pooling_Forward)  
[Softmax_Forward](#Softmax_Forward)  
[Backpropagation](#Backpropagation)  
[Conv_Backward](#Conv_Backward)  
[Pooling_Backward](#Pooling_Backward)  
[Softmax_Backward](#Softmax_Backward)  

## Imports

In [6]:
import numpy as np
import matplotlib.pyplot as plt

# Temporary way of obtaining CIFAR-10: load it through the Keras dataset helper.
# load_data() is unpacked into a (train, test) pair of (inputs, labels) tuples.
from tensorflow.keras.datasets import cifar10
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

## Define Network Structure

In [None]:
def layer_structure(X, y, hidden_size):
    '''
    Placeholder for defining the structure (layer sizes) of the network.

    NOTE: not implemented yet -- the body consists of this docstring only, so
    calling the function returns None.

    Arguments:
        X -- presumably the input data; TODO confirm expected shape
        y -- presumably the labels belonging to X; TODO confirm expected shape
        hidden_size -- presumably the size of the hidden layer(s); TODO confirm
    '''

### Initialize Parameters

## Optimization

#### Activation

## Padding

In [10]:
# Zero padding 
def zero_pad(X, pad):
    """
    Surround every image of the batch X with a border of zeros.

    Only the two spatial axes (height and width) grow; the batch axis and the
    channel axis keep their size.

    Arguments:
        X -- numpy array of shape (m, n_H, n_W, n_C) representing a batch
        of m images with height n_H, width n_W, and channels n_C
        pad -- number of zero rows/columns added before and after the image
        along both the height and the width axis

    Returns:
        X_pad -- padded batch of shape (m, n_H + 2*pad, n_W + 2*pad, n_C)
    """

    # Per-axis (before, after) padding amounts.
    untouched = (0, 0)    # batch axis (m) and channel axis (n_C)
    border = (pad, pad)   # height axis (n_H) and width axis (n_W)
    pad_width = (untouched, border, border, untouched)

    # Fill the new border cells with the constant 0 on both sides of each axis.
    return np.pad(X, pad_width, mode='constant', constant_values=(0, 0))

# Forward Propagation

## Conv_Forward

In [13]:
# Convolution single step
def conv_single_step(a_slice_prev, W, b):
    """
    Apply one filter defined by parameters W on a single slice (a_slice_prev) of the output activation
    of the previous layer.
    
    Arguments:
        a_slice_prev -- slice of input data of shape (f, f, n_C_prev)
        W -- Weight parameters contained in a window - matrix of shape (f, f, n_C_prev)
        b -- Bias parameter contained in a window - a single value, given as an array of
             shape (1, 1, 1) or as a plain scalar
    
    Returns:
        Z -- a scalar value, result of convolving the sliding window (W, b) on a slice x of the input data.

    Raises:
        ValueError -- if b holds more than one element.
    """
    
    # Element-wise product of the window and the filter, summed over the whole volume.
    Z = np.sum(np.multiply(a_slice_prev, W))
    
    # Add the bias as a plain Python float so that Z stays a scalar.
    # np.float was removed in NumPy 1.24, and calling float() on an array with
    # ndim > 0 is deprecated since NumPy 1.25, so the single element is
    # extracted explicitly with .item().
    Z = Z + float(np.asarray(b).item())
    
    return Z

In [15]:
# Convolutional layer.
# All output channels of a window are computed in one vectorized step; the
# loops over examples and spatial positions are still plain Python loops and
# remain a candidate for further vectorization.
def conv_forward(A_prev, W, b, hparameters):
    """
    Implements the forward propagation for a convolution function.
    
    Arguments:
        A_prev -- output activations of the previous layer, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
        W -- Weights, numpy array of shape (f, f, n_C_prev, n_C)
        b -- Biases, numpy array of shape (1, 1, 1, n_C)
        hparameters -- python dictionary containing "stride" and "pad"
        
    Returns:
        Z -- convolution output, numpy array of shape (m, n_H, n_W, n_C)
        cache -- cache of values needed for backward propagation: (A_prev, W, b, hparameters)

    Raises:
        ValueError -- if the filter is larger than the padded input, so that no
                      output position exists.
    """
    
    # Extract dimensions from A_prev's shape.
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    
    # Extract dimensions from W's shape.
    (f, f, n_C_prev, n_C) = W.shape
    
    # Extract information from "hparameters" dictionary.
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    
    # Compute the dimensions of the CONV output volume. Floor division is used
    # on purpose: truncating toward zero would report one output position even
    # when the filter does not fit into the padded input at all.
    n_H = int((n_H_prev - f + 2 * pad) // stride) + 1
    n_W = int((n_W_prev - f + 2 * pad) // stride) + 1
    if n_H < 1 or n_W < 1:
        raise ValueError(
            "filter of size %d does not fit into the padded input of size %dx%d"
            % (f, n_H_prev + 2 * pad, n_W_prev + 2 * pad))
    
    # Initialize the output volume Z with zeros.
    Z = np.zeros((m, n_H, n_W, n_C))
    
    # Create A_prev_pad by padding A_prev using the zero_pad function.
    A_prev_pad = zero_pad(A_prev, pad)
    
    # One bias per output channel, flattened once outside the loops.
    bias = np.reshape(b, n_C)
    
    # Loop over the batch of training examples.
    for i in range(m):
        # Select the ith training example's padded activation.
        a_prev_pad = A_prev_pad[i]
        # Loop over the vertical axis of the output volume.
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            # Loop over the horizontal axis of the output volume.
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                
                # Use the corners to define the 3D slice of a_prev_pad.
                a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                
                # Contract the (f, f, n_C_prev) slice with the first three axes
                # of W: this yields the response of every filter at once, i.e.
                # one value per output channel, to which the biases are added.
                Z[i, h, w, :] = np.tensordot(a_slice_prev, W, axes=3) + bias
    
    # Save information for backpropagation.
    cache = (A_prev, W, b, hparameters)
    
    return Z, cache

In [16]:
# ReLU layer

## Pooling_Forward

In [11]:
# Pool layer
def pool_forward(A_prev, hparameters, mode="max"):
    """
    Implements the forward pass of the pooling layer
    
    Arguments:
        A_prev -- Input data, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
        hparameters -- python dictionary containing "f" and "stride"
        mode -- the pooling mode you would like to use, defined as a string ("max" or "average")
    
    Returns:
        A -- output of the pool layer, a numpy array of shape (m, n_H, n_W, n_C)
        cache -- cache used in the backward pass of the pooling layer, contains the input and hyperparameters

    Raises:
        ValueError -- if mode is neither "max" nor "average".
    """
    
    # Pick the reduction once, and reject unknown modes up front instead of
    # silently returning an all-zero output.
    if mode == "max":
        reduce_window = np.max
    elif mode == "average":
        reduce_window = np.mean
    else:
        raise ValueError('mode must be "max" or "average", got %r' % (mode,))
    
    # Extract dimensions from the input shape.
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    
    # Extract hyperparameters from "hparameters".
    f = hparameters["f"]
    stride = hparameters["stride"]
    
    # Define the dimensions of the output (pooling keeps the channel count).
    n_H = int(1 + (n_H_prev - f) / stride)
    n_W = int(1 + (n_W_prev - f) / stride)
    n_C = n_C_prev
    
    # Initialize the output matrix A.
    A = np.zeros((m, n_H, n_W, n_C))
    
    # Loop on the vertical axis of the output volume.
    for h in range(n_H):
        vert_start = h * stride
        vert_end = vert_start + f
        # Loop on the horizontal axis of the output volume.
        for w in range(n_W):
            horiz_start = w * stride
            horiz_end = horiz_start + f
            
            # Window of shape (m, f, f, n_C): every example and every channel
            # is pooled independently, so both are handled in one reduction
            # over the two spatial axes of the window.
            window = A_prev[:, vert_start:vert_end, horiz_start:horiz_end, :]
            A[:, h, w, :] = reduce_window(window, axis=(1, 2))
    
    # Store the input and hparameters in "cache" for backward prop.
    cache = (A_prev, hparameters)
    
    return A, cache

In [12]:
# FC layer @ end

## Softmax_Forward

# Loss Computation

# Backpropagation

## Conv_Backward

In [18]:
def conv_backward(dZ, cache):
    """
    Implement the backward propagation for a convolution function.
    
    Arguments:
        dZ -- gradient of the cost with respect to the output of conv layer (Z), numpy array
              of shape (m, n_H, n_W, n_C)
        cache -- cache of values needed for the conv_backward(), output of conv_forward()
        
    Returns:
        dA_prev -- gradient of the cost with respect to the input of the conv layer (A_prev),
                   numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
        dW -- gradient of the cost with respect to the weights of the conv layer (W)
              numpy array of shape (f, f, n_C_prev, n_C)
        db -- gradient of the cost with respect to the biases of the conv layer (b)
              numpy array of shape (1, 1, 1, n_C)
    """
    
    # Extract information from the "cache".
    (A_prev, W, b, hparameters) = cache
    
    # Extract dimensions from A_prev's shape.
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    
    # Extract dimensions from W's shape.
    (f, f, n_C_prev, n_C) = W.shape
    
    # Retrieve information from "hparameters".
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    
    # Retrieve dimensions from dZ's shape.
    (m, n_H, n_W, n_C) = dZ.shape
    
    # Initialize dA_prev, dW, db with the correct shapes.
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))
    
    # Pad A_prev the same way the forward pass did. The padded gradient buffer
    # starts out as all zeros, so it is allocated directly in the padded shape.
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = np.zeros(A_prev_pad.shape)
    
    # Loop over training examples.
    for i in range(m):
        
        # Select ith training example from A_prev_pad and dA_prev_pad.
        # da_prev_pad is a view, so the in-place updates below write into
        # dA_prev_pad.
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        
        # Loop over vertical axis of the output volume.
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            # Loop over horizontal axis of the output volume.
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                
                # Use the corners to define the slice from a_prev_pad.
                a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                
                # Upstream gradient of all output channels at this position.
                dz = dZ[i, h, w, :]
                
                # Gradient w.r.t. the input window: every filter weighted by
                # its channel's upstream gradient, summed over the channels
                # (sum product over the last axis of W).
                da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += np.dot(W, dz)
                
                # Gradient w.r.t. each filter: the input window scaled by that
                # filter's upstream gradient (broadcast over the channel axis).
                dW += a_slice[..., np.newaxis] * dz
        
        # Set the ith training example's dA_prev to the unpadded da_prev_pad.
        # Explicit end indices are used because a pad:-pad slice is empty
        # when pad == 0.
        dA_prev[i, :, :, :] = da_prev_pad[pad:pad + n_H_prev, pad:pad + n_W_prev, :]
    
    # Each bias receives the sum of its channel's upstream gradients over all
    # examples and positions.
    db[0, 0, 0, :] = dZ.sum(axis=(0, 1, 2))
    
    return dA_prev, dW, db

## Pooling_Backward

In [20]:
# Max pooling - backward pass
# Define a helper function to create a mask matrix which keeps track of the maximum.
def create_mask_from_window(X):
    """
    Build a boolean mask that marks where the maximum of X is located.
    
    Arguments:
        X -- Array of shape (f, f)
        
    Returns:
        mask -- Array with the same shape as X, True wherever an entry equals
                the maximum of X (every tied maximum is marked) and False elsewhere.
    """
    
    # Largest value found in the window.
    peak = np.max(X)
    
    # Element-wise comparison against that value yields the mask.
    return X == peak

# Average pooling - backward pass
# Define a helper function to distribute values from average pooling evenly.
def distribute_value(dz, shape):
    """
    Spread the value dz evenly over a matrix of the given shape.
    
    Arguments:
        dz -- input scalar
        shape -- the shape (n_H, n_W) of the output matrix over which dz is distributed.
        
    Returns:
        a -- Array of size (n_H, n_W) in which every entry equals dz / (n_H * n_W).
    """
    
    # Unpack the two dimensions of the target matrix.
    rows, cols = shape
    
    # Each cell receives an equal share of dz.
    share = dz / (rows * cols)
    
    # Scale a matrix of ones by that share.
    return share * np.ones(shape)

In [None]:
# Put it together in pooling backward.
def pool_backward(dA, cache, mode="max"):
    """
    Implements the backward pass of the pooling layer.
    
    Arguments:
        dA -- gradient of cost with respect to the output of the pooling layer, same shape as A.
        cache -- cache output from the forward pass of the pooling layer, contains the layer's
                 input and hparameters
        mode -- the pooling mode you would like to use, defined as a string ("max" or "average")
    
    Returns:
        dA_prev -- gradient of cost with respect to the input of the pooling layer,
                   same shape as A_prev

    Raises:
        ValueError -- if mode is neither "max" nor "average".
    """
    
    if mode not in ("max", "average"):
        raise ValueError('mode must be "max" or "average", got %r' % (mode,))
    
    # Retrieve the layer's input and hyperparameters from the cache.
    (A_prev, hparameters) = cache
    f = hparameters["f"]
    stride = hparameters["stride"]
    
    # Retrieve dimensions from dA's shape.
    (m, n_H, n_W, n_C) = dA.shape
    
    # Initialize dA_prev with zeros; overlapping windows accumulate into it.
    dA_prev = np.zeros(A_prev.shape)
    
    # Loop over the training examples.
    for i in range(m):
        # Select the ith training example from A_prev.
        a_prev = A_prev[i]
        # Loop on the vertical axis of the output volume.
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            # Loop on the horizontal axis of the output volume.
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                # Loop over the channels of the output volume.
                for c in range(n_C):
                    if mode == "max":
                        # Only the position(s) holding the window's maximum
                        # influenced the output, so only they receive the
                        # gradient (each tied maximum receives it in full).
                        a_prev_slice = a_prev[vert_start:vert_end, horiz_start:horiz_end, c]
                        mask = create_mask_from_window(a_prev_slice)
                        dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += mask * dA[i, h, w, c]
                    else:
                        # Every entry of the window contributed equally to the
                        # average, so the gradient is shared evenly among them.
                        dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += distribute_value(dA[i, h, w, c], (f, f))
    
    return dA_prev

## Softmax_Backward

## Parameter Update (SGD)

## Neural Network

It would be nice to have just a convolutional network here: a fairly vanilla implementation of a ConvNet.