In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.stats as s

## Activations:
Sigmoid:-
         $$ \frac{1}{1 + e^{-x}}$$
Relu :- 
         $$\max({0},{z})$$

In [3]:
class Activations:
    """
    Activations(their function and derivative) used:
    sigmoid and 
    relu  
    Reurns:
    Acitvation output and their acitvation cache"""
    
    def sigmoid(Z):
        """
        Calculates sigmoid function
        Returns:
        A: Output of sigmoid functiona
        cache:saves the linear part of activation so as to use the info in calculation of backward prop
        """
        e = np.exp(-Z)
        A = 1/(1+e)
        cache = Z
        return A,cache
    def relu(Z):
        """
        Calculates relu function
        Returns:
        A: Output of relu function
        cache:saves the linear part of activation so as to use the info in calculation of backward prop
        """
        A = np.maximum(0,Z)
        cache=Z
        return A,cache
    def sigmoid_backward(dA,cache):
        """
        Calculates derivative of sigmoid function for backward prop
        Returns:
        dZ:Output of dervative of sigmoid function
        """
        Z = cache
        s = 1/(1+np.exp(-Z))
        dZ = dA *s *s
        return dZ
    def relu_backwward(dA,cache):
        """
        Calculates derivative of relu function for backward prop
        Returns:
        dZ:Output of derivative of relu function
        """
        Z = cache
        dZ = np.array(dA,copy=True)
        dZ[Z<=0] = 0
        return dZ
    

## Parameter Initialisation
Default Intialisation :- Random initialisation of W(weight matrix) and zero initilisation of b(bias vector)\
He Inntialisation :- He Intialisation ,i.e., basically multiplying the default intialisation(both random) with $\nu$.Here,$\nu^2$ is
                     $$\nu^2 = \frac{2}{N}$$

In [4]:
class Parameters:
    def layer_parameter_initialise(layer_dimensions,initialiser = "default"):
        """"
        layers_dimensions = dimensions of layer
        initialiser = Type of intialiser (He or default)
        
        Returns:
        parameters = python dictionary(Initialised parameters (W,b))
        W = Weight matrix 
        b = bias vector
        """        
        parameters = {}
        layers = len(layer_dimensions)
        if initialiser=='He':
            for i in range(1,layers):
                parameters['W' + str(i)] = np.random.randn(layer_dimensions[i],layer_dimensions[i-1])*(np.sqrt(2/(i)))
                parameters['b' + str(i)] = np.random.randn(layer_dimensions[i],1)*(np.sqrt(2/(i)))
        else:
            for i in range(1,layers):
                parameters['W' + str(i)] = np.random.randn(layer_dimensions[i],layer_dimensions[i-1])*0.01
                parameters['b' + str(i)] = np.zeros((layer_dimensions[i],1))
        return parameters

In [5]:
class Layer_activation_function(Activations):
    def linear_forward(A,W,b):
        """
        A = acitivation from previous layer(previous layer,number of examples)
        W = Weight matrix(size of current layer , size of previous layer)
        b = bias vector (current layer,1)
        Returns:
        Z = Linear part of activation
        cache = python tuple containing(A,W,b)
        """
        Z = np.dot(W,A) + b
        cache = (A,W,b)
        return Z ,cache
    def linear_forward_activations(A_p,W,b,activation):
        """
        A = acitivation from previous layer
        W = Weight matrix(size of current layer , size of previous layer)
        b = bias vector (current layer,1)
        activation = type of activations as string(sigmoid or relu)
        Returns:
        A = output of activation function
        cache = python tuple containing linear_cahe and activation_cache for backward pass calculations
        """
        if activation == 'sigmoid':
            Z,linear_cache = Layer_activation_function.linear_forward(A_p,W,b)
            A,activation_cache = Activations.sigmoid(Z)
        elif activation == 'relu':
            Z,linear_cache = Layer_activation_function.linear_forward(A_p,W,b)
            A,activation_cache = Activations.relu(Z)
        cache = (linear_cache,activation_cache)
        return A,cache

In [6]:
def forward_pass(X,parameters):
    """
    Forward Prop with LINEAR -> RELU ----> LINEAR ->SIGMOID layer architecture
    X : data(numpy array with (input_size,number_of_examples))
    parameters: python dictionary ,containing output of layer_parameter_initialise function of Parameters class.
    Returns:
    Al : Output of last activation 
    cachces : a list of caches containing caches of linear_activations of class Layer_activation_function
    """
    caches =[]
    A = X
    #number of layers 
    L = len(parameters)//2 
    for i in range(1,L):
        A ,cache = Layer_activation_function.linear_forward_activations(A,parameters['W' + str(i)], parameters['b' +str(i)],activation='relu')
        caches.append(cache)
    Al,cache = Layer_activation_function.linear_forward_activations(A , parameters['W'+str(L)],parameters['b'+str(L)],activation = 'sigmoid')
    caches.append(cache)
    return Al,caches

In [7]:
np.random.seed(6)
X = np.random.randn(5,4)
W1 = np.random.randn(4,5)
b1 = np.random.randn(4,1)
W2 = np.random.randn(3,4)
b2 = np.random.randn(3,1)
W3 = np.random.randn(1,3)
b3 = np.random.randn(1,1)
  
parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3}
AL, caches = forward_pass(X, parameters)
print("AL = " + str(AL))
print("Length of caches list = " + str(len(caches)))

AL = [[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches list = 3


## Cost 
Cost function:-
                $\-y(\log(p) + (1-y)(\log(1-p)\$

In [8]:
class Cost:
    def compute_cost(Al,Y):
        """
        Calculates the cost function
        Al:
        Y: label vector (1,number_of_examples)
        Returns:
        cost ---> cross-entropy cost
        """
        m = Y.shape[1]                  # number of examples
        cost =  - (1/m) * (np.dot(Y,np.log(Al).T) + np.dot(1-Y,np.log(1-Al).T))
        cost = np.squeeze(cost)
        return cost

In [9]:
Y = np.asarray([[1, 1, 0]])
AL = np.array([[.8,.9,0.4]])
print("cost = " + str(Cost.compute_cost(AL, Y)))

cost = 0.2797765635793422


In [10]:
class Backward_Activations(Activations):
    def linear_backward(dZ,cache):
        """
        Linear part of the backward activation
        dZ : Gradient of the cost w.r.t. output of current layer(layer l)
        cache : tuple of values (A_prev, W, b) coming from the forward propagation in the current layer
        Returns :
        dA_prev --> gradient w.r.t acitvation(previous layer (l-1)) , same shape as A_prev
        dW --> gradient w.r.t. W (current layer l),same shape as W
        db -->gradient w.r.t. b(current layer l),same shape as b
        """
        A_prev,W,b = cache
        m = A_prev.shape[1]
        dW = -(1/m)*np.dot(dZ,A_prev.T)
        db = (1/m) * np.sum(dZ,axis=1,keepdims=True)
        dA_prev = np.dot(W.T,dZ)
        return dA_prev,dW,db
    
    def linear_backward_activations(dA ,cache,activation):
        """
        Complete Activation function for backward prop
        dA: post-activation gradient for current layer
        cache:tuple containing linear cache and activation cache
        activation:type of activation (sigmoid,relu)
        Return:
        dA_prev --> Gradient of the cost w.r.t. activation(previous layer l-1), same shape as A_prev
        dW --> Gradient of the cost w.r.t. W (current layer l), same shape as W
        db --> Gradient of the cost w.r.t. b (current layer l), same shape as b
        """
        linear_cache , activation_cache = cache
        if activation == "sigmoid":
            dZ = Activations.sigmoid_backward(dA,activation_cache)
            dA_prev, dW, db = Backward_Activations.linear_backward(dZ,linear_cache)
        elif activation=="relu":
            dZ = Activations.relu_backward(dA,activation_cache)
            dA_prev, dW, db = Backward_Activations.linear_backward(dZ,linear_cache)
        return dA_prev, dW, db

In [11]:
def backward_pass(Al,Y,caches):
    """
    Backward prop
    Al:probability vector 
    Y: true label vector
    caches:list of caches containing:
                every cache of linear_activation_forward() with "relu" and with "sigmoid"
    Returns:
    grads--> python dictionary, containing gradients of dA,dW,db
    """
    grads ={}
    L = len(caches)
    m=Al.shape[1]
    Y = Y.reshape(Al.shape)
    dAl = -(np.divide(Y,Al) - np.divide(1-Y,1-Al))
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = Backward_Activations.linear_backward_activations(dAl,current_cache,activation='sigmoid')
    for l in reversed(range(L-1)):
        current_cache = caches[l-1]
        dA_prev_temp, dW_temp, db_temp = Backward_Activations.linear_backward_activations(dAl,current_cache,activation='sigmoid')
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads

In [12]:
np.random.seed(3)
AL = np.random.randn(1, 2)
Y_assess = np.array([[1, 0]])

A1 = np.random.randn(4,2)
W1 = np.random.randn(3,4)
b1 = np.random.randn(3,1)
Z1 = np.random.randn(3,2)
linear_cache_activation_1 = ((A1, W1, b1), Z1)

A2 = np.random.randn(3,2)
W2 = np.random.randn(1,3)
b2 = np.random.randn(1,1)
Z2 = np.random.randn(1,2)
linear_cache_activation_2 = ((A2, W2, b2), Z2)

caches = (linear_cache_activation_1, linear_cache_activation_2)

grads = backward_pass(AL, Y_assess, caches)
print(grads)

{'dA1': array([[ 0.2465359 , -0.30822344],
       [-0.2706392 ,  0.3383578 ],
       [ 0.03176322, -0.03971093]]), 'dW2': array([[ 0.42517377,  0.04556475, -0.15219449]]), 'db2': array([[0.03012452]]), 'dA0': array([[ 0.2465359 , -0.30822344],
       [-0.2706392 ,  0.3383578 ],
       [ 0.03176322, -0.03971093]]), 'dW1': array([[ 0.42517377,  0.04556475, -0.15219449]]), 'db1': array([[0.03012452]])}


## Update Parameters
Applying gradient descent:-
$$\frac{1}{N}\sum_{i=0}^{i=N}(y - \alpha * dy)$$



In [13]:
def update_parameters(parameters,grads,learning_rate):
    """
    Update parameters using gradient descent
    parameters:python dictionary ,containing parameters 
    grads: python dictionary containing your gradients, output of backward_pass
    
    Returns:
    parameters --> python dictionary containing your updated parameters(W,b)
    """
    L = len(parameters) // 2
    for l in range(L):
            parameters["W" + str(l+1)] = parameters['W'+str(l+1)] - learning_rate * grads['dW' + str(l+1)]
            parameters["b" + str(l+1)] = parameters['b'+str(l+1)] - learning_rate * grads['db' + str(l+1)]
    return parameters