In [87]:
import numpy as np
np.random.seed(0)

# Defining inputs(X) with their labels(y)
x = np.random.rand(100,10)
y = np.round(np.random.rand(1,10))
y = y.reshape(1,10)
m = y.size #Number of training examples

# Initializing weights, bias for the 4 layers
w1 = np.random.randn(4,100) * 0.1
w2 = np.random.randn(3,4) 
w3 = np.random.randn(2,3)
w4 = np.random.randn(1,2) * 0.001
b1 = np.zeros((4,1))
b2 = np.zeros((3,1))
b3 = np.array([[0.3],[0.5]])
b4 = 0.7

# Set learning rate and number of iterations
lr = 0.15
iterations = 50


#Function to obtain derivative of the relu activation function
def drelu(z):
    return np.where(z<0, z*0, z*0+1)

for i in range(iterations):
    # Function to predict before applying activation function
    z1 = np.dot(w1, x) + b1 
    
    # Predicted a1 (applying the relu activation function)
    a1 = np.maximum(0, z1)
    da1 = z1
    da1 = drelu(da1)
   
    # 2nd layer with relu activation function
    z2 = np.dot(w2, a1) + b2
    a2 = np.maximum(0, z2)
    da2 = z2
    da2 = drelu(da2)
        
    # 3rd layer with relu activation function
    z3 = np.dot(w3, a2) + b3
    a3 = np.maximum(0, z3)
    da3 = z3
    da3 = drelu(da3)
     
    # 4th (output) layer with sigmoid activation function
    z4 = np.dot(w4, a3) + b4
    a4 = 1 / (1 + np.exp(-z4))
    
    if i==0:
        print('Predicted labels:' + str(np.round(a4)), 'Actual labels:' + str(y))
        
    # Derivatve of the sigmoid activation function
    da4 = (a4 * (1-a4))
    
    # Calculating the loss
    loss = -1 * ((y * np.log10(a4)) + ((1-y) * np.log10((1-a4))))

    # Calculating the cost function
    cost = (np.sum(loss))/(m)
    print('cost', cost)

    # Gradient descent
    dz4 = ((-y/a4) + ((1-y)/(1-a4))) * da4
    dw4 = np.dot(dz4, a3.T)/(m)
    db4 = (np.sum(dz4, axis=1, keepdims = True))/(m)
    
    dz3 = (np.dot(w4.T,dz4)) * (da3)
    dw3 = np.dot(dz3, a2.T)/(m)
    db3 = (np.sum(dz3, axis=1, keepdims = True))/(m)
    
    dz2 = (np.dot(w3.T,dz3)) * (da2)
    dw2 = np.dot(dz2, a1.T)/(m)
    db2 = (np.sum(dz2, axis=1, keepdims = True))/(m)
    
    dz1 = (np.dot(w2.T,dz2)) * (da1)
    dw1 = np.dot(dz1, x.T)/(m)
    db1 = (np.sum(dz1, axis=1, keepdims = True))/(m)

    # Updating weights and bias
    w1 = w1 - np.dot(lr, dw1)
    b1 = b1 - np.dot(lr, db1)
    w2 = w2 - np.dot(lr, dw2)
    b2 = b2 - np.dot(lr, db2)
    w3 = w3 - np.dot(lr, dw3)
    b3 = b3 - np.dot(lr, db3)
    w4 = w4 - np.dot(lr, dw4)
    b4 = b4 - np.dot(lr, db4)
    
print('Predicted labels:' + str(np.round(a4)), 'Actual labels:' + str(y))

Predicted labels:[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]] Actual labels:[[1. 0. 0. 1. 0. 1. 1. 0. 0. 1.]]
cost 0.32724479834358766
cost 0.320316799051637
cost 0.31477704303733695
cost 0.3101540047012672
cost 0.30616057087633264
cost 0.3027782315329442
cost 0.2997266650259415
cost 0.2967586705151811
cost 0.2937960634799285
cost 0.2906057021486958
cost 0.28726913256258957
cost 0.28366265361209536
cost 0.27921208859277663
cost 0.2752226941911139
cost 0.2700192036717509
cost 0.2657457913598279
cost 0.2590455605556107
cost 0.2527096761832829
cost 0.2485019554232896
cost 0.2419467121574884
cost 0.23520489924924096
cost 0.22841093173159055
cost 0.2334246721597854
cost 0.20773040645713364
cost 0.22068053710323193
cost 0.2000854049789646
cost 0.23053286005408946
cost 0.1804844946256141
cost 0.19937168597471516
cost 0.23458337210421304
cost 0.18900338690584556
cost 0.15944229541289973
cost 0.15198286355218427
cost 0.14589823790595832
cost 0.14453696557418516
cost 0.16494843556601285
cost 0.137031800583