In [23]:
# https://towardsdatascience.com/how-to-build-a-simple-neural-network-from-scratch-with-python-9f011896d2f3

import numpy as np

def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), applied element-wise.

    `z` may be a scalar or a NumPy array; the result has the same shape.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def initialize_parameters(neurons_count_input_layer, neurons_count_hidden_layer, neurons_count_output_layer):
    """Create the starting weights and biases of a one-hidden-layer network.

    Weights are drawn from a standard normal distribution and biases start at
    zero. Layer-1 weights are drawn before layer-2 weights, so the result is
    reproducible for a given NumPy random seed.

    Returns:
        dict with keys "weight1" (hidden x input), "bias1" (hidden x 1),
        "weight2" (output x hidden) and "bias2" (output x 1).
    """
    n_in = neurons_count_input_layer
    n_hidden = neurons_count_hidden_layer
    n_out = neurons_count_output_layer

    # Dict values are evaluated top to bottom, which keeps the RNG draw order
    # (weight1 first, then weight2) stable.
    return {
        "weight1": np.random.randn(n_hidden, n_in),
        "bias1": np.zeros((n_hidden, 1)),
        "weight2": np.random.randn(n_out, n_hidden),
        "bias2": np.zeros((n_out, 1)),
    }

def forward_prop(X, parameters):
    """Run one forward pass through the two-layer network.

    The hidden layer uses tanh and the output layer uses the sigmoid.

    Args:
        X: inputs; must be compatible with np.dot(weight1, X), i.e. one
            example per column when parameters come from
            initialize_parameters.
        parameters: dict holding "weight1", "bias1", "weight2", "bias2".

    Returns:
        (A2, cache) where A2 is the output activation and cache is a dict
        with the activations "A1" and "A2" needed for back-propagation.
    """
    w1, b1 = parameters["weight1"], parameters["bias1"]
    w2, b2 = parameters["weight2"], parameters["bias2"]

    # Hidden layer: affine transform followed by tanh.
    hidden_activation = np.tanh(np.dot(w1, X) + b1)

    # Output layer: affine transform followed by the sigmoid.
    output_activation = sigmoid(np.dot(w2, hidden_activation) + b2)

    return output_activation, {"A1": hidden_activation, "A2": output_activation}

def calculate_cost(A2, Y):
    """Return the mean binary cross-entropy between predictions and labels.

    Args:
        A2: predicted probabilities, one example per column (last axis).
        Y: ground-truth labels (0 or 1), broadcast-compatible with A2.

    Returns:
        The cost summed over all entries and divided by the number of
        examples, squeezed to a scalar.
    """
    Y = np.asarray(Y)
    # Take the number of examples from the data rather than from a
    # module-level global, so the function works for any batch size.
    m = Y.shape[-1]

    # Keep probabilities strictly inside (0, 1): a saturated prediction would
    # otherwise give log(0) = -inf and 0 * -inf = nan.
    eps = 1e-15
    probs = np.clip(np.asarray(A2, dtype=float), eps, 1 - eps)

    log_likelihood = np.multiply(Y, np.log(probs)) + np.multiply(1 - Y, np.log(1 - probs))
    cost = -np.sum(log_likelihood) / m
    return np.squeeze(cost)

def backward_prop(X, Y, cache, parameters):
    """Compute cost gradients for the two-layer tanh/sigmoid network.

    Args:
        X: inputs with one example per column, shape (n_input, m).
        Y: labels, broadcast-compatible with the cached output "A2".
        cache: dict with the activations "A1" and "A2" of the forward pass.
        parameters: dict of network parameters; only "weight2" is read.

    Returns:
        dict with "dweight1", "dbias1", "dweight2" and "dbias2", each
        averaged over the m examples.
    """
    A1 = cache["A1"]
    A2 = cache["A2"]

    weight2 = parameters["weight2"]

    # Take the batch size from the data instead of relying on a global.
    m = X.shape[1]

    # Output-layer error (A2 - Y) is the gradient w.r.t. the pre-activation
    # for a sigmoid output paired with the cross-entropy cost.
    dZ2 = A2 - Y
    dweight2 = np.dot(dZ2, A1.T) / m
    dbias2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # Back-propagate through tanh, whose derivative is 1 - tanh^2 = 1 - A1^2.
    dZ1 = np.multiply(np.dot(weight2.T, dZ2), 1 - np.power(A1, 2))
    dweight1 = np.dot(dZ1, X.T) / m
    dbias1 = np.sum(dZ1, axis=1, keepdims=True) / m

    return {
        "dweight1": dweight1,
        "dbias1": dbias1,
        "dweight2": dweight2,
        "dbias2": dbias2,
    }

def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step and return the new parameters.

    Each parameter p is replaced by p - learning_rate * dp, where dp is the
    entry of `grads` whose key is the parameter name prefixed with "d".
    The input dicts are not modified.

    Returns:
        A new dict with keys "weight1", "weight2", "bias1", "bias2".
    """
    names = ("weight1", "weight2", "bias1", "bias2")
    return {
        name: parameters[name] - learning_rate * grads["d" + name]
        for name in names
    }


def model(X, Y, neurons_count_input_layer, neurons_count_hidden_layer, neurons_count_output_layer, num_of_iters, learning_rate):
    """Train the two-layer network with batch gradient descent.

    Runs iterations 0..num_of_iters inclusive (num_of_iters + 1 updates).
    Every 50th iteration the cost and the gradients computed in that
    iteration are printed.

    Returns:
        The parameter dict after the final update.
    """
    parameters = initialize_parameters(
        neurons_count_input_layer,
        neurons_count_hidden_layer,
        neurons_count_output_layer,
    )

    for iteration in range(num_of_iters + 1):
        output, cache = forward_prop(X, parameters)
        cost = calculate_cost(output, Y)
        grads = backward_prop(X, Y, cache, parameters)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Periodic progress report; cost and grads refer to the parameters
        # as they were before this iteration's update.
        if iteration % 50 == 0:
            print('Cost after iteration# {:d}: {:f}'.format(iteration, cost))
            print('grads', grads)

    return parameters

def predict(X, parameters):
    """Predict binary labels by thresholding the network output at 0.5.

    Args:
        X: inputs with one example per column.
        parameters: trained parameter dict as accepted by forward_prop.

    Returns:
        A plain int (0 or 1) when the squeezed network output is a single
        value, as for one example and one output neuron. Otherwise an
        integer NumPy array of 0/1 labels with the squeezed output's shape.
    """
    a2, _ = forward_prop(X, parameters)

    # Threshold element-wise so a batch of examples does not trigger the
    # "truth value of an array is ambiguous" error of a bare `if`.
    labels = np.asarray(np.squeeze(a2) >= 0.5).astype(int)

    if labels.ndim == 0:
        # Single prediction: keep returning a plain Python int.
        return int(labels)
    return labels
    


# Fix the RNG seed so the random weight initialisation is reproducible.
np.random.seed(2)

# The 4 training examples by columns (row 0 = first input, row 1 = second input)
X = np.array([[0, 0, 1, 1], [0, 1, 0, 1]])

# The outputs of the XOR for every example in X
Y = np.array([[0, 1, 1, 0]])

# No. of training examples (one column of X per example)
m = X.shape[1]

# Set the hyperparameters
neurons_count_input_layer = 2     # No. of neurons in the input layer
neurons_count_hidden_layer = 2     # No. of neurons in the hidden layer
neurons_count_output_layer = 1     # No. of neurons in the output layer
num_of_iters = 1000                # Upper bound of the training loop in model()
learning_rate = 0.3                # Step size of each gradient-descent update

# Train the network; model() prints the cost and gradients every 50 iterations.
trained_parameters = model(X, Y, neurons_count_input_layer, neurons_count_hidden_layer, neurons_count_output_layer, num_of_iters, learning_rate)

# Test 2x1 column vector whose elements are XOR-ed by the trained network.
# Try (0, 0), (0, 1), (1, 0), (1, 1)
X_test = np.array([[1], [1]])

y_predict = predict(X_test, trained_parameters)

# Report the two test inputs and the predicted label (0 or 1).
print('Neural Network prediction for example ({:d}, {:d}) is {:d}'.format(
    X_test[0][0], X_test[1][0], y_predict))


Cost after iteration# 0: 0.856267
grads {'dweight1': array([[-0.20850296,  0.02049162],
       [-0.12492949, -0.10745412]]), 'dbias1': array([[-0.13603693],
       [-0.21063238]]), 'dweight2': array([[-0.05726626, -0.19815821]]), 'dbias2': array([[0.1054896]])}
Cost after iteration# 50: 0.503667
grads {'dweight1': array([[-0.02761564,  0.01814891],
       [ 0.02764883, -0.02626384]]), 'dbias1': array([[-0.00162985],
       [-0.02037731]]), 'dweight2': array([[0.02356708, 0.06216107]]), 'dbias2': array([[-0.03324471]])}
Cost after iteration# 100: 0.347426
grads {'dweight1': array([[-0.05169183,  0.03862595],
       [ 0.03076282, -0.01913592]]), 'dbias1': array([[-0.03096171],
       [-0.0077317 ]]), 'dweight2': array([[0.03170799, 0.04121168]]), 'dbias2': array([[-0.06792659]])}
Cost after iteration# 150: 0.171442
grads {'dweight1': array([[-0.03675009,  0.03477031],
       [ 0.01519262, -0.0112744 ]]), 'dbias1': array([[-0.02267367],
       [-0.00605038]]), 'dweight2': array([[0.029528