Based on repo [here](https://github.com/kavanp/Neural-Network-in-Numpy/blob/master/Neural%20Network%20with%20Numpy.ipynb)

In [1]:
class Neural_Network (object):
    def __init__(self,i,j,k):
        #define hyperparameters
        self.inputLayerSize = i
        self.outputLayerSize = j
        self.hiddenLayerSize = k

        #randomly initialize weights according to input, hidden and output layer size
        self.W1 = np.random.rand(self.inputLayerSize,self.hiddenLayerSize)
        self.W2 = np.random.rand(self.hiddenLayerSize,self.outputLayerSize)

    def forward(self,X):
        # A forward function that calculates output of the network with given input data
        self.z2 = np.dot(X,self.W1)
        self.a2 = self.sigmoid(self.z2) #An activation function defined below
        self.z3 = np.dot(self.a2,self.W2)
        yHat = self.sigmoid(self.z3)
        return yHat

    def sigmoid(self,z):
        #sigmoid activation function to intoduce non-linearity
        return 1/(1+np.exp(-z))

    def sigmoidprime(self,z):
        # Derivative of sigmoid function, this will be used to while backpropagating the error during Gradient Descent
        return np.exp(-z)/(1+np.exp(-z)**2)

    def costFunction (self,X,y):
        #Calculate error
        self.yHat = self.forward(X)
        return 1/2*sum((y-self.yHat)**2)

    def costFunctionPrime (self,X,y):
        # Calculate derivative of error with respect to each weight, I recommend to refer the math behind below calculations         
        self.yHat = self.forward(X)

        delta3 = np.multiply(-(y - self.yHat),self.sigmoidprime(self.z3))
        dJdW2 = np.dot(self.a2.T,delta3) #Change in error with respect to change in W2

        delta2 = np.dot(delta3,self.W2.T)*self.sigmoidprime(self.z2)
        dJdW1 = np.dot(X.T,delta2) #Change in error with respect to change in W1 

        return dJdW1,dJdW2

In [2]:
import numpy as np

X = np.array([[0,0],[0,1],[1,0],[1,1]])
Y = np.array([[0],[1],[1],[1]])

In [3]:
NeuralNet = Neural_Network(2,1,3)
NeuralNet.forward(X)

array([[0.55099555],
       [0.55835914],
       [0.56272449],
       [0.56941217]])

In [4]:
#Update the weights according to Gradient Descent.
for i in range(1000):
    djdw1,djdw2 = NeuralNet.costFunctionPrime(X,Y)
    NeuralNet.W1 = NeuralNet.W1 - 0.1*djdw1
    NeuralNet.W2 = NeuralNet.W2 - 0.1*djdw2

In [5]:
NeuralNet.forward([[0,0],[0,1],[1,0],[1,1]])

array([[0.67943413],
       [0.78301386],
       [0.78675018],
       [0.81169713]])

In [6]:
#Update the weights according to Gradient Descent.
lr = 0.1
for i in range(10000):
    djdw1,djdw2 = NeuralNet.costFunctionPrime(X,Y)
    NeuralNet.W1 = NeuralNet.W1 - lr*djdw1
    NeuralNet.W2 = NeuralNet.W2 - lr*djdw2

In [7]:
NeuralNet.forward([[0,0],[0,1],[1,0],[1,1]])

array([[0.05939922],
       [0.96836322],
       [0.96892061],
       [0.97813648]])