#### Train an MLP with 2 inputs, 3-4+ hidden units and one output on the following examples (XOR function):


Step 1) Build a perceptron.
Several perceptrons that read the same inputs in parallel form a hidden layer, a.k.a. a dense (fully connected) layer.

In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global random number generator so that values drawn through
# np.random are repeatable from run to run.
np.random.seed(42)

In [125]:
class MLP:
    """Single-hidden-layer perceptron trained with full-batch gradient descent.

    Architecture: inputs -> tanh hidden layer -> sigmoid output layer. The
    objective is the binary cross-entropy (log loss) averaged over samples.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_inputs)
        Training inputs.
    y : array-like, shape (n_samples, n_outputs) or (n_samples,)
        Training targets in [0, 1]. A 1-D vector is treated as one output column.
    hidden_units : int
        Number of tanh units in the hidden layer.
    epochs : int
        Number of full-batch gradient steps performed by ``fit``.
    learning_rate : float
        Step size of each gradient update; must be positive.
    random_state : int or None
        Seed for the weight initialisation. ``None`` draws from NumPy's
        global generator.
    """

    # Keeps log() and the divisions in the loss away from exactly 0 and 1,
    # which a saturated sigmoid can otherwise produce in float64.
    _EPS = 1e-12

    def __init__(self, X, y, hidden_units=3, epochs=100, learning_rate=.1, random_state=None):
        self.X = X
        self.y = y
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state

    def checkParameters(self):
        """Normalise the training data to float arrays and validate the settings.

        Sets ``num_inputs`` and ``num_outputs`` from the data.

        Raises
        ------
        ValueError
            If the data are not 2-D, have no or mismatched rows, or any
            hyper-parameter is out of range.
        """
        self.X = np.asarray(self.X, dtype=float)
        self.y = np.asarray(self.y, dtype=float)
        if self.y.ndim == 1:
            # Accept a flat label vector as a single output column.
            self.y = self.y.reshape(-1, 1)
        if self.X.ndim != 2 or self.y.ndim != 2:
            raise ValueError('X and y must be 2-dimensional')
        self.num_inputs = self.X.shape[1]
        self.num_outputs = self.y.shape[1]
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError('X and Y have mismatched shapes')
        if self.X.shape[0] < 1:
            raise ValueError('Must have at least 1 sample')
        if self.num_inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.num_outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        # No upper bound is enforced: rates above 1 are legal, merely more
        # likely to make training diverge.
        if self.learning_rate <= 0:
            raise ValueError('Learning rate must be greater than 0')

    def sigmoid(self, x, deriv=False):
        """Logistic function, or its derivative when ``deriv`` is true.

        With ``deriv`` set, ``x`` must already be a sigmoid *output* s, and
        the derivative is returned as ``s * (1 - s)``.
        """
        if deriv:
            return x * (1 - x)
        return 1 / (1 + np.exp(-x))

    def myLogLoss(self, deriv=False):
        """Binary cross-entropy of ``self.output`` against ``self.y``.

        Returns the loss summed over outputs and averaged over samples, or,
        when ``deriv`` is true, the element-wise derivative of the per-element
        loss with respect to the output.
        """
        output = np.clip(self.output, self._EPS, 1 - self._EPS)
        if deriv:
            return -(self.y / output) + (1 - self.y) / (1 - output)
        n_samples = self.y.shape[0]
        total = np.sum(self.y * np.log(output) + (1 - self.y) * np.log(1 - output))
        return -total / n_samples

    def _activations(self, X):
        """Run the network on ``X`` without touching any stored state."""
        z1 = X @ self.w1 + self.b1
        a1 = np.tanh(z1)
        z2 = a1 @ self.w2 + self.b2
        return z1, a1, z2, self.sigmoid(z2)

    def forward(self):
        """Forward pass over the training data; caches activations and the loss."""
        self.z1, self.a1, self.z2, self.output = self._activations(self.X)
        self.loss = self.myLogLoss()

    def backward(self):
        """Apply one gradient-descent update using the cached forward pass."""
        n_samples = self.X.shape[0]

        # Sigmoid + cross-entropy collapses to (output - y); dividing here
        # makes every gradient below the gradient of the *mean* loss.
        d_z2 = (self.output - self.y) / n_samples
        d_w2 = self.a1.T @ d_z2
        d_b2 = np.sum(d_z2, axis=0, keepdims=True)
        # The hidden layer is tanh, whose derivative is 1 - tanh(z)^2.
        d_z1 = (d_z2 @ self.w2.T) * (1 - self.a1 ** 2)
        d_w1 = self.X.T @ d_z1
        d_b1 = np.sum(d_z1, axis=0, keepdims=True)

        self.w1 -= self.learning_rate * d_w1
        self.w2 -= self.learning_rate * d_w2
        self.b1 -= self.learning_rate * d_b1
        self.b2 -= self.learning_rate * d_b2

    def fit(self, x=None, y=None):
        """Initialise the weights and train for ``self.epochs`` steps.

        ``x`` and ``y``, when given, replace the data passed to the
        constructor; when omitted, the constructor data are used.
        """
        if x is not None:
            self.X = x
        if y is not None:
            self.y = y
        self.checkParameters()

        # A private RandomState yields the same numbers as seeding the global
        # generator would, without disturbing other users of np.random.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Xavier/Glorot initialisation: variance 2 / (fan_in + fan_out).
        self.w1 = rng.normal(loc=0.0,
                             scale=np.sqrt(2 / (self.num_inputs + self.hidden_units)),
                             size=(self.num_inputs, self.hidden_units))
        self.w2 = rng.normal(loc=0.0,
                             scale=np.sqrt(2 / (self.hidden_units + self.num_outputs)),
                             size=(self.hidden_units, self.num_outputs))
        self.b1 = np.zeros((1, self.hidden_units))
        self.b2 = np.zeros((1, self.num_outputs))

        for _ in range(self.epochs):
            self.forward()
            self.backward()

    def predict(self, x):
        """Return hard 0/1 predictions for ``x`` (rows are samples).

        The stored training data, activations and loss are left untouched, so
        ``x`` may contain any number of rows.
        """
        x = np.atleast_2d(np.asarray(x, dtype=float))
        output = self._activations(x)[-1]
        return np.rint(output)

In [126]:
# XOR truth table: each row of x is an input pair, each row of y its label.
x = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])
y = np.array([[1], [1], [0], [0]])

# Three hidden units, fixed seed, long training run.
test = MLP(x, y, hidden_units=3, epochs=100000, random_state=0)
test.fit(x, y)
test.predict(x)

array([[1.],
       [1.],
       [0.],
       [0.]])

In [110]:
    def predict(self, x):
        """Placeholder: no prediction logic is implemented; returns None."""
        return None
   

    def myCost(self, loss):
        """Return the arithmetic mean of the given loss values."""
        average = np.mean(loss)
        return average

    def randomise(self):
        """Placeholder: nothing is randomised yet; returns None."""
        return None
    

    

    
    def mean_square(self, y):
        """Return half the summed squared difference between ``y`` and ``self.output``."""
        residual = y - self.output
        return (0.5 * residual ** 2).sum()
    
    def softmax(self, x, deriv=False):
        """Exponentiate ``x`` shifted by its maximum and normalise by the total.

        The normalisation runs over every element of ``x``. ``deriv`` is
        accepted but not used.
        """
        shifted = x - x.max()
        exps = np.exp(shifted)
        total = exps.sum()
        return exps / total
    
    def cross_entropyloss(y, yHat):
        return -np.sum(y * np.log(yHat))

    def backward(self, y):
        """
        Draft backward pass: compute gradients for target ``y`` and update
        w1, w2, b1 and b2 in place by -learning_rate * gradient.

        Intended chain rule (from the original notes):
        dError|dWeights = dweightedInput|dWeights * dActivation|dWeightedInput * dError|dActivation
        dError|dBias = dweightedInput|dBias * dActivation|dWeightedInput * dError|dActivation

        Reads self.z1, self.z2, self.hidden and self.input, none of which are
        assigned in this method -- presumably set by a forward pass; confirm.

        NOTE(review): every product below is written with ``*``; for plain
        ndarrays that is element-wise, not a matrix product -- confirm the
        operand shapes/types make this the intended operation.
        """
        # Derivative of the loss with respect to the network output.
        d_error = self.myLogLoss(y, deriv=True)
        print('d error:', d_error)  # debug output
        # NOTE(review): sigmoid(..., deriv=True) is handed the pre-activation z2
        # here; verify that matches what the sigmoid helper expects.
        d_z2 = d_error * (self.sigmoid(self.z2, deriv=True))
        d_w2 = d_z2 * self.hidden
        # np.sum without an axis collapses every element into one scalar.
        d_b2 = np.sum(d_z2)
        d_a1 = self.w2.T * d_z2
        d_z1 = d_a1 * (self.sigmoid(self.z1, deriv=True).T)
        d_w1 = d_z1 * self.input
        d_b1 = np.sum(d_z1)
    
        print(self.w1)  # debug output
        print(d_w1)  # debug output
        # Gradient-descent step on the weights ...
        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        
        # ... and on the biases (scalar gradients, applied to every bias entry).
        self.b1 -= self.learning_rate * d_b1
        self.b2 -= self.learning_rate * d_b2
    def predict(self, x):
        """Store ``x`` as the current input, run ``forward()`` and return ``self.output``."""
        self.input = x
        self.forward()
        prediction = self.output
        return prediction
#         loss = (self.myLogLoss(y))
#         der_output = self.output - y
#         der_w2 = np.dot(der_output, self.hidden.transpose())
#         der_b2 = np.sum(der_output, axis=1, keepdims=True)
        
#         der_hidden = np.multiply(np.dot(W2.T, der_output), 1 - np.power(self.hidden, 2))
#         der_w1 = np.dot(der_hidden, self.inputs.transpose())
#         der_b1 = np.sum(der_hidden, axis = 1, keepdims=True)

In [8]:
# NOTE(review): the ValueError recorded below this cell appears to be stale
# output from an earlier draft of the class -- re-run the cell to confirm.
test.predict(x)

ValueError: operands could not be broadcast together with shapes (4,2) (2,3) 