#### Train an MLP with 2 inputs, 3-4+ hidden units and one output on the following examples (XOR function):


Step 1) Build a perceptron
A stack of perceptrons together == hidden layer a.k.a a dense layer

In [1]:
import numpy as np
import pandas as pd
np.random.seed(42)

In [2]:
class MLP:
    def __init__(self, inputs, outputs, hidden_units=3, epochs=500, learning_rate=.1, random_state=None):
        self.inputs = inputs
        self.outputs = outputs
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state
    def checkParameters(self):
        if self.inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        #Is it a max of 1 also??
        if self.learning_rate <=0:
            raise ValueError('Learning rate must be greater than 0')
    
    def fit(self, x, y):
        self.checkParameters()
        self.input = x
        self.output = y
        if self.random_state is not None:
            np.random.seed(self.random_state)
        #Xavier initialization to train model to converge faster
        self.w1 = np.random.normal(loc=0.0,
                              scale=np.sqrt(2/(self.inputs+self.hidden_units)),
                             size = (self.inputs, self.hidden_units))
        self.w2 = np.random.normal(loc=0.0,
                              scale=np.sqrt(2/(self.hidden_units+self.outputs)),
                             size = (self.hidden_units, self.outputs))
        self.b1 = np.zeros(self.hidden_units)        
        self.b2 = np.zeros(self.outputs)
        for i in range(self.epochs):
            for j in range(0, x.shape[0]):
                self.forward(x[j])
                self.backward(y[j])
#             self.backward(y)
        
    def predict(self, x):
        pass
    
    def myLogLoss(self, y, deriv=False):
        if deriv is True:
            return -(y/self.output) + ((1-y)/(1-self.output))
        return -((y * np.log(self.output)) + (1 - y)  * np.log(1 - self.output))

    def myCost(self, loss):
        return np.mean(loss)

    def randomise(self):
        pass
    
    def sigmoid(self, x, deriv=False):
        if deriv is True:
            return x * (1-x)
        return (1 / (1 + np.exp(-x)))
    
    def forward(self):
        self.z1 = np.dot(training_sample, self.w1) + self.b1
        self.hidden = self.sigmoid(self.z1)
        self.z2 = np.dot(self.hidden, self.w2) + self.b2
        self.output = self.sigmoid(self.z2)
    
    def mean_square(self, y):
        return (((y - self.output)**2)*.5).sum() 
    
    def softmax(self, x, deriv=False):
        y =  np.exp(x - (x.max()))
        return y / y.sum()
    
    def cross_entropyloss(y, yHat):
        return -np.sum(y * np.log(yHat))

    def backward(self, y):
        """
        Delta of weight = -learning_rate(derError/DerWeight)
        For the first layer 
        dError|dWeights = dweightedInput|dWeights * dActivation|dWeightedInput * dError|dActivation
        dError|dBias = dweightedInput|dBias * dActivation|dWeightedInput * dError|dActivation
        """
        d_error = self.myLogLoss(y, deriv=True)
        print('d error:', d_error)
        d_z2 = d_error * (self.sigmoid(self.z2, deriv=True))
        d_w2 = d_z2 * self.hidden
        d_b2 = np.sum(d_z2)
        d_a1 = self.w2.T * d_z2
        d_z1 = d_a1 * (self.sigmoid(self.z1, deriv=True).T)
        d_w1 = d_z1 * self.input
        d_b1 = np.sum(d_z1)
    
        print(self.w1)
        print(d_w1)
        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        
        self.b1 -= self.learning_rate * d_b1
        self.b2 -= self.learning_rate * d_b2
    def predict(self, x):
        self.input = x
        self.forward()
        return self.output
#         loss = (self.myLogLoss(y))
#         der_output = self.output - y
#         der_w2 = np.dot(der_output, self.hidden.transpose())
#         der_b2 = np.sum(der_output, axis=1, keepdims=True)
        
#         der_hidden = np.multuply(np.dot(W2.T, der_output), 1 - np.power(self.hidden, 2))
#         der_w1 = np.dot(der_hidden, self.inputs.transpose())
#         der_b1 = np.sum(der_hidden, azis = 1, keepdims=True)

In [3]:
x = np.array(([0, 1],[1,0], [1,1], [0,0]))
y = np.array(([1],[1],[0],[0]))
test = MLP(2,1,3,random_state=0, epochs=1)
test.fit(x,y)

d error: [-1.64836792]
[[ 1.11568467  0.25308164  0.61900825]
 [ 1.4172653   1.18114738 -0.6180848 ]]
[[ 0.         -0.         -0.        ]
 [ 0.16080805 -0.00926872 -0.02954532]]


ValueError: non-broadcastable output operand with shape (3,1) doesn't match the broadcast shape (3,3)

In [27]:
test.predict(x)

array([[0.49456415],
       [0.48737562],
       [0.49391868],
       [0.47561739]])