#### Train an MLP with 2 inputs, 3-4+ hidden units and one output on the following examples (XOR function):


Step 1) Build a perceptron
A stack of perceptrons together == hidden layer a.k.a a dense layer

In [31]:
import numpy as np
import pandas as pd
np.random.seed(42)
from sklearn.model_selection import train_test_split

In [18]:
class MLP:
    def __init__(self, X, y, hidden_units=3, epochs=100, learning_rate=.1, random_state=None):
        self.X = X
        self.y = y
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state
    
    def checkParameters(self):
        self.num_inputs = self.X.shape[1]
        self.num_outputs = self.y.shape[1]
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError('X and Y have mismatched shapes')
        if self.num_inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.num_outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        #Is it a max of 1 also??
        if self.learning_rate <=0:
            raise ValueError('Learning rate must be greater than 0')
    

    def sigmoid(self, x, deriv=False):
        if deriv is True:
            return x * (1-x)
        return (1 / (1 + np.exp(-x)))
     
    def myLogLoss(self, deriv=False):
        if deriv is True:
            return -(self.y/self.output) + ((1-self.y)/(1-self.output))
        return (1/self.num_inputs) * np.sum(-((self.y * np.log(self.output)) + (1 - self.y)  * np.log(1 - self.output)))
    
 
    def forward(self):
        self.z1 = self.X@self.w1 + self.b1
        self.a1 = np.tanh(self.z1)
        self.z2 = self.a1@self.w2 + self.b2
        self.output = self.sigmoid(self.z2)
            
        self.loss = self.myLogLoss()

    def backward(self):
        
        d_output = self.output - self.y
        d_w2 = (1/self.num_inputs) * (np.sum(d_output, axis=0, keepdims=True))
        d_b2 = (1/self.num_inputs) * (np.sum(d_output, axis=0, keepdims=True))
        d_z1 = (d_output@self.w2.T) * self.sigmoid(self.a1, deriv=True)
        d_w1 = (1/self.num_inputs) * (self.X.T@d_z1)
        d_b1 = (1/self.num_inputs) * (np.sum(d_z1, axis=0, keepdims=True))
    
        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        self.b1 -= (self.learning_rate * d_b1)
        self.b2 -= (self.learning_rate * d_b2)

    def fit(self, x, y):
        self.checkParameters()
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        #Xavier initialization to train model to converge faster
        self.w1 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.num_inputs+self.hidden_units)),
                                size = (self.num_inputs, self.hidden_units))
        self.w2 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.hidden_units+self.num_outputs)),
                                size = (self.hidden_units, self.num_outputs))
        self.b1 = np.zeros((1,self.hidden_units))       
        self.b2 = np.zeros((1,self.num_outputs))
        for i in range (self.epochs):
            self.forward()
        #    print("Epoch {} Loss is {}".format(i+1, self.loss))
            self.backward()
    
    def predict(self, x):
        self.X = x
        self.forward()
        return self.output

In [25]:
x = np.array(([0, 1],[1,0], [1,1], [0,0]))
y = np.array(([1],[1],[0],[0]))
test = MLP(x,y,4,random_state=0, epochs=100000)
test.fit(x,y)

In [26]:
test.predict(x)

array([[0.67798406],
       [0.99875246],
       [0.28563373],
       [0.03943291]])

In [28]:
data = []
y_data = []
for n in range(100):
    data.append(np.random.uniform(low=-1.0,high=1.0, size=(4, 1)))
    y_data.append(np.sin(data[-1][0] - data[-1][1] + data[-1][2] - data[-1][3]))
y_data = np.array(y_data)
X_data = np.array(data)
X_data = X_data.reshape(100, 4)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=0, test_size = 0.25, train_size = 0.75)

In [34]:
print(np.max(y_data))
print(np.min(y_data))

0.9998459796152872
-0.9996706301937582


In [35]:
class MLP:
    def __init__(self, X, y, hidden_units=3, epochs=100, learning_rate=.1, random_state=None):
        self.X = X
        self.y = y
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state

    def checkParameters(self):
        self.num_inputs = self.X.shape[1]
        self.num_outputs = self.y.shape[1]
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError('X and Y have mismatched shapes')
        if self.num_inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.num_outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        #Is it a max of 1 also??
        if self.learning_rate <=0:
            raise ValueError('Learning rate must be greater than 0')


    def sigmoid(self, x, deriv=False):
        if deriv is True:
            return x * (1-x)
        return (1 / (1 + np.exp(-x)))

    def myLogLoss(self, deriv=False):
        if deriv is True:
            return -(self.y/self.output) + ((1-self.y)/(1-self.output))
      #  print(1/self.num_inputs)
        a = (-self.y.T)
        b = (np.log(self.output))
   #     print('here:', a.shape, b.shape, b @ a)
   #         return (1/self.num_inputs) * (np.sum(-self.y.T@np.log(self.output)-(1-self.y).T@np.log(1-self.output)))

    def MSE(self):
        return np.square(self.output - self.y).mean()

    def forward(self):
        self.z1 = self.X@self.w1 + self.b1
        self.a1 = np.tanh(self.z1)
        self.z2 = self.a1@self.w2 + self.b2
        self.output = np.tanh(self.z2)
        self.loss = self.MSE()

    def backward(self):

        d_output = self.output - self.y
        d_w2 = (1/self.num_inputs) * (self.a1.T @ d_output)
        d_b2 = (1/self.num_inputs) * (np.sum(d_output, axis = 0, keepdims=True))
        d_z1 = (d_output@self.w2.T) * (1-(self.a1**2))
        d_w1 = (1/self.num_inputs) * (self.X.T@d_z1)
        d_b1 = (1/self.num_inputs) * (np.sum(d_z1, axis=0, keepdims=True))

        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        self.b1 -= (self.learning_rate * d_b1)
        self.b2 -= (self.learning_rate * d_b2)

    def fit(self, x, y):
        self.checkParameters()
        if self.random_state is not None:
            np.random.seed(self.random_state)

#         Xavier initialization to train model to converge faster
        self.w1 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.num_inputs+self.hidden_units)),
                                size = (self.num_inputs, self.hidden_units))
        self.w2 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.hidden_units+self.num_outputs)),
                                size = (self.hidden_units, self.num_outputs))
        self.b1 = np.zeros((1,self.hidden_units))
        self.b2 = np.zeros((1,self.num_outputs))
        for i in range (self.epochs):
            self.forward()
        #    print("Epoch {} Loss is {}".format(i+1, self.loss))
            self.backward()

    def predict(self, x):
        self.X = x
        self.forward()
        return self.output

In [48]:
test = MLP(X_train,y_train,4,random_state=0, epochs=10000)
test.fit(X_train,y_train)

In [45]:
predict = test.predict(X_train)

In [49]:
error = np.power((predict - y_train), 2).mean()

In [47]:
error

0.006700603535698823

In [50]:
error

0.006700603535698823