#### Train an MLP with 2 inputs, 3-4+ hidden units and one output on the following examples (XOR function):


Step 1) Build a perceptron
A stack of perceptrons together == hidden layer a.k.a a dense layer

In [31]:
import numpy as np
import pandas as pd
np.random.seed(42)
from sklearn.model_selection import train_test_split

In [185]:
class MLP:
    def __init__(self, X, y, hidden_units=3, epochs=100, learning_rate=.1, random_state=None):
        self.X = X
        self.y = y
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state
    
    def checkParameters(self):
        self.num_inputs = self.X.shape[1]
        self.num_outputs = self.y.shape[1]
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError('X and Y have mismatched shapes')
        if self.num_inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.num_outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        #Is it a max of 1 also??
        if self.learning_rate <=0:
            raise ValueError('Learning rate must be greater than 0')
    
    def tan_h_deriv(self, x):
        return 1 - (x**2)
    def sigmoid(self, x, deriv=False):
        sig = (1 / (1 + np.exp(-x)))
        if deriv is True:
            return x * (1-x)
        return (1 / (1 + np.exp(-x)))
     
    def myLogLoss(self, deriv=False):
        #epsilon is to avoid divide by zero errors
        epsilon = 1e-5
        if deriv is True:
            return -(self.y/self.output) + ((1-self.y)/(1-self.output))
        return (1/self.num_inputs) * np.sum(-((self.y * np.log(self.output+epsilon)) + (1 - self.y)  * np.log(1 - self.output + epsilon)))
    
 
    def forward(self):
        self.z1 = self.X@self.w1 + self.b1
        self.a1 = np.tanh(self.z1)
        self.z2 = self.a1@self.w2 + self.b2
        self.output = self.sigmoid(self.z2)
   
    
    def backward(self):
        self.loss = self.myLogLoss()
        d_output = self.myLogLoss(deriv=True) * self.sigmoid(self.output, deriv=True)
        d_w2 = (1/self.num_inputs) * (self.a1.T @ d_output)
        d_b2 = (1/self.num_inputs) * (np.sum(d_output, axis = 0, keepdims=True))
        d_z1 = (d_output@self.w2.T) * self.tan_h_deriv(self.a1)
        d_w1 = (1/self.num_inputs) * (self.X.T@d_z1)
        d_b1 = (1/self.num_inputs) * (np.sum(d_z1, axis=0, keepdims=True))

        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        self.b1 -= (self.learning_rate * d_b1)
        self.b2 -= (self.learning_rate * d_b2)

    def fit(self, x, y):
        self.checkParameters()
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        #Xavier initialization to train model to converge faster
        self.w1 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.num_inputs+self.hidden_units)),
                                size = (self.num_inputs, self.hidden_units))
        self.w2 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.hidden_units+self.num_outputs)),
                                size = (self.hidden_units, self.num_outputs))
        self.b1 = np.zeros((1,self.hidden_units))       
        self.b2 = np.zeros((1,self.num_outputs))
        for i in range (self.epochs):
            self.forward()
        #    print("Epoch {} Loss is {}".format(i+1, self.loss))
            self.backward()
    
    def predict(self, x):
        self.X = x
        self.forward()
        return self.output

In [188]:
x = np.array(([0, 1],[1,0], [1,1], [0,0]))
y = np.array(([1],[1],[0],[0]))
test = MLP(x,y,4,random_state=0, epochs=100000)
test.fit(x,y)

In [189]:
np.set_printoptions(suppress=True)
test.predict(x)

array([[0.99985229],
       [0.99998968],
       [0.00014796],
       [0.00009289]])

In [170]:
test = np.array([6.43340103, 8.42472798, -6.41357507])
sigmoid(-7.04127255)
sigmoid(test)

array([0.9983956 , 0.99978067, 0.00163647])

In [152]:
def sigmoid(x, deriv=False):
    if deriv is True:
        return x * (1-x)
    return (1 / (1 + np.exp(-x)))

In [28]:
data = []
y_data = []
for n in range(100):
    data.append(np.random.uniform(low=-1.0,high=1.0, size=(4, 1)))
    y_data.append(np.sin(data[-1][0] - data[-1][1] + data[-1][2] - data[-1][3]))
y_data = np.array(y_data)
X_data = np.array(data)
X_data = X_data.reshape(100, 4)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=0, test_size = 0.25, train_size = 0.75)

In [34]:
print(np.max(y_data))
print(np.min(y_data))

0.9998459796152872
-0.9996706301937582


In [190]:
class MLP:
    def __init__(self, X, y, hidden_units=3, epochs=100, learning_rate=.1, random_state=None):
        self.X = X
        self.y = y
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state

    def checkParameters(self):
        self.num_inputs = self.X.shape[1]
        self.num_outputs = self.y.shape[1]
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError('X and Y have mismatched shapes')
        if self.num_inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.num_outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        #Is it a max of 1 also??
        if self.learning_rate <=0:
            raise ValueError('Learning rate must be greater than 0')


    def sigmoid(self, x, deriv=False):
        if deriv is True:
            return x * (1-x)
        return (1 / (1 + np.exp(-x)))

    def myLogLoss(self, deriv=False):
        #epsilon is to avoid divide by zero errors
        epsilon = 1e-5
    
        if deriv is True:
            return -(self.y/self.output) + ((1-self.y)/(1-self.output))
        return (1/self.num_inputs) * np.sum(-((self.y * np.log(self.output+epsilon)) + (1 - self.y)  * np.log(1 - self.output + epsilon)))
    
    def MSE(self, deriv=False):
        if deriv == True:
            return -2 * (self.y - self.output)
        return (np.square(self.y - self.output).mean())

    def forward(self):
        self.z1 = self.X@self.w1 + self.b1
        self.a1 = np.tanh(self.z1)
        self.z2 = self.a1@self.w2 + self.b2
        self.output = np.tanh(self.z2)
    
    def tan_h_deriv(self, x):
        return 1 - (x**2)
    
    def backward(self):
        self.loss = self.MSE()
        #added the times F'() see results four. Without it resutlts1
      #  d_output = (self.output - self.y) * (1-(self.output**2))
        d_output = self.MSE(deriv=True) * self.tan_h_deriv(self.output)
        d_w2 = (1/self.num_inputs) * (self.a1.T @ d_output)
        d_b2 = (1/self.num_inputs) * (np.sum(d_output, axis = 0, keepdims=True))
        d_z1 = (d_output@self.w2.T) * self.tan_h_deriv(self.a1)
        d_w1 = (1/self.num_inputs) * (self.X.T@d_z1)
        d_b1 = (1/self.num_inputs) * (np.sum(d_z1, axis=0, keepdims=True))

        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        self.b1 -= (self.learning_rate * d_b1)
        self.b2 -= (self.learning_rate * d_b2)

    def fit(self, x, y):
        self.checkParameters()
        if self.random_state is not None:
            np.random.seed(self.random_state)

#         Xavier initialization to train model to converge faster
        self.w1 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.num_inputs+self.hidden_units)),
                                size = (self.num_inputs, self.hidden_units))
        self.w2 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.hidden_units+self.num_outputs)),
                                size = (self.hidden_units, self.num_outputs))
        self.b1 = np.zeros((1,self.hidden_units))
        self.b2 = np.zeros((1,self.num_outputs))
        for i in range (self.epochs):
            self.forward()
        #    print("Epoch {} Loss is {}".format(i+1, self.loss))
            self.backward()

    def predict(self, x):
        self.X = x
        self.forward()
        return self.output

In [103]:
results6 = pd.DataFrame(index=['epochs', 'hidden units', 'train_error', 'test_error'])

In [83]:
results4 = pd.DataFrame(index=['epochs', 'hidden units', 'train_error', 'test_error'])

In [58]:
results = pd.DataFrame(index=['epochs', 'hidden units', 'train_error', 'test_error'])

In [66]:
results2 = pd.DataFrame(index=['epochs', 'hidden units', 'train_error', 'test_error'])

In [70]:
results3 = pd.DataFrame(index=['epochs', 'hidden units', 'train_error', 'test_error'])

In [59]:
def SSE(predict, y):
    return np.power((predict - y), 2).mean()

In [60]:
test_epochs = [1000, 10000, 100000, 1000000]

In [104]:
model = 1
for epoch in test_epochs:
    for hidden in range(2, 6):
        print('testing model {}'.format(model))
        test = MLP(X_train,y_train,hidden,random_state=0, epochs=epoch)
        test.fit(X_train, y_train)
        train_error = SSE(test.predict(X_train), y_train)
        test_error = SSE(test.predict(X_test), y_test)
        results6[model] = [epoch, hidden, train_error, test_error]
        model += 1

testing model 1
testing model 2
testing model 3
testing model 4
testing model 5
testing model 6
testing model 7
testing model 8
testing model 9
testing model 10
testing model 11
testing model 12
testing model 13
testing model 14
testing model 15
testing model 16


In [63]:
results

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
epochs,1000.0,1000.0,1000.0,1000.0,10000.0,10000.0,10000.0,10000.0,100000.0,100000.0,100000.0,100000.0,1000000.0,1000000.0,1000000.0,1000000.0
hidden units,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0
train_error,0.025814,0.02937,0.015715,0.018677,0.020232,0.029754,0.009606,0.016511,0.020245,0.030341,0.006701,0.018379,0.02025,0.026067,0.00649,0.015274
test_error,0.042875,0.04252,0.038239,0.029322,0.038879,0.049897,0.051454,0.044841,0.03891,0.057722,0.065705,0.063738,0.038916,0.086603,0.070086,0.06389


In [85]:
results4

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
epochs,1000.0,1000.0,1000.0,1000.0,10000.0,10000.0,10000.0,10000.0,100000.0,100000.0,100000.0,100000.0,1000000.0,1000000.0,1000000.0,1000000.0
hidden units,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0
train_error,0.020846,0.020371,0.012607,0.020393,0.018232,0.012038,0.008561,0.009684,0.017248,0.009807,0.008554,0.005275,0.017128,0.009162,0.008962,0.008278
test_error,0.035976,0.033296,0.019646,0.031125,0.040651,0.030903,0.039211,0.027347,0.04023,0.036524,0.046355,0.025387,0.040174,0.073451,0.0488,0.037489


In [101]:
results5

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
epochs,1000.0,1000.0,1000.0,1000.0,10000.0,10000.0,10000.0,10000.0,100000.0,100000.0,100000.0,100000.0,1000000.0,1000000.0,1000000.0,1000000.0
hidden units,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0,2.0,3.0,4.0,5.0
train_error,0.02079,0.020523,0.021659,0.020244,0.020394,0.020078,0.014162,0.015391,0.021239,0.020149,0.020396,0.017172,0.021239,0.019867,0.022054,0.016973
test_error,0.036388,0.037338,0.038756,0.047729,0.037215,0.036336,0.093349,0.067977,0.03746,0.037888,0.159024,0.07839,0.03746,0.037601,0.157637,0.081951


In [77]:
def mySoftMax(X):
    return np.exp(X)/(np.sum(np.exp(X), axis=1, keepdims=1))

In [264]:
class MLP:
    def __init__(self, num_inputs, num_outputs,
                 hidden_units=3, epochs=100, learning_rate=.1,
                 random_state=None, loss='log',
                 first_activation='sig', second_activation='sig'):
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.loss_type = loss
        self.a_func_1 = first_activation
        self.a_func_2 = second_activation
        
    def checkParameters(self):
        self.num_inputs = self.X.shape[1]
        self.num_outputs = self.y.shape[1]
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError('X and Y have mismatched shapes')
        if self.num_inputs < 1:
            raise ValueError('Must have at least 1 input')
        if self.num_outputs < 1:
            raise ValueError('Must have at least 1 output')
        if self.hidden_units < 1:
            raise ValueError('Must have at least 1 hidden unit')
        if self.epochs < 1:
            raise ValueError('Must train for at least 1 epoch')
        #Is it a max of 1 also??
        if self.learning_rate <=0:
            raise ValueError('Learning rate must be greater than 0')
        if self.loss_type not in ['log', 'mse']:
            raise ValueError('Unknown loss function {}'.format(self.loss_type))
        if self.a_func_1 not in ['sig', 'tanh']:
            raise ValueError('Unknown activation function {}'.format(self.a_func_1))
        if self.a_func_2 not in ['sig', 'tanh']:
            raise ValueError('Unknown activation function {}'.format(self.a_func_2))

        
    def myActivation(self, active):
        if active == 'sig':
            def active(x, deriv=False):
                print('sig')
                sig = (1 / (1 + np.exp(-x)))
                if deriv is True:
                    return sig * (1-sig)
                return sig
        else:
            def active(x, deriv=False):
                print('tanh')
                if deriv is True:
                    return 1 - (x**2)
                return np.tanh(x)
        return active

    def myLoss(self, loss):
        if loss == 'log':
            def loss(deriv=False):
                epsilon = 1e-5
                if deriv is True:
                    return -(self.y/self.output) + ((1-self.y)/(1-self.output))
                return (1/self.num_inputs) * np.sum(-((self.y * np.log(self.output+epsilon)) + (1 - self.y)  * np.log(1 - self.output + epsilon)))
        else:
            def loss(deriv=False):
                if deriv == True:
                    return -2 * (self.y - self.output)
                return (np.square(self.y - self.output).mean())
        return loss
        
    def mLoss(self, deriv=False):
        #epsilon is to avoid divide by zero errors
        epsilon = 1e-5
    
        if deriv is True:
            return -(self.y/self.output) + ((1-self.y)/(1-self.output))
        return (1/self.num_inputs) * np.sum(-((self.y * np.log(self.output+epsilon)) + (1 - self.y)  * np.log(1 - self.output + epsilon)))
    

    def forward(self):
        self.z1 = self.X@self.w1 + self.b1
        self.a1 = self.a_func_1(self.z1)
        self.z2 = self.a1@self.w2 + self.b2
        self.output = self.a_func_2(self.z2)
    
    
    def backward(self):
        self.loss = self.myLossFunction()
        #added the times F'() see results four. Without it resutlts1
      #  d_output = (self.output - self.y) * (1-(self.output**2))
        d_output = self.MSE(deriv=True) * self.a_func_2(self.output, deriv=True)
        d_w2 = (1/self.num_inputs) * (self.a1.T @ d_output)
        d_b2 = (1/self.num_inputs) * (np.sum(d_output, axis = 0, keepdims=True))
        d_z1 = (d_output@self.w2.T) * self.a_func_1(self.a1, deriv=True)
        d_w1 = (1/self.num_inputs) * (self.X.T@d_z1)
        d_b1 = (1/self.num_inputs) * (np.sum(d_z1, axis=0, keepdims=True))

        self.w1 -= (self.learning_rate * d_w1)
        self.w2 -= (self.learning_rate * d_w2)
        self.b1 -= (self.learning_rate * d_b1)
        self.b2 -= (self.learning_rate * d_b2)

    def fit(self, x, y):
        self.X = x
        self.y = y
        self.checkParameters()
        self.myLossFunction = self.myLoss(self.loss_type)
        self.a_func_1 = self.myActivation(self.a_func_1)
        self.a_func_2 = self.myActivation(self.a_func_2)
        if self.random_state is not None:
            np.random.seed(self.random_state)

#         Xavier initialization to train model to converge faster
        self.w1 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.num_inputs+self.hidden_units)),
                                size = (self.num_inputs, self.hidden_units))
        self.w2 = np.random.normal(loc=0.0,
                                scale=np.sqrt(2/(self.hidden_units+self.num_outputs)),
                                size = (self.hidden_units, self.num_outputs))
        self.b1 = np.zeros((1,self.hidden_units))
        self.b2 = np.zeros((1,self.num_outputs))
        for i in range (self.epochs):
            self.forward()
        #    print("Epoch {} Loss is {}".format(i+1, self.loss))
            self.backward()

    def predict(self, x):
        self.X = x
        self.forward()
        return self.output

In [271]:
test = MLP(X_train.shape[0], y_train.shape[0], 3, 1, .1, loss='mse', 
           first_activation = 'tanh', second_activation='tanh')

In [272]:
test.fit(X_train, y_train)

tanh
tanh
tanh
tanh


In [273]:
test.predict(X_test)

tanh
tanh


array([[ 0.6528469 ],
       [-0.55568389],
       [-0.35485113],
       [ 0.38638498],
       [ 0.69233975],
       [ 0.70093031],
       [ 0.56821487],
       [ 0.41246322],
       [ 0.30726114],
       [-0.94058008],
       [-0.03027645],
       [ 0.83782261],
       [-0.23349321],
       [ 0.72257247],
       [ 0.71763994],
       [-0.18329858],
       [ 0.06743704],
       [-0.8959221 ],
       [-0.77839625],
       [ 0.11023348],
       [ 0.3407258 ],
       [-0.76412524],
       [-0.92066867],
       [-0.72654014],
       [-0.48625013]])