In [1]:
import numpy as np

In [2]:
def linear(x, weights, bias):
    return np.dot(x, weights) + bias

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def to_onehot(y, num_classes):
    y_onehot = np.zeros((y.shape[0], num_classes))
    y_onehot[np.arange(y.size), y] = 1
    return y_onehot


In [83]:
# Let's implement a 1-hidden layer multi-layer perceptron (MLP)

class MLP():
    def __init__(self, num_features, num_hidden, num_classes):
        self.num_features = num_features
        self.num_hidden = num_hidden
        self.num_classes = num_classes
        
        # if you want to extend the number of hidden layers, you can simply repeat the following two lines
        self.weights_hidden = np.zeros((num_hidden, num_features), dtype = float)
        self.bias_hidden = np.zeros(num_hidden, dtype= float)

        self.weights_output = np.zeros((num_classes, num_hidden), dtype= float)
        self.bias_output = np.zeros((num_classes), dtype= float)
        
    def predict(self, x):
        z1 = linear(x, np.transpose(self.weights_hidden), self.bias_hidden)
        a1 = sigmoid(z1)

        z2 = linear(a1, np.transpose(self.weights_output), self.bias_output)
        a2 = sigmoid(z2)

        return a1, a2

    def calc_mse(self, x, y):
        _, yhat = self.predict(x)
        error = np.power(yhat - y, 2).mean()

    def backward(self, x, a1, a2, y):
        
        # Output layer gradient
        # calculate partial derivatives for the output layer's weights
        dloss_da2 = 2.0 * (a2 - y) / y.shape[0]
        da2_dz2 = a2 * (1. - a2) # sigmoid derivative
        dloss_dz2 = dloss_da2 * da2_dz2
        dloss_dw_output = np.dot(np.transpose(dloss_dz2), a1)
        dloss_db_output = np.sum(dloss_dz2)

        # calculate partial derivatives with respect to hidden layer's weights
        dz2_da1 = self.weights_output
        dloss_da1 = np.dot(dloss_dz2, dz2_da1)
        da1_dz1 = a1 * (1. - a1) # sigmoid derivative
        dz1_dw_hidden = x
        dloss_dz1 = dloss_da1 * da1_dz1
        
        dloss_dw_hidden = np.dot(np.transpose(dloss_dz1), dz1_dw_hidden)
        dloss_db_hidden = np.sum(dloss_dz1)

        return dloss_dw_output, dloss_db_output, dloss_dw_hidden, dloss_db_hidden
    
    def train(self, x, y, batch_size, epochs, lr = 0.1):
        
        minibatch_error = []
        epoch_error = [] 

        for e in range(epochs):
            for batch_idx in range(batch_size + 1):

                # Extract minibatches
                X_mini = x[batch_idx * batch_size:(batch_idx + 1) * batch_size, :]
                y_mini = y[batch_idx * batch_size:(batch_idx + 1) * batch_size, :]
                
                # Compute outputs
                a1, a2 = self.predict(X_mini)

                # Calculate gradients
                dloss_dw_output, dloss_db_output, dloss_dw_hidden, dloss_db_hidden = self.backward(X_mini, a1, a2, y_mini)

                # Update weights
                self.weights_hidden -= lr * dloss_dw_hidden
                self.bias_hidden -= lr * dloss_db_hidden
                self.weights_output -= lr * dloss_dw_output
                self.bias_output -= lr * dloss_db_output

                minibatch_error.append(self.calc_mse(X_mini, y_mini))

            epoch_error.append(self.calc_mse(x, y))

        return minibatch_error, epoch_error

In [94]:
# Load the dataset
data = np.genfromtxt('xor.csv', delimiter=',')
X, y = data[:, :2], data[:, -1]
y = y.reshape(-1, 1)
y = y.astype(int)

# Shuffling & train/test split
shuffle_idx = np.arange(y.shape[0])
shuffle_rng = np.random.RandomState(123)
shuffle_rng.shuffle(shuffle_idx)
X, y = X[shuffle_idx], y[shuffle_idx]

In [95]:
print(y.shape)

(751, 1)


In [96]:
mlp = MLP(num_features=2, num_hidden=50, num_classes=2)

In [97]:
RANDOM_SEED = 1
BATCH_SIZE = 10
NUM_EPOCHS = 5
LEARNING_RATE = 0.05

mlp.train(X, y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

([None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None, None, None, None, None])

In [98]:
y

array([[                   1],
       [                   0],
       [                   0],
       [                   0],
       [                   0],
       [                   1],
       [                   1],
       [                   1],
       [                   1],
       [                   0],
       [                   1],
       [                   0],
       [                   1],
       [                   1],
       [                   0],
       [                   1],
       [                   1],
       [                   0],
       [                   0],
       [                   1],
       [                   0],
       [                   1],
       [                   0],
       [                   1],
       [                   1],
       [                   0],
       [                   0],
       [                   1],
       [                   0],
       [                   0],
       [                   0],
       [                   0],
       [