### import necessary libraries

In [1]:
import numpy as np
from typing import List, Tuple

### ANN class

In [2]:
class ANN:
    """
    Class representation of an artificial neural network (ANN).
    """
    def __init__(self, layers: List[int]):
        """
        Initialization function to set up the class.
        :param layers: Number of neurons for each layer that should be set up as List of ints.
        """
        self.W = dict()  # holding the weight matrices
        self.b = dict()  # holding the bias values
        self.z = dict()  # holding the intermediate values
        self.a = dict()  # holding the activation values
        
        self.dW = dict()  # holding the gradient of the weight matrices
        self.db = dict()  # holding the gradient of the bias values
        
        self.layers = layers  # User defined layers

        self._construct()
        
    def _construct(self):
        """
        Construct the internal shape of the ANN.
        """
        for idx, layer in enumerate(self.layers[:-1]):
            self.W[idx] = np.random.randn(layer, self.layers[idx+1])
            self.b[idx] = np.zeros(self.layers[idx+1])
            
    def loss(self, y: np.ndarray, gradient: bool = False) -> np.ndarray:
        """
        Compute the cross entropy loss for the given hypothesis (h) in contrast to the true results (y).
        If the gradient flag is True, the derivative of said loss function will be returned.
        :param y: True output data.
        :param gradient: Bool flag to indicate if gradient should be returned.
        :return: Cost/Loss of the current hypothesis.
        """
        h = self.a[len(self.layers)-2]  # our prediction / hypothesis
        
        if gradient:
            return -(y // h) + ((1 - y) // (1 - h))
        return -(1/y.size) * ((y.T @ np.log(h)) + ((1 - y.T) @ np.log(1 - h)))
    
    def activation(self, X: np.ndarray, gradient: bool = False) -> np.ndarray:
        """
        For the activation function we use the sigmoid.
        It will return 0 for every x << 0 and 1 for every x >> 0.
        Return the gradient of the sigmoid if a True gradient flag is given.
        :param X: data to transform via sigmoid function:
        :return: transformed data that lies between 0 and 1.
        """
        sigmoid = 1 / (1 + np.exp(-X))
        if gradient:
            return sigmoid * (1 - sigmoid)
        return sigmoid
    
    def forward(self, X: np.ndarray):
        """
        Successively propagate the input data (X) through the ANN and store all
        intermediate and activation values in their corresponding dictionaries.
        :param X: Input data to make predictions on.
        """
        self.a[-1] = X
        for idx in range(len(self.layers)-1):
            self.z[idx] = self.a[idx-1] @ self.W[idx] + self.b[idx]
            self.a[idx] = self.activation(self.z[idx])

    def backward(self, y: np.ndarray):
        """
        Successively propagate the prediction as well as the true output backwards through the ANN.
        Store the resulting gradients for weights and biases in their corresponding dictionaries.
        :param y: True output data.
        """
        da = self.loss(y=y, gradient=True)  # get gradient of last activation value
        
        for idx in range(len(self.layers)-2, -1, -1):  # loop from the last layer to zero (effectively)
            dz = da * self.activation(X=self.z[idx], gradient=True)
            da = dz @ self.W[idx].T  

            self.db[idx] = np.mean(dz, axis=0)  # get gradient of bias. use mean to pay respect to sample size.
            self.dW[idx] = (self.a[idx-1].T @ dz) / y.size  # get gradient of weights. divide by number of samples.     
            
    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """
        Predict the output of the given data (X).
        :param X: Data to make prediction on.
        :param threshold: Threshold that decides if predicted value belongs to class 0 or 1.
        :return: Predicted value.
        """
        self.forward(X)
        return self.a[len(self.layers)-2] > threshold

    def update(self, learning_rate: float):
        """
        Update the current weights and biases by multiplying the learning rate with the previously computed gradients.
        :param learning_rate: The step size of the gradient applied to update the weights and biases (e.g. to learn).
        """
        for idx in range(len(self.layers)-1):
            self.W[idx] -= learning_rate * self.dW[idx]
            self.b[idx] -= learning_rate * self.db[idx]        
        
    def fit(self, X: np.ndarray, y: np.ndarray, learning_rate: float = 1, epochs: int = 1000):
        """
        Run the training procedure on the given data for the given epochs.
        This essentially fits the network to the given data.
        Print current loss value every epoch.
        :param X: Input data.
        :param y: True output data.
        :param learning_rate: The step size of the gradient applied to update the weights and biases (e.g. to learn).
        :param epochs: Number of training cycles to perform.
        """
        for epoch in range(epochs):
            self.forward(X=X)
            loss = self.loss(y=y)
            self.backward(y=y)
            self.update(learning_rate)
            
            print(f"({epoch+1}/{epochs}): {loss.item()}")

    def accuracy(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Perform predictions on all the given data (X) and compare these predictions to the ground truth values (y).
        Afterwards get the ratio of correctly to correctly+incorrectly predicted classes. This is the accuracy.
        :param X: Input data.
        :param y: True output data.
        :return: Accuracy as float value.
        """
        return np.sum(self.predict(X) == y) / y.size

### Toy data

In [3]:
def generate_data(N: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Original code credits to Prof. Dr. Stefan Harmeling
    
    Generate a data for training our linear model.
    :param N: number of samples multiplier.
    :return: tuple of x and y data as numpy ndarrays.
    """
    X = np.repeat(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), N, axis=0)
    X = X + np.random.randn(4 * N, 2) * 0.2
    y = np.repeat([0, 1, 1, 0], N)
    y = np.reshape(y, (len(y), 1))

    return X, y

In [4]:
X_train, y_train = generate_data(N=100)
X_test, y_test = generate_data(N=50)

In [5]:
model = ANN([2,3,1])
model.fit(X_train, y_train)

(1/1000): 0.8591724666394527
(2/1000): 0.7431489036622421
(3/1000): 0.7123940913316227
(4/1000): 0.7003342066826492
(5/1000): 0.6999082738452493
(6/1000): 0.6995076152891736
(7/1000): 0.6992151777057555
(8/1000): 0.6988915541763194
(9/1000): 0.6986181777223678
(10/1000): 0.6983255608155797
(11/1000): 0.6980625803457372
(12/1000): 0.6977980761357927
(13/1000): 0.6975479878438664
(14/1000): 0.6973069719555357
(15/1000): 0.6970743345951346
(16/1000): 0.6968497742559763
(17/1000): 0.6966326985177039
(18/1000): 0.6964241532928955
(19/1000): 0.6962205355902557
(20/1000): 0.6960267898913515
(21/1000): 0.6958357393237564
(22/1000): 0.6956544913573103
(23/1000): 0.69547548161496
(24/1000): 0.6953065818114104
(25/1000): 0.6951382186872206
(26/1000): 0.6949788198359519
(27/1000): 0.6948205814247428
(28/1000): 0.6946711531891834
(29/1000): 0.6945231372170991
(30/1000): 0.6943794077147394
(31/1000): 0.6942414356226668
(32/1000): 0.6941015209336812
(33/1000): 0.694006160407915
(34/1000): 0.694165827

(334/1000): 0.6250015299294388
(335/1000): 0.6244620410574757
(336/1000): 0.6236918932044665
(337/1000): 0.62318142462945
(338/1000): 0.6223916790172471
(339/1000): 0.6218202558467533
(340/1000): 0.6210153515977702
(341/1000): 0.6205656450119953
(342/1000): 0.6197355427039215
(343/1000): 0.6191311826492969
(344/1000): 0.6182887247610465
(345/1000): 0.6178554927135915
(346/1000): 0.6169854598800754
(347/1000): 0.6163469707691458
(348/1000): 0.6155127534204714
(349/1000): 0.6149997152535766
(350/1000): 0.6140954664497962
(351/1000): 0.6135660731872444
(352/1000): 0.612642567704625
(353/1000): 0.6120960161443306
(354/1000): 0.6111549272362004
(355/1000): 0.6106398639955437
(356/1000): 0.6096777608092728
(357/1000): 0.6091878405947802
(358/1000): 0.6082019325387549
(359/1000): 0.6076888842760432
(360/1000): 0.6066820130204941
(361/1000): 0.6061959778696938
(362/1000): 0.6051651861468578
(363/1000): 0.6046626355188693
(364/1000): 0.6036103175070006
(365/1000): 0.6030899941438383
(366/1000):

(711/1000): 0.20912024824728143
(712/1000): 0.2084762629550406
(713/1000): 0.20783695905564728
(714/1000): 0.20721100319944363
(715/1000): 0.20658747211646108
(716/1000): 0.2059669294345246
(717/1000): 0.20534080884381772
(718/1000): 0.20472199643198927
(719/1000): 0.2041095054394589
(720/1000): 0.20350268937550875
(721/1000): 0.2029011123307391
(722/1000): 0.20230447216883143
(723/1000): 0.20171255366192448
(724/1000): 0.20112519910925042
(725/1000): 0.2005422894781763
(726/1000): 0.19996373205996754
(727/1000): 0.19939780136599503
(728/1000): 0.1988257485107259
(729/1000): 0.19826708985979394
(730/1000): 0.1977023208022387
(731/1000): 0.19715130892656674
(732/1000): 0.19660318755282036
(733/1000): 0.19605823656968294
(734/1000): 0.19550722074305127
(735/1000): 0.19498420389148247
(736/1000): 0.19446170349837183
(737/1000): 0.19394031630100084
(738/1000): 0.19342063016801433
(739/1000): 0.1929031359332389
(740/1000): 0.19238821170736786
(741/1000): 0.19187613460632608
(742/1000): 0.19

In [6]:
model.accuracy(X_train, y_train)

0.965

In [7]:
model.accuracy(X_test, y_test)

0.93