# Setup

In [1]:

import pickle
import os
import pandas as pd
import numpy as np
import traceback
from pathlib import Path

# Data aquisition

## Preprocesing

In [2]:

def normalization_n1_p1(x):
    return (x/127.5)-1

def normalization_n0_p1(x):
    return x/255

def one_hot(y, num_classes=1):
    #print(y.shape, y)
    tmp = np.zeros((*y.shape, num_classes), dtype=np.float32)
    tmp[:, y] = 1.0
    return tmp


## Summary

In [3]:

def test_dataset_fn(img, label):
    info_str = """img: shape {}, min {}, max {}, type {};
label: shape {}, min {}, max {}, type {};""".format(img.shape, np.min(img), np.max(img), type(img),
                                           label.shape, np.min(label), np.max(label), type(label))
    print(info_str)

## Aquisition

In [4]:

class ExtendedMNISTDataset(object):
    def __init__(self, root: str = "/kaggle/input/fii-atnn-2025-competition-1", train: bool = True):
        """ExtendedMNISTDataset:
        root  - path root of dataset
        train - read train or test dataset (true - train, false - test)
        preprocessing - image preprocesing function
        Dataset structure:
            list -> (batchsize, tuple -> (np.array -> (image->(784, )), scalar -> (labels->1)))
        """
        # select filename
        if (train):
            file = "extended_mnist_train.pkl"
        else:
            file = "extended_mnist_test.pkl"
        # join root to filename
        filename = os.path.join(root, file)
        # read dataset
        dataset = self.__read(filename)
        self.inputs, self.outputs = self.__split_data(dataset)

    def __read(self, filename):
        # try to open filename
        try:
            f = open(filename, "rb")
            try:
                dataset = pickle.load(f)
            except Exception as e:
                dataset = None
                self.__show_exception(e)
            finally:
                f.close()
        except IOError as e:
            dataset = None
            self.__show_exception(e)
        return dataset

    def __show_exception(self, e) -> None:
        tb = traceback.extract_tb(e.__traceback__)
        last_call = tb[-1]
        print(f"❌ Error in function '{last_call.name}' at line {last_call.lineno}")
        print(f"   File: {last_call.filename}")
        print(f"   Exception: {e}")

    def __split_data(self, dataset):
        inputs  = []
        outputs = []
        for input, ouput in dataset:
            inputs.append(input)
            outputs.append(ouput)
        return np.array(inputs, dtype=np.uint8), np.array(outputs, dtype=np.int32)

    def __len__(self, ) -> int:
        return self.inputs.shape[0]

    def __getitem__(self, i : int):# int|np array
        return self.inputs[i], self.outputs[i]


## Load

In [5]:

class DataLoader(object):
    def __init__(self, dataset, batchsize=1, shuffle=False):
        assert (batchsize > 0), "batchsize should be ghreat than 'zero'"
        assert (isinstance(shuffle, bool)), "shuffle should be 'bool'"
        self.dataset = dataset
        self.batchsize = batchsize
        self.shuffle = shuffle
        self.size = len(self.dataset)
        # shufle
        if (self.shuffle):
            self.permutation = np.random.permutation(self.size)
        else:
            self.permutation = np.arange(self.size, dtype=np.int32)
        self.maps_fn = []

    def __len__(self) -> int:
        return len(self.dataset)//self.batchsize

    def __call__(self):
        try:
            for i in range(0, self.size, self.batchsize):
                pos  = self.permutation[i:i+self.batchsize]
                datas = self.dataset[pos]
                datas = self.__map_fn(datas)
                yield datas
            else:
                if (self.shuffle):
                    self.permutation = np.random.permutation(self.size)
        except Exception as e:
            self.__show_exception(e)

    def take(self, size):
        try:
            for i, data in zip(range(size), self()):
                yield data
            else:
                pass
        except Exception as e:
            self.__show_exception(e)

    def __show_exception(self, e) -> None:
        tb = traceback.extract_tb(e.__traceback__)
        last_call = tb[-1]
        print(f"❌ Error in function '{last_call.name}' at line {last_call.lineno}")
        print(f"   File: {last_call.filename}")
        print(f"   Exception: {e}")

    def map(self, fn):
        self.maps_fn.append(fn)

    def __map_fn(self, data):
        for fn in self.maps_fn:
            data = fn(*data)
        return data

## Test Load

In [6]:
# Pipiline
root = "/home/gheorghe/Desktop/Proiecte/master/CapitoleAvansateDinReteleNeuronale/fii-atnn-2025-competition-1"
train_ds = ExtendedMNISTDataset(root=root, train=True)
test_ds = ExtendedMNISTDataset(root=root, train=False)
# Data loader
train_loader = DataLoader(train_ds, batchsize=2, shuffle=True)
train_loader.map(lambda x, y: (normalization_n1_p1(x), one_hot(y, num_classes=10)))
train_loader.map(lambda x, y: (np.expand_dims(x, axis=-1), np.expand_dims(y, axis=-1)))
#test_loader  = DataLoader(test_ds,  batchsize=20, shuffle=False)
#test_loader.map(lambda x, y: (normalization_n1_p1(x), one_hot(y, num_classes=10)))

In [7]:
for img, label in train_loader.take(10):
    test_dataset_fn(img, label)
    #print(label)

img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <class 'numpy.ndarray'>;
img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <class 'numpy.ndarray'>;
img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <class 'numpy.ndarray'>;
img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <class 'numpy.ndarray'>;
img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <class 'numpy.ndarray'>;
img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <class 'numpy.ndarray'>;
img: shape (2, 784, 1), min -1.0, max 1.0, type <class 'numpy.ndarray'>;
label: shape (2, 10, 1), min 0.0, max 1.0, type <cl

# Metrics

In [8]:

class Acuracy():
    def __init__(self, ):
        pass

    def __call__(self, y_pred, y):
        # y_pred: logits or probabilities (batch_size, num_classes)
        # y_true: true labels (batch_size,)
        preds = np.argmax(y_pred, axis=1)
        y     = np.argmax(y,      axis=1)
        correct = (preds == y).sum()
        total = y.shape[0]
        return correct / total

accuracy = Acuracy()

#  Build Models

In [None]:
"""
network.py
~~~~~~~~~~

A module to implement the stochastic gradient descent learning
algorithm for a feedforward neural network.  Gradients are calculated
using backpropagation.  Note that I have focused on making the code
simple, easily readable, and easily modifiable.  It is not optimized,
and omits many desirable features.
"""

#### Libraries
# Standard library
import random

# Third-party libraries
import numpy as np

class Network(object):

    def __init__(self, sizes):
        """The list ``sizes`` contains the number of neurons in the
        respective layers of the network.  For example, if the list
        was [2, 3, 1] then it would be a three-layer network, with the
        first layer containing 2 neurons, the second layer 3 neurons,
        and the third layer 1 neuron.  The biases and weights for the
        network are initialized randomly, using a Gaussian
        distribution with mean 0, and variance 1.  Note that the first
        layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only
        ever used in computing the outputs from later layers."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        for b, w in zip(self.biases, self.weights):
            w = np.expand_dims(w, axis=0)
            a = np.matmul(w, a)+b
            a = sigmoid(a)
        return a

    def SGD(self, train_ds, epochs, eta=0.001):
        """Train the neural network using mini-batch stochastic
        gradient descent.  The ``training_data`` is a list of tuples
        ``(x, y)`` representing the training inputs and the desired
        outputs.  The other non-optional parameters are
        self-explanatory.  If ``test_data`` is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out.  This is useful for
        tracking progress, but slows things down substantially."""
        for j in range(epochs):
            for i, data in zip(range(len(train_ds)), train_ds()):
                self.update_mini_batch(data, eta)
            acc = self.evaluate(train_ds)
            print("Epoch {} complete, acc {}".format(j, acc))

    def update_mini_batch(self, mini_batch, eta):
        """Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
        is the learning rate."""
        x, y = mini_batch
        nabla_b, nabla_w = self.backprop(x, y)
        self.weights = [w-(eta/x.shape[0])*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/x.shape[0])*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [activation] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            #print("activation {}, w {}, b {}".format(activation.shape, w.shape, b.shape))
            w = np.expand_dims(w, axis=0)
            z = np.matmul(w, activation)+b
            #print("z {}".format(z.shape))
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        #print("y_pred", activations[-1].shape)
        #print("y", y.shape)
        #print("accuracy {}".format(accuracy(activations[-1], y)))
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        #print("delta {}, act {}".format(delta.shape, activations[-2].transpose(0, 2, 1).shape))
        #print("nabla_w[-1] {}".format(nabla_w[-1].shape))
        nabla_b[-1] = np.sum(delta, axis=0)
        tmp_w = np.matmul(delta, activations[-2].transpose(0, 2, 1))
        nabla_w[-1] = np.sum(tmp_w, axis=0)
        #print("nabla_w[-1] {}".format(nabla_w[-1].shape))
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            #
            w = self.weights[-l+1].transpose()
            w = np.expand_dims(w, axis=0)
            #print("delta {}, w {}".format(delta.shape, w.shape))
            delta = np.matmul(w, delta) * sp
            #print("delta {}, z {}, w {}".format(delta.shape, z.shape, self.weights[-l+1].transpose().shape))
            #print("act {}".format(activations[-l-1].transpose().shape))
            nabla_b[-l] = np.sum(delta, axis=0)
            tmp_w = np.matmul(delta, activations[-l-1].transpose(0, 2, 1))
            nabla_w[-l] = np.sum(tmp_w, axis=0)
        return (nabla_b, nabla_w)

    def evaluate(self, test_ds):
        """Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation."""
        test_results = []
        for i, data in zip(range(len(test_ds)), test_ds()):
            x, y = data
            y_pred = self.feedforward(x)
            tmp = accuracy(y_pred, y)
            test_results.append(tmp)
        return np.mean(test_results)

    def cost_derivative(self, output_activations, y):
        """Return the vector of partial derivatives partial C_x
        partial a for the output activations."""
        return (output_activations-y)

#### Miscellaneous functions
def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))

In [None]:
net = Network([784, 100, 10])

In [None]:
net.SGD(train_loader, epochs=5, eta=0.001)

Epoch 0 complete, acc 0.26316666666666666
Epoch 1 complete, acc 0.2680666666666667
Epoch 2 complete, acc 0.2735
Epoch 3 complete, acc 0.28055
Epoch 4 complete, acc 0.2886666666666667


In [None]:
net.evaluate(train_loader)

np.float64(6.666666666666667e-05)

# Build Model by layer

## Build Layers

### Build layer

In [9]:

class Layer(object):
    def __init__(self, name="layer"):
        self.name = name
        self.__parameters = []
        self.__grads = []
        self.__is_derivable = False

    def set_is_derivable(self, bVal):
        self.__is_derivable = bVal

    def is_derivable(self):
        return self.__is_derivable

    def get_grads(self):
        return self.__grads

    def get_grad(self, arg:int):
        return self.__grads[arg]

    def _init_param(self, shape, init_fn):
        if (init_fn is not None):
            x = init_fn(shape)
        else:
            # init like glorot uniform
            lim = np.sqrt(6/np.sum(shape))
            x = np.random.uniform(low=-lim, high=lim, size=shape)
        self.__parameters.append(x)
        if (not self.__is_derivable):
            self.__grads.append(np.zeros(shape, dtype=np.float32))
        else:
            self.__grads = None
        return x

    def backward(self, x):
        raise NameError("Layer {}: The method 'backward' is not implemented".format(self.name))

    def get_prime(self, features):
        return None

    def get_weights(self):
        return None

    def parameters(self):
        return self.__parameters

    def __call__(self, x):
        raise NameError("Layer {}: The method '__call__' is not implemented".format(self.name))


### Build Dense

In [10]:

class Dense(Layer):
    def __init__(self, in_size, out_size, init_fn=None, use_bias=False, init_fn_b=None, **kw):
        super().__init__(**kw)
        self.__use_bias = use_bias
        self.weight = self._init_param((out_size, in_size), init_fn)
        if (self.__use_bias):
            self.bias = self._init_param((out_size, 1), init_fn_b)
        self.set_is_derivable(False)

    def backward(self, delta, features):
        # get batch size
        #batch_size = features.shape[0]
        # calculate weight gradients
        tmp_w  = np.matmul(delta, features.transpose(0, 2, 1))# D*Ft
        grad_w = self.get_grad(0)
        np.sum(tmp_w, axis=0, out=grad_w)# w = D*Ft
        #print("---start ID grad_w {}".format(id(grad_w)))
        #tmp = grad_w.copy()
        #print("----grad_w {}, equal {}".format(grad_w.shape, np.allclose(tmp, grad_w)))
        # calculate bias gradients
        if (self.__use_bias):
            grad_b = self.get_grad(1)
            np.sum(delta, axis=0, out=grad_b)
            #print("----grad_b", grad_b.shape)
        del tmp_w

    def get_prime(self, features):
        return None

    def get_weights(self):
        return self.weight

    def __call__(self, x):
        w = np.expand_dims(self.weight, axis=0)
        x = np.matmul(w, x)
        if (self.__use_bias):
            x += self.bias
        return x


In [None]:
l_dense = Dense(10, 20)

In [None]:
id(l_dense.weight) == id(l_dense.parameters()[0])

True

### Build Relu

In [11]:

class Relu(Layer):
    def __init__(self, min=0, **kw):
        super().__init__(**kw)
        self.__min = min
        self.set_is_derivable(True)

    def backward(self, delta, features):
        self.set_grads(None)

    def get_prime(self, features):
        return (features > 0).astype(np.float32)

    def get_weights(self):
        return None

    def __call__(self, x):
        x = np.maximum(x, self.__min)
        return x


### Build Sigmoid

In [12]:

class Sigmoid(Layer):
    def __init__(self, **kw):
        super().__init__(**kw)
        self.__t = 1.
        self.set_is_derivable(True)

    def backward(self, delta, features):
        pass

    def get_prime(self, features):
        # Suppose softmax over channel dim
        x = self(features)
        x = x*(self.__t-x)
        return x

    def get_weights(self):
        return None

    def __call__(self, x):
        x = 1.0/(1.0+np.exp(-x))
        return x


## Build Model

In [13]:
"""
network.py
~~~~~~~~~~

A module to implement the stochastic gradient descent learning
algorithm for a feedforward neural network.  Gradients are calculated
using backpropagation.  Note that I have focused on making the code
simple, easily readable, and easily modifiable.  It is not optimized,
and omits many desirable features.
"""

#### Libraries
# Standard library
import random

# Third-party libraries
import numpy as np

class NetworkLayers(object):

    def __init__(self, layers, act_layers):
        """The list ``sizes`` contains the number of neurons in the
        respective layers of the network.  For example, if the list
        was [2, 3, 1] then it would be a three-layer network, with the
        first layer containing 2 neurons, the second layer 3 neurons,
        and the third layer 1 neuron.  The biases and weights for the
        network are initialized randomly, using a Gaussian
        distribution with mean 0, and variance 1.  Note that the first
        layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only
        ever used in computing the outputs from later layers."""
        self.layers     = layers
        self.act_layers = act_layers
        self.num_layers = len(self.layers)+1

    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        for layer, act_layer in zip(self.layers, self.act_layers):
            a = layer(a)
            a = act_layer(a)
        return a

    def SGD(self, train_ds, epochs, eta=0.001):
        """Train the neural network using mini-batch stochastic
        gradient descent.  The ``training_data`` is a list of tuples
        ``(x, y)`` representing the training inputs and the desired
        outputs.  The other non-optional parameters are
        self-explanatory.  If ``test_data`` is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out.  This is useful for
        tracking progress, but slows things down substantially."""
        for j in range(epochs):
            for i, data in zip(range(len(train_ds)), train_ds()):
                self.update_mini_batch(data, eta)
            acc = self.evaluate(train_ds)
            print("Epoch {} complete, acc {}".format(j, acc))

    def update_mini_batch(self, mini_batch, eta):
        """Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
        is the learning rate."""
        x, y = mini_batch
        self.backprop(x, y)

        for layer in self.layers:
            l_parameters = layer.parameters()
            l_grads = layer.get_grads()
            if (l_grads is not None):
                for parameters, grads in zip(l_parameters, l_grads):
                    #print("---start ID parameters {}".format(id(parameters)))
                    #print("---start ID grads {}".format(id(grads)))
                    parameters -= (eta/x.shape[0])*grads
                    #print("+++end ID parameters {}".format(id(parameters)))

    def virtual_backward(self, data):
        for x, y in data:
            # feedforward
            activation = x
            activations = [activation] # list to store all the activations, layer by layer
            zs = [] # list to store all the z vectors, layer by layer
            for layer, act_layer in zip(self.layers, self.act_layers):
                z = layer(activation)
                activation = act_layer(z)
                #print("activation {}, w {}, b {}".format(activation.shape, w.shape, b.shape))
                #print("z {}".format(z.shape))
                zs.append(z)
                activations.append(activation)
            # backward pass
            #print("y_pred", activations[-1].shape)
            #print("y", y.shape)
            #print("accuracy {}".format(accuracy(activations[-1], y)))
            graph_execution = []
            layer     = self.layers[-1]
            act_layer = self.act_layers[-1]
            graph_execution.append({"layer":(layer.name, -1)})
            delta = self.cost_derivative(activations[-1], y) * act_layer.get_prime(zs[-1])
            graph_execution[-1].update({"Wt":None})
            graph_execution[-1].update({"prime":(act_layer.name, -1)})
            #print("delta {}, act {}".format(delta.shape, activations[-2].transpose(0, 2, 1).shape))
            layer.backward(delta, activations[-2])
            graph_execution[-1].update({"update_params":(self.act_layers[-2].name, -2)})
            # Note that the variable l in the loop below is used a little
            # differently to the notation in Chapter 2 of the book.  Here,
            # l = 1 means the last layer of neurons, l = 2 is the
            # second-last layer, and so on.  It's a renumbering of the
            # scheme in the book, used here to take advantage of the fact
            # that Python can use negative indices in lists.
            for l in range(2, self.num_layers):
                layer     = self.layers[-l]
                act_layer = self.act_layers[-l]
                z = zs[-l]
                sp = act_layer.get_prime(z)
                #
                w = self.layers[-l+1].get_weights().transpose()
                w = np.expand_dims(w, axis=0)
                #print("delta {}, w {}".format(delta.shape, w.shape))
                delta = np.matmul(w, delta) * sp
                graph_execution[-1].update({"Wt":(self.__layers[-l+1].name, -l+1)})
                graph_execution[-1].update({"prime":(act_layer.name, -l)})
                #print("delta {}, z {}, w {}".format(delta.shape, z.shape, self.weights[-l+1].transpose().shape))
                #print("act {}".format(activations[-l-1].transpose().shape))
                layer.backward(delta, activations[-l-1])
                if (self.num_layers-(l+1) >= 0):
                    graph_execution[-1].update({"update_params":(self.act_layers[-l-1].name, -l-1)})
                else:
                    graph_execution[-1].update({"update_params":("Input", -l-1)})
            


    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        # feedforward
        activation = x
        activations = [activation] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for layer, act_layer in zip(self.layers, self.act_layers):
            z = layer(activation)
            activation = act_layer(z)
            #print("activation {}, w {}, b {}".format(activation.shape, w.shape, b.shape))
            #print("z {}".format(z.shape))
            zs.append(z)
            activations.append(activation)
        # backward pass
        #print("y_pred", activations[-1].shape)
        #print("y", y.shape)
        #print("accuracy {}".format(accuracy(activations[-1], y)))
        layer     = self.layers[-1]
        act_layer = self.act_layers[-1]
        delta = self.cost_derivative(activations[-1], y) * act_layer.get_prime(zs[-1])
        #print("delta {}, act {}".format(delta.shape, activations[-2].transpose(0, 2, 1).shape))
        layer.backward(delta, activations[-2])
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            layer     = self.layers[-l]
            act_layer = self.act_layers[-l]
            z = zs[-l]
            sp = act_layer.get_prime(z)
            #
            w = self.layers[-l+1].get_weights().transpose()
            w = np.expand_dims(w, axis=0)
            #print("delta {}, w {}".format(delta.shape, w.shape))
            delta = np.matmul(w, delta) * sp
            #print("delta {}, z {}, w {}".format(delta.shape, z.shape, self.weights[-l+1].transpose().shape))
            #print("act {}".format(activations[-l-1].transpose().shape))
            layer.backward(delta, activations[-l-1])

    def evaluate(self, test_ds):
        """Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation."""
        test_results = []
        for i, data in zip(range(len(test_ds)), test_ds()):
            x, y = data
            y_pred = self.feedforward(x)
            tmp = accuracy(y_pred, y)
            test_results.append(tmp)
        return np.mean(test_results)

    def cost_derivative(self, output_activations, y):
        """Return the vector of partial derivatives partial C_x
        partial a for the output activations."""
        return (output_activations-y)

#### Miscellaneous functions
def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))

In [14]:

init_fn = lambda shape: np.random.randn(*shape)

In [15]:
init_fn((1, 2))

array([[-0.26707612, -0.20242758]])

In [16]:

layers     = [
    Dense(784, 100, init_fn=init_fn, use_bias=True, init_fn_b=init_fn, name="Dense_h1"),
    Dense(100, 10, init_fn=init_fn, use_bias=True, init_fn_b=init_fn, name="Dense_h2"),
    ]
act_layers = [
    Sigmoid(name="Sigmoid_h1"),
    #Relu(name="Relu_h1"),
    Sigmoid(name="Sigmoid_h2"),
    ]
netLayers = NetworkLayers(layers, act_layers)

In [None]:
netLayers.SGD(train_loader, epochs=20, eta=0.001)

Epoch 0 complete, acc 0.23431666666666667
Epoch 1 complete, acc 0.2582


In [None]:
batch_size = 2
x = np.random.uniform(low=-1, high=1, size=(batch_size, 784, 1))
w = np.random.uniform(low=-1, high=1, size=(1, 100, 784))
b_y = np.matmul(w, x)
y = np.dot(w[0], x[0])

In [None]:
np.allclose(b_y[0], y)

True