# Neurons and Multi-Layer Perceptrons (MLPs)

Using the previously defined `Value` class, we can define a `Neuron` class.

In [726]:
import math
import random

In [727]:
class Value:
    """Class that represents a value that can be differentiated in respect to other values.

    Attributes:
        data: The value itself.
        _prev: A set of values that this value is dependent on.
        _op: The operation that produced this value.
    """

    def __init__(self, data, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0.0 # derivative of the loss function in respect to this value
        self._backward = lambda: None # defines how to propagate the output gradient to the input gradient
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self) -> str:
        return f"Value(data={self.data}, label={self.label})"
    
    def __add__(self, other):
        def _backward():
            # For addition, the local derivative is always 1. So, given the expression a = b + c, it follows from the chain rule 
            # dz/dx = dz/dy*dy/dx that the gradients of b and c are (1.0 * the gradient of a)
            if isinstance(other, Value):
                self.grad += 1.0 * out.grad
                other.grad += 1.0 * out.grad
            else:
                #print(f"__add__ backward for {self} and {other}")
                #print(f"uid(self)={id(self)}, uid(other)={id(other)}")
                self.grad += 1.0 * out.grad

        if isinstance(other, Value):
            out = Value(self.data + other.data, (self, other), '+')
            out._backward = _backward
            return out
        else:
            out = Value(self.data + other, (self,), '+')
            out._backward = _backward
            return out
        
    def __radd__(self, other):
        return self + other
        
    def __neg__(self): #-self
        return self * -1
        
    def __sub__(self, other): #self - other, uses __neg__ and __add__
        return self + (-other)
        
    def __mul__(self, other):
        def _backward():
            # For multiplication, the local derivative is always the other value (because for z=x*y, dz/dx = y). 
            # So, given the expression a = b * c, it follows from the chain rule that the gradient of b
            # is (c * the gradient of a), and the gradient of c is (b * the gradient of a).
            if isinstance(other, Value):
               self.grad += other.data * out.grad
               other.grad += self.data * out.grad
            else:
                #print(f"__mul__ backward for {self} and {other}")
                #print(f"uid(self)={id(self)}, uid(other)={id(other)}")
                self.grad += other * out.grad

        if isinstance(other, Value):
            out = Value(self.data * other.data, (self, other), '*')
            out._backward = _backward
            return out
        else:
            out = Value(self.data * other, (self,), '*')
            out._backward = _backward
            return out
        
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self,), f'**{other}')

        def _backward():
            #print(f"__pow__ backward for {self} and {other}")
            #print(f"uid(self)={id(self)}, uid(other)={id(other)}")
            self.grad += (other * self.data ** (other - 1)) * out.grad
        out._backward = _backward

        return out

    def __rmul__(self, other):
        return self * other
    
    
    def __truediv__(self, other):
        return self * (other ** -1)
        
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')

        def _backward():
            #print(f"__tanh__ backward for {self}")
            #print(f"uid(self)={id(self)}")
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward

        return out
    
    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')

        def _backward():
            #print(f"__exp__ backward for {self}")
            #print(f"uid(self)={id(self)}")
            self.grad += out.data * out.grad # because d/dx e^x = e^x
        out._backward = _backward

        return out
    
    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)
        
        self.grad = 1.0

        for node in reversed(topo):
            #print(f"In Value.backward(): processing node {node} of type {type(node)}")
            node._backward()

Using the `Value` class, we can now define the class `Neuron` to represent a single artificial neoron:

In [728]:
class Neuron:
    """Implementation of a neuron in a neural network."""

    def __init__(self, nin):
        self.w = [Value(random.uniform(-1, 1), label='w') for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1), label='b')

    def __call__(self, x):
        # compute raw activation w * x + b
        act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)
        out = act.tanh()
        return out

    def parameters(self):
        return self.w + [self.b]


We can test the `Neuron` class like this:

In [729]:
x = [2.0, 6.0]
n = Neuron(2)
n(x)

Value(data=-0.48798312459108867, label=)

Now, with the `Neuron` class in place, we can define the class `Layer` to represent a layer of neurons:

In [730]:
class Layer:
    """Implementation of a layer in a neural network."""

    def __init__(self, nin, nout):
        """Initialize a layer of nout neurons, each taking nin inputs."""
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    
    def parameters(self):
        params = []
        for n in self.neurons:
            params.extend(n.parameters())

        return params

Finally, we can define an MLP:

In [731]:
class MLP:
    """Implementation of a multilayer perceptron."""

    def __init__(self, nin, nouts):
        """Initialize a MLP with nouts layers, each taking nin inputs.

        Args:
            nin: Number of inputs to the MLP.
            nouts: List of number of neurons in each layer (including the output layer).
        """
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

And with that we can create an MLP that has 3 inputs, to hidden layers with 4 neurons each, and 1 output:

In [732]:
x = [2.0, 3.0, -1.0]
m = MLP(3, [4, 4, 1])
m(x)
m.parameters()

[Value(data=0.0909697326091925, label=w),
 Value(data=0.6112772310149679, label=w),
 Value(data=-0.4577950619379323, label=w),
 Value(data=-0.5776340692060642, label=b),
 Value(data=-0.815249058822104, label=w),
 Value(data=0.2582770154305767, label=w),
 Value(data=-0.6390090586595913, label=w),
 Value(data=-0.8439652214931448, label=b),
 Value(data=0.810589835498601, label=w),
 Value(data=-0.5494748746516618, label=w),
 Value(data=0.3263442923256723, label=w),
 Value(data=-0.29442523991974445, label=b),
 Value(data=0.6143315480067737, label=w),
 Value(data=0.09891823289707613, label=w),
 Value(data=0.41786176911898165, label=w),
 Value(data=-0.9471203651062756, label=b),
 Value(data=-0.7248146719163313, label=w),
 Value(data=-0.6440168005187248, label=w),
 Value(data=0.006300007781024908, label=w),
 Value(data=-0.7186289410038025, label=w),
 Value(data=0.8821853381567681, label=b),
 Value(data=-0.3949357655976595, label=w),
 Value(data=-0.8590094734603584, label=w),
 Value(data=-0.987

And draw the graph of the MLP:

In [733]:
# Code for visualizing the graph
from graphviz import Digraph

def trace(root):
    # build a set of all nodes and edges in the graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # left to right
    nodes, edges = trace(root)

    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangle node for it ('record')
        dot.node(name = uid, label=f"{n.label} | uid {uid} | data {n.data:.4f} | grad {n.grad:.4f}", shape='record')
        if n._op:
            # if the value was produced by an operation, create an op node for it
            dot.node(name = uid + n._op, label=n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2))+ n2._op)

    return dot

In [734]:
#draw_dot(m(x))

We can now perform backpropagation using micrograd.

In [735]:
# Simple dataset
# Inputs
xs = [
    [2.0, 3.0,  -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]

# Target outputs
ys = [1.0, -1.0, -1.0, 1.0]

# What predictions does the model currently make?
ypred = [m(x) for x in xs]
ypred

[Value(data=-0.05917979865995601, label=),
 Value(data=0.9512411366012725, label=),
 Value(data=0.6216821055349466, label=),
 Value(data=0.279251965353167, label=)]

To train the MLP, we need a single number to measure its performance and this number is called a _loss_. We will define loss in such a way so we can minimize it. Specifically, we will define it as the mean square error loss:

$$\sum_i (y_i - gt_i)^2$$

where $y_i$ is the MLP's actual output for input vector $i$ and $gt_i$ is the ground truth for input vector $i$. For instance, in our case, $gt_0 = 1.0$.

In [736]:
loss = sum((yout - ygt)**2 for yout, ygt in zip(ypred, ys))
loss

Value(data=8.0785343999139, label=)

In [737]:
# let's run backpropagation ...
loss.backward()

# ... and see how the gradients look like
m.layers[0].neurons[0].w[0].grad


0.3668324246648831

In [739]:
# If we would want to, we can plot the propagation graph for the loss function
#draw_dot(loss)
m.layers[0].neurons[0].w[0].data

0.0909697326091925

In [740]:
# Gradient descent: minimize the loss by changing the parameters of the model. Since we want to minimize the loss,
# we move in the opposite direction of the gradient (negative step size).
for p in m.parameters():
    p.data -= 0.01 * p.grad

In [741]:
m.layers[0].neurons[0].w[0].data

0.08730140836254367

In [742]:
# now recalculate the loss
ypred = [m(x) for x in xs]
loss = sum((yout - ygt)**2 for yout, ygt in zip(ypred, ys))
loss

Value(data=7.51487317615622, label=)

In [758]:
# do another step of gradient descent
loss.backward()
for p in m.parameters():
    p.data -= 0.01 * p.grad # update the parameters (gradients)

# and recalculate the loss
ypred = [m(x) for x in xs]
loss = sum((yout - ygt)**2 for yout, ygt in zip(ypred, ys))
loss

Value(data=0.004208074840806357, label=)

In [759]:
# What predictions does the model make now, after several rounds of gradient descent?
# Remember that the target outputs are 1.0, -1.0, -1.0, 1.0
ypred

[Value(data=0.9792078666872167, label=),
 Value(data=-0.9953641050390847, label=),
 Value(data=-0.9801373892305607, label=),
 Value(data=0.9420366736251022, label=)]

So, what we are doing in the _gradient descent_ is:
1. Forward pass
2. Backward pass
3. Update the weights

And we repeat this loop several times until we are happy with the value of the loss function. To demonstrate this, we'll have to reset the MLP.

In [767]:
m = MLP(3, [4, 4, 1])

# Inputs to the MLP
xs = [
    [2.0, 3.0,  -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]

# Desired outputs (target outputs)
ys = [1.0, -1.0, -1.0, 1.0]

# Initial predictions of the MLP
ypred = [m(x) for x in xs]
print(f"[*] Initial predictions: {ypred}")

learning_rate = 0.05

for k in range(100):
    # forward pass
    ypred = [m(x) for x in xs]
    loss = sum((yout - ygt)**2 for yout, ygt in zip(ypred, ys))

    # backward pass (and don't forget to reset the gradients!)
    # note to myself: if the problem is very simple, without resetting the gradients, the gradients will accumulate,
    # but the descent will be faster...
    for p in m.parameters():
        p.grad = 0.0
    loss.backward()

    # update parameters
    for p in m.parameters():
        p.data -= learning_rate * p.grad # update the parameters (gradients)

    print(k, loss.data)

# What predictions does the model make now, after k rounds of gradient descent?
# Remember that the target outputs are 1.0, -1.0, -1.0, 1.0
ypred = [m(x) for x in xs]
print(f"[*] Final predictions: {ypred}")

[*] Initial predictions: [Value(data=-0.19187537774717942, label=), Value(data=-0.08719664114242351, label=), Value(data=0.5661226649603694, label=), Value(data=-0.17506707327258142, label=)]
0 6.0872997164136144
1 2.8896979243934635
2 1.2337072293917832
3 0.45635016505112147
4 0.25795472336832787
5 0.1781367385667548
6 0.1354958069762084
7 0.10905133426223233
8 0.09107949394578493
9 0.07808638926105817
10 0.06826411786855195
11 0.060584153022008355
12 0.05441857150716246
13 0.04936248957433281
14 0.04514320396201794
15 0.04157037899468581
16 0.03850716883539993
17 0.03585268548460085
18 0.03353093348953619
19 0.031483590824624944
20 0.02966516231873553
21 0.028039643608578295
22 0.026578173773429702
23 0.025257351088858324
24 0.02405800330991972
25 0.022964275628664887
26 0.021962944582417574
27 0.021042895244345074
28 0.020194718125998955
29 0.01941039501423194
30 0.018683051682985972
31 0.018006761456314583
32 0.017376387839038054
33 0.01678745745016259
34 0.016236056670271157
35 0.