# Micro neural net

Having built out the machinery for back-propagation (see backprop.ipynb), we are going to use it to build a single neuron, a layer of neurons, and a multi-layer perceptron (MLP), and then train the MLP with gradient descent.

In [None]:
%pip install numpy 
%pip install matplotlib
%pip install sympy
%pip install graphviz
%pip install torch
# Note: the graphviz Python package also needs the Graphviz system binaries, e.g. `brew install graphviz` on macOS or the equivalent for your OS

In [4]:
import math  # needed by tanh() and exp(); imported here so this cell does not rely on a later cell


class Value:
    """A scalar that remembers how it was computed so gradients can be back-propagated.

    Every arithmetic operation returns a new ``Value`` whose ``_prev`` holds the
    operand nodes and whose ``_backward`` closure applies the chain rule for that
    one operation. Calling ``backward()`` on the final node walks the graph in
    reverse topological order and fills in ``grad`` on every node.

    Attributes:
        data: the numeric result of the forward pass at this node.
        grad: derivative of the node ``backward()`` was called on with respect to
            this node; 0.0 until ``backward()`` runs.
        label: optional display name (used by the graph drawing helper).
        _prev: set of operand nodes this node was computed from.
        _op: short string naming the operation that produced this node.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self._prev = set(_children)
        self._op = _op
        self.label = label
        self.grad = 0.0
        self._backward = lambda: None  # leaf nodes have nothing to propagate

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):  # self + other
        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # self / other are the operands captured when __add__ was called;
            # out is the node this closure is attached to.
            # Local derivative of an add w.r.t. either operand is 1.0, so by the
            # chain rule each operand receives out.grad unchanged.
            # += because a node may be reached by several paths and the
            # multi-variable chain rule sums their contributions.
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other):  # other + self, e.g. 1 + Value or sum() starting from int 0
        return self + other

    def __neg__(self):  # -self
        return self * -1  # leverage mul to implement negation

    def __sub__(self, other):  # self - other
        return self + (-other)  # leverage add and neg to implement subtraction

    def __rsub__(self, other):  # other - self, where other is a plain number
        return (-self) + other

    def __mul__(self, other):  # self * other
        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Local derivative of a product w.r.t. one operand is the other
            # operand's data; multiply by out.grad for the chain rule.
            # += accumulates over multiple paths to the same node.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):  # other * self, used for scalar * Value
        return self * other

    def __pow__(self, other):  # self ** other
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self, ), f'**{other}')

        def _backward():
            # Power rule: d(x**n)/dx = n * x**(n-1), then chain rule with out.grad.
            self.grad += (other * self.data**(other - 1)) * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other):  # self / other
        return self * other**-1  # division is multiplication by other to the power of -1

    def __rtruediv__(self, other):  # other / self, where other is a plain number
        return (self ** -1) * other

    # see hyperbolic functions: https://en.wikipedia.org/wiki/Hyperbolic_functions
    def tanh(self):
        """Return a new Value holding tanh(self.data)."""
        # math.tanh saturates to +/-1.0 for large |x|; the exponential form
        # (e^{2x} - 1) / (e^{2x} + 1) raises OverflowError once e^{2x} overflows.
        t = math.tanh(self.data)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)**2, and out.data already holds tanh(x).
            self.grad += (1 - out.data**2) * out.grad
        out._backward = _backward

        return out

    def exp(self):
        """Return a new Value holding e**self.data."""
        out = Value(math.exp(self.data), (self, ), 'exp')

        def _backward():
            # d e**x/dx = e**x, which is out.data; chain rule with out.grad.
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Back-propagate from this node, filling in ``grad`` on every node it depends on.

        The ``grad`` of every node reachable through ``_prev`` is first reset to
        0.0, this node's ``grad`` is set to 1.0, and then each node's
        ``_backward`` is run in reverse topological order.
        """
        # Iterative post-order DFS (explicit stack) so graphs deeper than
        # Python's recursion limit can still be processed.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # every operand of node has been emitted, so node can follow them
                stack.pop()
                topo.append(node)

        for node in topo:
            node.grad = 0.0  # drop gradients left over from an earlier backward pass
        self.grad = 1.0  # d(self)/d(self)

        for node in reversed(topo):
            node._backward()

In [5]:
from graphviz import Digraph

def trace(root):
    """Collect every node and edge of the computation graph that ends at ``root``.

    Follows each node's ``_prev`` set recursively.

    Returns:
        (nodes, edges): ``nodes`` is the set of all reachable nodes including
        ``root``; ``edges`` is a set of ``(operand, result)`` pairs.
    """
    seen_nodes = set()
    seen_edges = set()

    def visit(node):
        # A node can be reached along several paths; expand it only once.
        if node in seen_nodes:
            return
        seen_nodes.add(node)
        for operand in node._prev:
            seen_edges.add((operand, node))
            visit(operand)

    visit(root)
    return seen_nodes, seen_edges

def draw_dot(root):
    """Build a left-to-right Graphviz ``Digraph`` (SVG format) of the graph ending at ``root``.

    Each node becomes a record box showing its label, data and grad. A node that
    was produced by an operation additionally gets a small operation node that
    feeds into it; the operands are connected to that operation node.
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})
    all_nodes, all_edges = trace(root)

    for value in all_nodes:
        value_id = str(id(value))
        record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
        graph.node(name=value_id, label=record, shape='record')
        if not value._op:
            continue  # leaf value: no operation node to draw
        op_id = value_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, value_id)

    for operand, result in all_edges:
        # operands point at the operation node of the value they produced
        graph.edge(str(id(operand)), str(id(result)) + result._op)

    return graph



In [None]:
import random
import math

class Neuron:
    """A single tanh unit computing ``tanh(w1*x1 + w2*x2 + ... + b)``."""

    def __init__(self, ninputs):
        # One weight per input, drawn uniformly from [-1, 1]; the bias is drawn after the weights.
        weights = []
        for _ in range(ninputs):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # Forward pass: start from the bias and add each w_i * x_i in turn.
        # zip pairs weights with inputs and stops at the shorter of the two.
        activation = self.b
        for weight, value in zip(self.w, x):
            activation = activation + weight * value
        # squash the weighted sum with the non-linear activation function
        return activation.tanh()

    def parameters(self):
        # For weights [w1, w2, w3] and bias b this returns [w1, w2, w3, b].
        return [*self.w, self.b]

class Layer:
    """A group of neurons that all receive the same input vector."""

    def __init__(self, nin, nneurons):
        # nin: number of inputs to each neuron; nneurons: how many neurons to create
        print(f'Layer init nin: {nin} nneurons: {nneurons}')
        neurons = []
        for _ in range(nneurons):
            neurons.append(Neuron(nin))
        self.neurons = neurons

    def __call__(self, x):
        # Evaluate every neuron on the same input x.
        outputs = [neuron(x) for neuron in self.neurons]
        # A one-neuron layer returns its single output directly; otherwise the list of outputs.
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    def parameters(self):
        # Flat list of every neuron's parameters in neuron order,
        # e.g. [w1, w2, w3, b, w1, w2, w3, b, ...].
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected

class MLP:
    """Multi-layer perceptron: a chain of layers, each fed the previous layer's output."""

    def __init__(self, nin, nlayers):
        # nin: number of inputs to the first layer
        # nlayers: list with the number of neurons in each layer, e.g. [4, 4, 1] for 3 layers
        # sz holds the input count followed by every layer width,
        # e.g. nin = 3, nlayers = [4, 4, 1]  =>  sz = [3, 4, 4, 1]
        sz = [nin] + nlayers

        print(f'sz: {sz}')
        # Consecutive pairs of sz give (inputs, neurons) for each layer:
        # [3, 4, 4, 1] -> Layer(3, 4), Layer(4, 4), Layer(4, 1)
        self.layers = [Layer(fan_in, width) for fan_in, width in zip(sz, sz[1:])]

    def __call__(self, x):
        # Thread the input through the layers in order.
        signal = x
        for layer in self.layers:
            signal = layer(signal)
        return signal

    def parameters(self):
        # Flat list of every parameter of every layer, in layer order.
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

# Smoke test: a single neuron with 2 inputs evaluated on a 2-element input.
# Weights and bias are random, so the printed value changes from run to run.
n = Neuron(2)
x = [2.0, 3.0]
# n(x) dispatches to Neuron.__call__ and returns a single Value
print(f'n: {n(x)}')

l = Layer(3, 3) # layer of 3 neurons, each taking 3 inputs

# Evaluate the layer on a 3-element input; with more than one neuron the result is a list of Values
x = [2.0, 3.0, -1.0]
print(f'l: {l(x)}')




n: Value(data=-0.968347985472863)
Layer init nin: 3 nneurons: 3
l: [Value(data=-0.997953929202848), Value(data=0.9623763831840364), Value(data=0.9283706075764311)]


In [522]:
### Multi Layer Perceptron initialisation
# Build a 3-input network and run one forward pass on a sample input.
x = [2.0, 3.0, -1.0]
mlp1 = MLP(3, [4,4,1]) # 3 inputs, 3 layers: l1 = 4 neurons, l2 = 4 neurons, l3 = 1 neuron

# The last layer has a single neuron, so the network output is one Value rather than a list
print(f'mlp1: {mlp1(x)}')

sz: [3, 4, 4, 1]
Layer init nin: 3 nneurons: 4
Layer init nin: 4 nneurons: 4
Layer init nin: 4 nneurons: 1
mlp1: Value(data=0.5067864803165081)


In [511]:
# Training set for a binary classification problem: 4 samples with 3 features each
xs = [
[2.0, 3.0, -1.0],
[3.0, -1.0, 0.5], 
[0.5, 1.0, 1.0], 
[1.0, 1.0, -1.0]
]

ys = [1.0, -1.0, -1.0, 1.0] # desired target for each sample in xs, in the same order

In [399]:
# Forward pass: run the network on each of the 4 training inputs
ypred = [mlp1(x) for x in xs] # one predicted Value per sample
ypred # bare expression in the middle of a cell: evaluated but not displayed

# Loss over the 4 samples: the sum of squared errors, (prediction - target)**2 per sample.
# Squaring discards the sign of each error; abs() would be an alternative.
# Note this is a plain sum, not a mean (MSE) or root-mean-square (RMS).
losseach = [(yout - ygt)**2 for ygt, yout in zip(ys, ypred)]
loss = sum(losseach, Value(0))  # start from Value(0) so every addition is Value + Value
print(f'loss: {loss}')

# Optional debugging aids, left commented out:
# print(mlp1.layers[0].neurons[0].w[0].data)  # first weight of first neuron of first layer
# draw_dot(loss)

# for p in mlp1.parameters():
#     print(p)

# all the weights and biases in the network: 41 in this case (16 + 20 + 5)
# print('Number of parameters:', len(mlp1.parameters()))

loss: Value(data=0.020385467701199818)


In [None]:
# Back-propagate from the loss to compute the gradient of every parameter
loss.backward()

for p in mlp1.parameters():
    p.data += -0.01 * p.grad  # SGD update with learning rate 0.01; negative because we step down the gradient

ypred

# NOTE: ypred was computed before this weight update, so the values displayed do not yet reflect the step.
#
# The gradient descent process is:
# 1. Forward pass to compute predictions
# 2. Compute the loss
# 3. Backward pass to compute gradients
# 4. Update the weights by a small step (the learning rate) against the gradient
# 5. Repeat steps 1-4 for some number of iterations until the loss is low enough
# As the loss approaches zero, the predicted values approach the ground truth values

[Value(data=0.9546449303477701),
 Value(data=-0.9689588412922081),
 Value(data=-0.9003119042863683),
 Value(data=0.9138192863970791)]

In [None]:
# Automatic training loop: repeat forward pass -> loss -> backward pass -> weight update
N_STEPS = 50          # number of gradient descent iterations
LEARNING_RATE = 0.05  # SGD step size

for k in range(N_STEPS):
    # 1. forward pass over the 4 training inputs
    ypred = [mlp1(x) for x in xs]

    # 2. loss: sum of squared errors, (prediction - target)**2 per sample.
    #    Squaring discards the sign of each error; abs() would be an alternative.
    losseach = [(yout - ygt)**2 for ygt, yout in zip(ys, ypred)]
    loss = sum(losseach, Value(0))  # start from Value(0) so every addition is Value + Value

    # 3. backward pass. Reset the gradients first -- important, and an easy mistake to make,
    #    because _backward accumulates into .grad with +=
    #    https://x.com/karpathy/status/1013244313327681536
    for p in mlp1.parameters():
        p.grad = 0.0
    loss.backward()

    # 4. SGD update; negative because we step down the gradient
    for p in mlp1.parameters():
        p.data += -LEARNING_RATE * p.grad

# ypred and loss come from the forward pass of the final iteration (before its weight update)
print(f'ys: {ys}')
print(f'loss: {loss}')
print(f'ypred: {ypred}')

ys: [1.0, -1.0, -1.0, 1.0]
loss: Value(data=9.811795768034012e-05)
ypred: [Value(data=0.9974346899754655), Value(data=-0.994308073573488), Value(data=-0.9946011179674371), Value(data=0.9945235788591447)]
