In [108]:
import math

class Scalar:

    def __init__(self, value, op=None, children=(), backward=lambda g: (0,), label=''):
        self.value = value
        self.label = label
        self.op = op
        self.children = children
        self.backward = backward

    def tanh(self):
        return Scalar(math.tanh(self.value), 'tanh', [self], lambda g: ((1 - self.value) ** 2 * g,))

    def __neg__(self):
        return Scalar(-self.value, '-', [self], lambda g: (-g,))

    def __add__(self, other):
        other = wrap(other)
        return Scalar(self.value + other.value, '+', [self, other], lambda g: (g, g))

    def __radd__(self, other):
        other = wrap(other)
        return other + self

    def __sub__(self, other):
        other = wrap(other)
        return self + (-other)

    def __rsub__(self, other):
        other = wrap(other)
        return other + (-self)

    def __mul__(self, other):
        other = wrap(other)
        return Scalar(self.value * other.value, '*', [self, other], lambda g: (g*other.value, g*self.value))

    def __rmul__(self, other):
        other = wrap(other)
        return other * self

    def __pow__(self, power, modulo=None):
        assert isinstance(power, int | float)
        return Scalar(self.value ** power, '**', [self], lambda g: (power * self.value ** (power-1) * g,))

    def __truediv__(self, other):
        return self * (other ** -1)

    def __rtruediv__(self, other):
        other = wrap(other)
        return other / self

    def __repr__(self):
        return f'Scalar({self.label}={self.value})' if self.label != '' else f'Scalar({self.value})'

def wrap(other):
    return other if isinstance(other, Scalar) else Scalar(other)

def grad(root, params):
    grads = {root: 1}  # Gradient of self with respect to self is 1
    stack = [root]  # Stack of nodes to visit
    while stack:
        node = stack.pop()
        if node.children:
            g = grads[node]
            child_grads = node.backward(g)
            for child, child_grad in zip(node.children, child_grads):
                if child not in grads:
                    grads[child] = 0
                    stack.append(child)
                grads[child] += child_grad
    return [grads[p] for p in params]


In [109]:
a = Scalar(10, label='a')
b = Scalar(2, label='b')
c = Scalar(-3, label='c')

xi = a * b - 2 * c ** 2

gs = grad(xi, [a, b, c])
gs


[2, 10, 12]

In [110]:
import random

class Neuron:

    def __init__(self, dimensions):
        self.dimensions = dimensions
        self.weights = [Scalar(random.uniform(-1, 1), label="weight") for d in range(dimensions)]
        self.bias = Scalar(random.uniform(-1, 1), label="bias")

    def __call__(self, xs):
        assert len(xs) == self.dimensions
        act = sum([w * x for (x, w) in zip(xs, self.weights)], self.bias)
        return act.tanh()

    def parameters(self):
        return self.weights + [self.bias]

class Layer:

    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs
        self.neurons = [Neuron(inputs) for o in range(outputs)]

    def __call__(self, xs):
        assert len(xs) == self.inputs
        return [n(xs) for n in self.neurons]

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]

class MLP:

    def __init__(self, inputs, *layers):
        self.inputs = inputs
        self.layers = [Layer(inputs if index == 0 else layers[index - 1], outputs) for index, outputs in enumerate(layers)]

    def __call__(self, xs):
        assert len(xs) == self.inputs
        x = xs
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]


In [139]:
mlp = MLP(2, 4, 4, 3)

x1 = [1, 2]
y1 = [-1, 1, 1]
p1 = mlp(x1)
p1


[Scalar(-0.6265775433460031),
 Scalar(-0.6135053732631057),
 Scalar(-0.6076525570850939)]

In [140]:
for step in range(20):
    pred = mlp(x1)
    loss = sum([(p - y) ** 2 for p, y in zip(pred, y1)])
    print(f"Loss: {loss.value}")
    params = mlp.parameters()
    grads = grad(loss, params)
    for g, p in zip(grads, params):
        p.value -= 0.1 * g


Loss: 5.327390664984661
Loss: 1.00686374552574
Loss: 0.15079531280808683
Loss: 0.021648326743653832
Loss: 0.01533283908872473
Loss: 0.013589987918302784
Loss: 0.012491633916745543
Loss: 0.011582818795232704
Loss: 0.010761418611198446
Loss: 0.009995561938093134
Loss: 0.009273146603103397
Loss: 0.008588949354585552
Loss: 0.007940494100702484
Loss: 0.007326495179372101
Loss: 0.006746193353298345
Loss: 0.006199042716400739
Loss: 0.005684550493136439
Loss: 0.00520218992331003
Loss: 0.0047513514833365834
Loss: 0.0043313163189147436


In [141]:
p2 = mlp(x1)
p2

[Scalar(-0.9975888393552859),
 Scalar(0.9798527369131781),
 Scalar(0.9405902530270555)]