In [1]:
from math import exp, log
import math
import random
import csv
from PIL import Image
from sklearn.datasets import make_moons, load_digits

class Value:
    def __init__(self, data, children=()):
        self.data = data
        self.children = set(children)
        self.gradient = 0
        self._backward = lambda:None
        output = None
        
    def __repr__(self):
        return f"({self.data}, gr={self.gradient})"
    
    def __add__(self, other):
        if not isinstance(other, Value):
            other = Value(other)
            
        self.output = Value(self.data + other.data, children=(self, other))
        
        def backward():
            self.gradient += self.output.gradient
            other.gradient += self.output.gradient   
        self.output._backward = backward
        
        return self.output
    
    def __sub__(self, other):
        if not isinstance(other, Value):
            other = Value(other)
        output = Value(self.data - other.data, children=(self, other))
        
        def backward():
            self.gradient += output.gradient
            other.gradient -= output.gradient  
        output._backward = backward
        
        return output
    
    def __neg__(self):
        return self * -1

    def __radd__(self, other):
        return self.__add__(other)
    
    def __mul__(self, other):
        if not isinstance(other, Value):
            other = Value(other)
        output = Value(self.data * other.data, children=(self, other))
        
        def backward():
            self.gradient += output.gradient * other.data
            other.gradient += output.gradient * self.data    
        output._backward = backward
        
        return output
    
    def __rmul__(self, other):
        return self.__mul__(other)
    
    def __pow__(self, power):
        output = Value(self.data ** power, children=(self,))
        
        def backward():
            self.gradient += output.gradient * power * self.data ** (power - 1) 
        output._backward = backward
    
        return output
    
    def __truediv__(self, other):
        return self * other ** -1
    
    def __rtruediv__(self, other):
        return other * self ** -1
    
    def __lt__(self, other):
        return self.data < other.data
    
    def exp(self):
        output = Value(exp(self.data), children=(self,))
        
        def backward():
            self.gradient += output.gradient * output.data
        output._backward = backward
        
        return output
            
    def relu(self):
        output = Value(max(self.data, 0), children=(self,))
        
        def backward():
            self.gradient += output.gradient if self.data > 0 else 0
        output._backward = backward
        
        return output
    
    def tanh(self):
        _tanh = (exp(2 * self.data) - 1) / (exp(2 * self.data) + 1)
        output = Value(_tanh, children=(self,))
        
        def backward():
            self.gradient += output.gradient * (1 - _tanh**2)
        output._backward = backward
        
        return output
    
    def ln(self):
        output = Value(math.log(self.data), children=(self,))
        
        def backward():
            self.gradient += output.gradient / self.data
        output._backward = backward
        
        return output
    
            
    def backward(self):
        topo_order = []
        visited = set()
        def traverse(v):
            if v not in visited:
                visited.add(v)
                for child in v.children:
                    traverse(child)
                topo_order.append(v)
        traverse(self)
        self.gradient = 1
        for node in reversed(topo_order):
            node._backward()


In [2]:
class Linear:
    def __init__(self, inputs, outputs):
        self.weights = []
        self.biases = []
        
        for _ in range(outputs):
            self.weights.append([Value(random.uniform(-1,1)) for _ in range(inputs)])
        self.biases = [Value(random.uniform(-1,1)) for _ in range(outputs)]
        
    def __call__(self, inputs):
        outputs = []
        for i, w in enumerate(self.weights):
            outputs.append(sum([weight * inp for weight, inp in zip(w, inputs)]) + self.biases[i])
        return outputs
    
    def params(self):
        params = []
        return [j for sub in self.weights for j in sub] + self.biases
      
def RelU(activations):
    return [a.relu() for a in activations]

# This works specifically for MNIST
def oneHotMNIST(inputs):
    outputs = []
    for n in inputs:
        l = [Value(0.0)] * 10
        l[n] = Value(1.0)
        outputs.append(l)
        
    return outputs

def RMSE(true, pred):
    return (sum([(a - b) ** 2 for a, b in zip(true, pred)]) / len(true)) ** 0.5

def softmax(inputs):
    max_input = max(inputs)
    doms = [i.exp() for i in inputs]
    denominator = sum(doms)
    return [i / denominator for i in doms]

def argmax(inputs):
    m = -10000
    out = -1
    for i, v in enumerate(inputs):
        if v.data >= m:
            m = v.data
            out = i
    return out

def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15  # to prevent division by zero or logarithm of zero
    log_preds = [(p + epsilon).ln() for p in y_pred]
    return -sum([y * lp for y, lp in zip(y_true, log_preds)]) / len(y_true)

In [3]:
X, y = load_digits(return_X_y=True)

X_train = []
y_train = []
for x in X:
    X_train.append([Value(p/16) for p in x])
    
for n in y:
    l = [Value(0.0)] * 10
    l[n] = Value(1.0)
    y_train.append(l)    

In [4]:
class Net:
    def __init__(self):
        self.linear1 = Linear(64,32)
        self.linear2 = Linear(32, 16)
        self.linear3 = Linear(16,10)
        
    def __call__(self, inputs):
        x = self.linear1(inputs)
        x = RelU(x)
        x = self.linear2(x)
        x = RelU(x)
        x = self.linear3(x)
        return x
    
    def params(self):
        return self.linear1.params() + self.linear2.params() + self.linear3.params()
    
net = Net()
LR = 0.1

batch_size = 20

for epoch in range(100):
    total_loss = 0.0
    
    preds = [net(X_train[i]) for i in range(batch_size)]
    loss = sum([RMSE(preds[i], y_train[i]) for i in range(batch_size)])
    loss.backward()
    
    for p in net.params():
            p.data -= p.gradient * LR
            p.gradient = 0.0     
    print(loss)
    #LR *= 0.99

(222.96458794635822, gr=1)
(378.69644764409986, gr=1)
(1442.386511451076, gr=1)
(296.86980021076124, gr=1)
(19.98479994601556, gr=1)
(11.909642740361297, gr=1)
(9.058033691954176, gr=1)
(7.0957328729465665, gr=1)
(6.24860828972907, gr=1)
(6.0437816934639494, gr=1)
(6.007287845317801, gr=1)
(6.001205019209765, gr=1)
(6.000199354662186, gr=1)
(6.000033015452486, gr=1)
(6.000005471937808, gr=1)


KeyboardInterrupt: 