In [1]:
import math
import random

In [2]:
class ZeroDimTensor:
    """A scalar that records how it was computed so gradients can be back-propagated.

    Every arithmetic operation returns a new ``ZeroDimTensor`` that remembers
    its operands (``_children``) and a closure (``_backward``) that adds the
    result's gradient, scaled by the local derivative, to those operands.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, val, _children=(), _op="", label=""):
        """Create a node.

        Args:
            val: Numeric value; stored as ``float``.
            _children: Operand nodes this value was computed from.
            _op: Short name of the operation that produced this node.
            label: Optional human-readable name used by ``__repr__``.
        """
        self.val = float(val)
        self.grad = 0.0
        self._op = _op  # '_op' for consistency with PyTorch
        self.label = label
        # A set, so `x * x` stores x once; the closures still update it twice.
        self._children = set(_children)
        # Leaf nodes have nothing to propagate; operations overwrite this on their result.
        self._backward = lambda: None

    def __repr__(self):
        return f"ZeroDimTensor(label='{self.label}', val={self.val}, grad={self.grad}, op='{self._op}')"

    @staticmethod
    def _coerce(value):
        """Return ``value`` unchanged if it is already a tensor, else wrap it as a leaf."""
        return value if isinstance(value, ZeroDimTensor) else ZeroDimTensor(value)

    def __add__(self, other):
        """Return ``self + other``; ``other`` may be a plain number."""
        other = self._coerce(other)
        return_val = ZeroDimTensor(self.val + other.val, (self, other), "+")

        def _backward_pass_add():
            # Gradients accumulate, so every backward pass uses += rather than =.
            self.grad += return_val.grad
            other.grad += return_val.grad

        return_val._backward = _backward_pass_add
        return return_val

    def __radd__(self, other):
        """Return ``other + self`` for a plain-number left operand."""
        return self + other

    def __mul__(self, other):
        """Return ``self * other``; ``other`` may be a plain number."""
        other = self._coerce(other)
        return_val = ZeroDimTensor(self.val * other.val, (self, other), "*")

        def _backward_pass_mul():
            self.grad += other.val * return_val.grad
            other.grad += self.val * return_val.grad

        return_val._backward = _backward_pass_mul
        return return_val

    def __rmul__(self, other):
        """Return ``other * self`` for a plain-number left operand."""
        return self * other

    def __neg__(self):
        """Return ``-self``, implemented as multiplication by -1."""
        return self * -1

    def __sub__(self, other):
        """Return ``self - other``; ``other`` may be a plain number."""
        other = self._coerce(other)
        return_val = ZeroDimTensor(self.val - other.val, (self, other), "-")

        def _backward_pass_sub():
            self.grad += return_val.grad * 1.0
            # The gradient with respect to the subtracted term is negative.
            other.grad += return_val.grad * -1.0

        return_val._backward = _backward_pass_sub
        return return_val

    def __rsub__(self, other):
        """Return ``other - self`` for a plain-number left operand."""
        return self._coerce(other) - self

    def __pow__(self, other):
        """Return ``self ** other`` for a constant ``int``/``float`` exponent.

        Raises:
            AssertionError: If ``other`` is not an ``int`` or ``float``.
        """
        assert isinstance(other, (int, float)), "Power must be a scalar"
        return_val = ZeroDimTensor(self.val**other, (self,), f'**{other}')

        def _backward_pass_pow():
            # d/dx x**0 is 0 everywhere; short-circuit so x == 0 does not
            # evaluate 0.0 ** -1 (ZeroDivisionError).
            local_derivative = 0.0 if other == 0 else other * (self.val**(other - 1))
            self.grad += return_val.grad * local_derivative

        return_val._backward = _backward_pass_pow
        return return_val

    def __truediv__(self, other):
        """Return ``self / other``; ``other`` may be a plain number.

        Raises:
            ZeroDivisionError: If ``other`` has value 0.
        """
        other = self._coerce(other)
        return_val = ZeroDimTensor(self.val / other.val, (self, other), "/")

        # If C = A / B, then dL/dA = dL/dC * (1/B) and dL/dB = dL/dC * (-A/B^2).
        def _backward_pass_div():
            self.grad += return_val.grad * (1.0 / other.val)
            other.grad += return_val.grad * (-self.val / (other.val**2))

        return_val._backward = _backward_pass_div
        return return_val

    def __rtruediv__(self, other):
        """Return ``other / self`` for a plain-number left operand."""
        return self._coerce(other) / self

    def tanh(self):
        """Return ``tanh(self)`` as a new node."""
        return_val = ZeroDimTensor(math.tanh(self.val), (self,), "tanh")

        def _backward_pass_tanh():
            # d/dx tanh(x) = 1 - tanh(x)^2, and return_val.val already holds tanh(x).
            self.grad += (1 - return_val.val**2) * return_val.grad

        return_val._backward = _backward_pass_tanh
        return return_val

    def _topological_order(self):
        """Return every node reachable from ``self``, operands before their results.

        Uses an explicit stack instead of recursion so that long operation
        chains do not hit the interpreter's recursion limit.
        """
        ordering = []
        visited = set()
        # Each entry is (node, children_done). A node is emitted only when its
        # second entry is popped, i.e. after everything pushed above it.
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                ordering.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            stack.append((node, True))
            for child in node._children:
                if child not in visited:
                    stack.append((child, False))
        return ordering

    def backward(self):
        """Back-propagate from this node, filling ``grad`` on every node it depends on.

        All gradients in the graph below ``self`` are first reset to 0.0, so
        repeated calls do not accumulate across calls. ``self.grad`` is then
        seeded with 1.0 and each node's ``_backward`` closure runs in reverse
        topological order.
        """
        ordering = self._topological_order()

        # Zero out all gradients in the graph before starting.
        for node in ordering:
            node.grad = 0.0

        self.grad = 1.0  # Derivative of the root node (loss) w.r.t. itself is 1.

        for node in reversed(ordering):
            node._backward()

In [3]:
# Hand-built single neuron: o = tanh(w1*x1 + w2*x2 + b).
# Leaves are created with keyword labels; intermediate nodes are labelled
# right after they are produced.

# Inputs
x1 = ZeroDimTensor(2.0, label="x1")
x2 = ZeroDimTensor(0.0, label="x2")

# Weights
w1 = ZeroDimTensor(-3.0, label="w1")
w2 = ZeroDimTensor(1.0, label="w2")

# Bias
b = ZeroDimTensor(6.8814, label="b")

# Weighted inputs
w1x1 = w1 * x1
w2x2 = w2 * x2
w1x1.label = "w1x1"
w2x2.label = "w2x2"

# Pre-activation: sum of the weighted inputs, then the bias
w1x1_plus_w2x2 = w1x1 + w2x2
w1x1_plus_w2x2.label = "w1x1_plus_w2x2"
n = w1x1_plus_w2x2 + b
n.label = "n"

# Activation, then back-propagate from the output to fill every .grad
o = n.tanh()
o.label = "o"
o.backward()

In [4]:
class Perceptron:
    """A single neuron: weighted sum of its inputs plus a bias, optionally passed through tanh."""

    def __init__(self, num_inputs, nonlin=True):
        """Draw ``num_inputs`` weights and one bias uniformly from [-1, 1].

        Args:
            num_inputs: Number of weights to create.
            nonlin: When true, ``__call__`` applies tanh to the weighted sum.
        """
        # Weights are labelled w0, w1, ... in creation order.
        self.weights = []
        for i in range(num_inputs):
            self.weights.append(ZeroDimTensor(random.uniform(-1,1), label=f"w{i}"))
        # The bias is drawn after all the weights.
        self.bias = ZeroDimTensor(random.uniform(-1,1), label="b")
        self.nonlin = nonlin

    def parameters(self):
        """Return the weights followed by the bias, as one new list."""
        bias_as_list = [self.bias]
        return self.weights + bias_as_list

    def __call__(self, x_inputs):
        """Compute the neuron's output for ``x_inputs``.

        Args:
            x_inputs: Indexable sequence of numbers or ZeroDimTensor objects;
                element ``k`` is multiplied by weight ``k``.

        Returns:
            The weighted sum plus bias, passed through tanh when ``self.nonlin``
            is truthy.
        """
        # The sum starts from an explicit zero node, so it is always a ZeroDimTensor.
        pre_activation = ZeroDimTensor(0.0, label="raw_act")

        for position in range(len(x_inputs)):
            # Plain floats are accepted here: the tensor's multiplication wraps them.
            weighted_input = x_inputs[position] * self.weights[position]
            pre_activation = pre_activation + weighted_input

        pre_activation = pre_activation + self.bias

        if not self.nonlin:
            return pre_activation
        # A perceptron is ultimately just one mathematical expression of its inputs.
        return pre_activation.tanh()



In [5]:
# Smoke test: build one perceptron with 2 randomly initialised weights and a
# random bias, then run a forward pass on two plain-float inputs.
neuron = Perceptron(2)
x = [9.0, 10.0]
neuron(x)  # last expression in the cell, so the resulting ZeroDimTensor is displayed

ZeroDimTensor(label='', val=-0.9923646169421559, grad=0.0, op='tanh')

In [6]:
class Layer:
    """A group of perceptrons that all receive the same input vector."""

    def __init__(self, num_inputs, num_outputs, nonlin=True):
        """Create ``num_outputs`` perceptrons, each taking ``num_inputs`` inputs.

        Args:
            num_inputs: Number of inputs fed to every perceptron.
            num_outputs: Number of perceptrons (and therefore outputs) in the layer.
            nonlin: Forwarded to every perceptron.
        """
        units = []
        for _ in range(num_outputs):
            units.append(Perceptron(num_inputs, nonlin=nonlin))
        self.perceptrons = units

    def parameters(self):
        """Return the parameters of every perceptron, concatenated in perceptron order."""
        collected = []
        for unit in self.perceptrons:
            for param in unit.parameters():
                collected.append(param)
        return collected

    def __call__(self, x):
        """Run the forward pass: call each perceptron on ``x`` and return the outputs as a list."""
        outputs = []
        for unit in self.perceptrons:
            outputs.append(unit(x))
        return outputs

In [7]:
# Smoke test: a layer of 3 perceptrons, each taking 2 inputs.
layer = Layer(2, 3) # 2 inputs per perceptron, and there is a layer of 3 perceptrons
x = [9.0, 10.0]
# Forward pass: every perceptron sees the same x but has its own random weights
# and bias, so the result is a list of 3 ZeroDimTensor outputs.
layer(x) # output the forward pass for 3 perceptrons each with input values x, random weights, and a random bias

[ZeroDimTensor(label='', val=-0.9994436104411393, grad=0.0, op='tanh'),
 ZeroDimTensor(label='', val=-0.9999999997185669, grad=0.0, op='tanh'),
 ZeroDimTensor(label='', val=0.8107934154239987, grad=0.0, op='tanh')]

In [8]:
class MultiLayerPerceptron:
    """A feed-forward network: a sequence of layers, each feeding the next.

    Every layer except the last applies the perceptrons' non-linearity. The
    last layer is linear, so the network returns raw, unbounded scores; any
    squashing (the training loop in this notebook applies tanh) is left to
    the caller.
    """

    def __init__(self, num_inputs, num_outputs_per_layer):
        """Build the layers.

        Args:
            num_inputs: Size of the input vector.
            num_outputs_per_layer: Iterable of layer widths (list, tuple, ...),
                in order; the last entry is the size of the network's output.
        """
        # sizes = [input_size, hidden_layer1_size, ..., output_layer_size].
        # Unpacking (rather than list concatenation) accepts any iterable of widths.
        sizes = [num_inputs, *num_outputs_per_layer]
        last_index = len(sizes) - 2

        # Consecutive pairs of sizes give each layer's (inputs, outputs), so the
        # number of outputs of one layer always matches the inputs of the next.
        self.layers = [
            Layer(fan_in, fan_out, nonlin=(index != last_index))
            for index, (fan_in, fan_out) in enumerate(zip(sizes, sizes[1:]))
        ]

    def parameters(self):
        """Return the parameters of every layer, concatenated in layer order."""
        return [param for layer in self.layers for param in layer.parameters()]

    def __call__(self, x_inputs):
        """Run the forward pass and return the last layer's output list.

        With no layers, ``x_inputs`` is returned unchanged.
        """
        activations = x_inputs
        for layer in self.layers:
            # The output of one layer becomes the input of the next.
            activations = layer(activations)
        return activations
    

In [9]:
# Smoke test: a network with 3 inputs and layer widths 4, 4 and 1.
x = [1.0, 2.0, -3.0]
nn = MultiLayerPerceptron(3, [4, 4, 1])
nn(x)  # one-element list, because the last layer has a single (linear) perceptron

[ZeroDimTensor(label='', val=-0.4672820014075061, grad=0.0, op='+')]

In [10]:
# Create a binary classifier

# Inputs: training data, one row per example and 3 features per row
xt = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0,  1.0],
    [1.0, 1.0, -1.0]
]

# Desired outputs: one target per example (row) of xt, either -1 or 1
yt = [-1.0, 1.0, 1.0, -1.0]

# Wrap every input feature and every target in a ZeroDimTensor so the loss
# expression is built entirely from graph nodes
xt_tensors = [[ZeroDimTensor(val, label=f"x_feature{j}") for j, val in enumerate(row)] for row in xt]
yt_tensors = [ZeroDimTensor(val, label="target") for val in yt] 

# Create the model: 3 inputs -> hidden layer of 4 perceptrons -> 1 linear output
model = MultiLayerPerceptron(num_inputs=3, num_outputs_per_layer=[4, 1])

# Forward pass with the untrained (randomly initialised) model; each entry is a
# one-element list holding the raw output for that example
yhats = [model(x_row) for x_row in xt_tensors] 

yhats

[[ZeroDimTensor(label='', val=1.1373464153052226, grad=0.0, op='+')],
 [ZeroDimTensor(label='', val=0.14212179811762576, grad=0.0, op='+')],
 [ZeroDimTensor(label='', val=2.119856705177905, grad=0.0, op='+')],
 [ZeroDimTensor(label='', val=0.9356174752091296, grad=0.0, op='+')]]

In [11]:
# Mean squared error of the untrained model over the whole dataset.
# NOTE: this uses the raw network outputs directly, whereas the training loop
# further down applies tanh to each output before computing the loss, so the
# two MSE values are not directly comparable.
total_loss = ZeroDimTensor(0.0, label="total_loss_sum")

for i, y_pred_list in enumerate(yhats):
    y_pred = y_pred_list[0] # Extract the single ZeroDimTensor
    y_true = yt_tensors[i] 
    loss_on_example = (y_pred - y_true)**2  # squared error for this example
    total_loss += loss_on_example  # builds a new graph node; total_loss is rebound to it

# Divide the summed squared error by the number of examples
mse = total_loss / len(xt_tensors)
mse

ZeroDimTensor(label='', val=2.576224689687475, grad=0.0, op='/')

In [12]:
# Get all learnable parameters from the model (every weight and bias).
# For 3 inputs and layer widths [4, 1]: 4 * (3 weights + 1 bias) for the hidden
# layer plus 1 * (4 weights + 1 bias) for the output layer = 21.
params = model.parameters()
print(f"Total learnable parameters: {len(params)}")

Total learnable parameters: 21


In [13]:
# Set learning rate (hyperparameter): the step size used when the training
# loop subtracts learning_rate * gradient from each parameter
learning_rate = 0.05

In [14]:
# Full-batch gradient descent: each epoch runs the whole dataset forward,
# averages the squared error, back-propagates once and updates every parameter.
print("\n--- Training Loop ---")
for epoch in range(100): 
    # Fresh accumulator each epoch; the loss graph is rebuilt from scratch
    total_loss = ZeroDimTensor(0.0, label="total_loss") 

    # Forward pass for the entire dataset (batch gradient descent style)
    yhats = [model(x_row) for x_row in xt_tensors] 
    
    # Calculate loss for each example and sum them up
    for i, y_pred_list in enumerate(yhats):
        # The model has a single output perceptron, so y_pred_list holds one ZeroDimTensor
        y_pred = y_pred_list[0] 
        y_true = yt_tensors[i]
        
        # Use MSE. The model's last layer is linear (raw, unbounded output), so apply tanh here.
        prediction_val = y_pred.tanh() # Apply tanh here to get a prediction between -1 and 1
        loss_on_example = (prediction_val - y_true)**2
        total_loss += loss_on_example

    mse = total_loss / len(xt_tensors)
    
    # backward() resets every gradient in the graph to zero before propagating,
    # so no separate zero-grad step is needed between epochs
    mse.backward()

    # Update parameters using Gradient Descent
    for p in params:
        # Recall, if gradient is positive, it means increasing the weight will increase the error, and vice versa
        # Thus, move in the opposite direction of the gradient.
        p.val -= learning_rate * p.grad 
    
    # Print progress every 10 epochs. mse and yhats were computed before this
    # epoch's parameter update, so they reflect the pre-update model.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: MSE = {mse.val:.6f}")
        # Print a sample prediction for inspection
        print(f"  Sample 0 Pred (raw): {yhats[0][0].val:.4f}, (tanh): {yhats[0][0].tanh().val:.4f}, Target: {yt[0]:.0f}")


--- Training Loop ---
Epoch 0: MSE = 1.757808
  Sample 0 Pred (raw): 1.1373, (tanh): 0.8135, Target: -1
Epoch 10: MSE = 0.213597
  Sample 0 Pred (raw): -0.7924, (tanh): -0.6598, Target: -1
Epoch 20: MSE = 0.068575
  Sample 0 Pred (raw): -1.1629, (tanh): -0.8220, Target: -1
Epoch 30: MSE = 0.037826
  Sample 0 Pred (raw): -1.3541, (tanh): -0.8750, Target: -1
Epoch 40: MSE = 0.025777
  Sample 0 Pred (raw): -1.4746, (tanh): -0.9005, Target: -1
Epoch 50: MSE = 0.019480
  Sample 0 Pred (raw): -1.5612, (tanh): -0.9156, Target: -1
Epoch 60: MSE = 0.015636
  Sample 0 Pred (raw): -1.6282, (tanh): -0.9258, Target: -1
Epoch 70: MSE = 0.013051
  Sample 0 Pred (raw): -1.6828, (tanh): -0.9332, Target: -1
Epoch 80: MSE = 0.011197
  Sample 0 Pred (raw): -1.7286, (tanh): -0.9389, Target: -1
Epoch 90: MSE = 0.009803
  Sample 0 Pred (raw): -1.7680, (tanh): -0.9434, Target: -1
