# a very basic neural network formulation and implementation

Note: the implementation uses NumPy only.

the forward process will go as follows: 
$$Z_1 = x W_1 + B_1$$
$$Z_1 \leftarrow \max(0, Z_1)$$
$$Z_2 = Z_1 W_2 + B_2$$
$$\text{output} = \text{softmax}(Z_2)$$

the backward process will go as follows: 
**Step 1: Compute loss gradient w.r.t. output** (cross-entropy loss $L = -\sum_k \text{one\_hot}(y)_k \log \text{output}_k$)
$$\frac{\partial L}{\partial \text{output}} = -\frac{\text{one\_hot}(y)}{\text{output}}$$

**Step 2: Gradient w.r.t. Z₂ (softmax derivative)**
$$\frac{\partial L}{\partial Z_2} = \frac{\partial L}{\partial \text{output}} \cdot \frac{\partial \text{output}}{\partial Z_2} = \text{output} - \text{one\_hot}(y)$$

**Step 3: Gradients w.r.t. W₂ and B₂**
$$\frac{\partial L}{\partial W_2} = Z_1^T \cdot \frac{\partial L}{\partial Z_2}$$
$$\frac{\partial L}{\partial B_2} = \sum \frac{\partial L}{\partial Z_2}$$

**Step 4: Gradient w.r.t. Z₁ (chain rule)**
$$\frac{\partial L}{\partial Z_1} = \left(\frac{\partial L}{\partial Z_2} \cdot W_2^T\right) \odot \frac{\partial \text{ReLU}}{\partial Z_1}$$
$$\frac{\partial L}{\partial Z_1} = \left(\frac{\partial L}{\partial Z_2} \cdot W_2^T\right) \odot (Z_1 > 0)$$

**Step 5: Gradients w.r.t. W₁ and B₁**
$$\frac{\partial L}{\partial W_1} = x^T \cdot \frac{\partial L}{\partial Z_1}$$
$$\frac{\partial L}{\partial B_1} = \sum \frac{\partial L}{\partial Z_1}$$

**Step 6: Parameter updates**
$$W_2 := W_2 - \alpha \frac{\partial L}{\partial W_2}$$
$$B_2 := B_2 - \alpha \frac{\partial L}{\partial B_2}$$
$$W_1 := W_1 - \alpha \frac{\partial L}{\partial W_1}$$
$$B_1 := B_1 - \alpha \frac{\partial L}{\partial B_1}$$

In [2]:
import numpy as np 

In [None]:
class MLP:
    """Two-layer perceptron: linear -> ReLU -> linear -> softmax.

    Trained with full-batch gradient descent on the mean cross-entropy loss.
    Inputs are row-batched: ``X`` has one sample per row, and labels are
    integer class indices.

    Args:
        dims: ``[input dim, hidden dim, output dim]``.
        lr: Learning rate used by :meth:`update`.
    """

    def __init__(self, dims, lr):
        self.dims = dims
        self.lr = lr

        # dims = [input dim, hidden dim, output dim]
        self.W1 = np.random.randn(dims[0], dims[1])
        self.b1 = np.zeros((1, dims[1]))
        self.W2 = np.random.randn(dims[1], dims[2])
        self.b2 = np.zeros((1, dims[2]))

    def activation(self, x):
        """Apply ReLU element-wise."""
        return np.maximum(0, x)

    def softmax(self, x):
        """Return the row-wise softmax of ``x``."""
        # Subtracting the row maximum keeps np.exp from overflowing.
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Run the forward pass and cache what :meth:`backward` needs.

        Args:
            X: Input batch, one sample per row.

        Returns:
            Class probabilities, one row per sample.
        """
        # The input is needed later for dL/dW1, so keep a reference to it.
        self.X = X
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.activation(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.softmax(self.z2)
        return self.a2

    def loss(self, y_true, y_pred):
        """Return the mean cross-entropy of ``y_pred`` against ``y_true``.

        Args:
            y_true: Integer class index per sample.
            y_pred: Predicted class probabilities, one row per sample.
        """
        m = y_true.shape[0]
        picked = y_pred[np.arange(m), y_true]
        # Floor the probability so an underflowed 0 does not produce log(0) = -inf.
        log_likelihood = -np.log(np.maximum(picked, 1e-12))
        return np.sum(log_likelihood) / m

    def backward(self, x):
        """Compute parameter gradients for the batch last passed to forward.

        Args:
            x: Integer class labels of that batch (one per sample). The
                parameter keeps its original name for compatibility; it is
                NOT the network input.

        Returns:
            Tuple ``(dW1, db1, dW2, db2)`` of gradients of the mean loss.
        """
        m = x.shape[0]

        # Softmax + cross-entropy: dL/dZ2 = (probs - one_hot(labels)) / m.
        delta2 = self.a2.copy()
        delta2[np.arange(m), x] -= 1
        delta2 /= m

        dW2 = np.dot(self.a1.T, delta2)
        db2 = np.sum(delta2, axis=0, keepdims=True)

        # Back-propagate through the second linear layer and the ReLU mask.
        delta1 = np.dot(delta2, self.W2.T) * (self.z1 > 0)
        # The first-layer weight gradient uses the cached INPUT batch,
        # not the labels.
        dW1 = np.dot(self.X.T, delta1)
        db1 = np.sum(delta1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def update(self, dW1, db1, dW2, db2):
        """Take one gradient-descent step in place."""
        self.W1 -= self.lr * dW1
        self.b1 -= self.lr * db1
        self.W2 -= self.lr * dW2
        self.b2 -= self.lr * db2

    def train(self, X, y, epochs):
        """Run ``epochs`` full-batch gradient-descent steps on ``(X, y)``.

        Prints the loss every 100 epochs.
        """
        for epoch in range(epochs):
            y_pred = self.forward(X)
            loss = self.loss(y, y_pred)
            dW1, db1, dW2, db2 = self.backward(y)
            self.update(dW1, db1, dW2, db2)

            if epoch % 100 == 0:
                print(f'Epoch {epoch}, Loss: {loss:.4f}')
                

        

In [12]:
# Toy problem: random features with a random binary label per sample.
n_samples, n_features, n_classes = 100, 20, 2
X = np.random.rand(n_samples, n_features)
y = np.random.randint(0, n_classes, size=n_samples)

# 20 -> 10 -> 2 network with a small learning rate.
model = MLP(dims=[n_features, 10, n_classes], lr=0.001)

# Full-batch training on the toy data.
model.train(X, y, epochs=1000)


Epoch 0, Loss: 1.5474
Epoch 100, Loss: 0.9780
Epoch 200, Loss: 0.9054
Epoch 300, Loss: 0.8757
Epoch 400, Loss: 0.8528
Epoch 500, Loss: 0.8312
Epoch 600, Loss: 0.8117
Epoch 700, Loss: 0.7945
Epoch 800, Loss: 0.7797
Epoch 900, Loss: 0.7646


In [18]:
from graphviz import Digraph

def draw_full_computational_graph():
    """Build the computational graph of the two-layer network as a Digraph.

    Nodes come in three visual groups (inputs/parameters, operations,
    intermediate values), each annotated with its forward and/or backward
    formula; the edges follow the forward pass from ``X`` to the loss ``L``.

    Returns:
        The populated ``Digraph`` (nothing is rendered here).
    """
    layout = {
        'rankdir': 'TB',
        'splines': 'spline',
        'nodesep': '0.6',
        'ranksep': '0.8',
    }
    dot = Digraph(format='png', graph_attr=layout)

    # Each group: (default node style, [(node id, label), ...]).
    # The style is applied before the group's nodes are declared, so group
    # order and node order both matter.
    node_groups = [
        # Inputs and parameters.
        (
            {'shape': 'record', 'style': 'rounded,filled', 'fillcolor': 'lightyellow'},
            [
                ('X', '{X | data | grad: ∂L/∂X}'),
                ('y', '{y | labels}'),
                ('W1', '{W1 | params | grad: ∂L/∂W1 = Xᵀ·∂L/∂Z1}'),
                ('b1', '{b1 | params | grad: ∂L/∂b1 = Σ∂L/∂Z1}'),
                ('W2', '{W2 | params | grad: ∂L/∂W2 = A1ᵀ·∂L/∂Z2}'),
                ('b2', '{b2 | params | grad: ∂L/∂b2 = Σ∂L/∂Z2}'),
            ],
        ),
        # Operations.
        (
            {'shape': 'circle', 'style': 'filled', 'fillcolor': 'lightblue',
             'width': '0.5', 'fixedsize': 'true'},
            [
                ('dot1', '*'),
                ('add1', '+'),
                ('relu', 'ReLU'),
                ('dot2', '*'),
                ('add2', '+'),
                ('softmax', 'Softmax'),
                ('loss', 'Loss'),
            ],
        ),
        # Intermediate values with their forward and backward formulas.
        (
            {'shape': 'record', 'style': 'rounded,filled', 'fillcolor': 'lightgrey'},
            [
                ('Z1', "{Z1 | forward: Z1 = X·W1 + b1 | backward: ∂L/∂Z1 = (∂L/∂Z2·W2ᵀ) ⊙ ReLU'(Z1)}"),
                ('A1', "{A1 | forward: A1 = ReLU(Z1) | backward: ∂L/∂A1 = ∂L/∂Z2·W2ᵀ}"),
                ('Z2', "{Z2 | forward: Z2 = A1·W2 + b2 | backward: ∂L/∂Z2 = A2 - y}"),
                ('A2', "{A2 | forward: A2 = softmax(Z2) | backward: ∂L/∂A2 = A2 - y}"),
                ('L', "{L | forward: CrossEntropy | backward: ∂L/∂L = 1}"),
            ],
        ),
    ]
    for style, nodes in node_groups:
        dot.attr('node', **style)
        for node_id, label in nodes:
            dot.node(node_id, label)

    # Forward-pass wiring, listed as (tail, head).
    forward_edges = [
        ('X', 'dot1'),
        ('W1', 'dot1'),
        ('dot1', 'add1'),
        ('b1', 'add1'),
        ('add1', 'Z1'),
        ('Z1', 'relu'),
        ('relu', 'A1'),
        ('A1', 'dot2'),
        ('W2', 'dot2'),
        ('dot2', 'add2'),
        ('b2', 'add2'),
        ('add2', 'Z2'),
        ('Z2', 'softmax'),
        ('softmax', 'A2'),
        ('A2', 'loss'),
        ('y', 'loss'),
        ('loss', 'L'),
    ]
    for tail, head in forward_edges:
        dot.edge(tail, head)

    return dot

# Render
# Build the graph, write it out as a PNG under the base name
# 'computational_graph_full', then open it for viewing.
graph = draw_full_computational_graph()
# NOTE(review): cleanup=True presumably removes the intermediate DOT source
# after rendering -- confirm against the graphviz package docs.
graph.render('computational_graph_full', format='png', cleanup=True)
# NOTE(review): view() appears to render again before opening the viewer;
# confirm whether render(..., view=True) alone would be sufficient.
graph.view()




'computational_graph_full.png'