## Denominator Layout

In [4]:
import numpy as np

class MLPLayer:
    def __init__(self, input_size, output_size, activation='sigmoid'):
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # Initialize weights and biases
        self.weights = np.random.randn(output_size, input_size)
        print('w', self.weights.shape)

        self.biases = np.zeros((output_size, 1))
        print('b', self.biases.shape)
        
    def forward(self, X):
        self.X = X
        print('X', self.X.shape)
        
        self.z = np.dot(self.weights, X) + self.biases
        print('z', self.z.shape)
        
        self.a = self._activate(self.z)
        print('a', self.a.shape)

        return self.a

    def backward(self, dL_da, learning_rate):
        print('dL/da', dL_da.shape)
        
        da_dz = self._activate_derivative(self.z)
        print('da/dz', da_dz.shape)

        dL_dz = da_dz * dL_da
        print('dL/dz', dL_dz.shape)

        dz_dw = self.X.T
        dz_db = 1

        dL_dw = np.dot(dL_dz, dz_dw)
        print('dL/dW', dL_dw.shape)
        
        dL_db = dz_db * np.sum(dL_dz, axis=1, keepdims=True)
        print('dL/db', dL_db.shape)

        dL_da_prev = np.dot(self.weights.T, dL_dz)
        print('dL/dX (l-1)', dL_da_prev.shape)

        # Update weights and biases
        self.weights -= learning_rate * dL_dw
        self.biases -= learning_rate * dL_db

        return dL_da_prev

    def _activate(self, x):
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        elif self.activation == 'relu':
            return np.maximum(0, x)
        elif self.activation == 'tanh':
            return np.tanh(x)
        else:
            raise ValueError("Invalid activation function.")

    def _activate_derivative(self, x):
        if self.activation == 'sigmoid':
            sig = self._activate(x)
            return sig * (1 - sig)
        elif self.activation == 'relu':
            return np.where(x > 0, 1, 0)
        elif self.activation == 'tanh':
            return 1 - np.tanh(x) ** 2
        else:
            raise ValueError("Invalid activation function.")

In [5]:
# Create an MLP layer with input size 3 and output size 2
input_size = 3
output_size = 2

mlp_layer = MLPLayer(input_size, output_size, activation='sigmoid')

# Perform forward pass
inputs = np.array([[0.2, 0.3, 0.4],
                   [0.5, 0.6, 0.7],
                   [0.8, 0.9, 1.0],
                   [1.1, 1.2, 1.3]]).T

output = mlp_layer.forward(inputs)
print("Forward pass output:", output.shape)

w (2, 3)
b (2, 1)
X (3, 4)
z (2, 4)
a (2, 4)
Forward pass output: (2, 4)


In [6]:
# Perform backward pass with gradient of loss w.r.t. output and learning rate
dL_da = np.array([[0.1, 0.2],
                  [0.3, 0.1],
                  [0.05, 0.6],
                  [0.1, 0.4]]).T
learning_rate = 0.1
dL_da_prev = mlp_layer.backward(dL_da, learning_rate)
print("Gradient w.r.t. input:", dL_da_prev)

dL/da (2, 4)
da/dz (2, 4)
dL/dz (2, 4)
dL/dW (2, 3)
dL/db (2, 1)
dL/dX (l-1) (3, 4)
Gradient w.r.t. input: [[-0.11674066 -0.07172005 -0.20094498 -0.09486496]
 [-0.01086257 -0.06146702  0.01070669 -0.01134419]
 [ 0.04315149  0.12159974  0.02324768  0.03943372]]


In [None]:
A = np.random.randn(3,2)
B = np.random.randn(3,2)
np.dot(A,B)

# Numerator layout

In [366]:
import torch
import torch.nn as nn

class TorchMLP(nn.Module):
    def __init__(self, input_size, output_size):
        super(TorchMLP, self).__init__()
        
        self.fc1 = nn.Linear(input_size, output_size)  # First fully connected layer
        self.activation = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)  # Pass the input through the first fully connected layer
        out = self.activation(out)  # Apply the ReLU activation function
        self.out = out
        self.out.retain_grad()
        return out

In [367]:
t_mlp = TorchMLP(input_size, output_size)

In [368]:
print(t_mlp.fc1.weight)
print(t_mlp.fc1.bias)

Parameter containing:
tensor([[ 0.2781,  0.4060,  0.4201],
        [-0.4212, -0.1295,  0.3830]], requires_grad=True)
Parameter containing:
tensor([-0.2347,  0.1003], requires_grad=True)


In [369]:
#L = 1/2n * sum((Y - A)^2)

# dA = dL/dA = d(1/2 * sum((Y - A)^2))/dA
#    = -2 * 1/2 * (Y - A)
#    = A - Y

# dZ = dL/dZ = dL/dA * dA/dZ
#    = (A - Y) * sigmoid_derivative(Z)

# dW = dL/dW = dL/dZ * dZ/dW
#    = X.T * dZ

# db = dL/db = dL/dZ * dZ/db
#    = sum(dZ, axis=0)

In [492]:
import numpy as np

class MLP:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        self.weights = np.random.randn(input_size, output_size)
        self.biases = np.zeros(output_size)
    
    def forward(self, X):
        self.X = X
        self.z = np.matmul(X, self.weights) + self.biases
        self.a = self.sigmoid(self.z)
        return self.a
    
    def backward(self, dA, learning_rate):
        dZ = self.sigmoid_derivative(self.z) * dA        # dL/dZ = dA/dZ * dL/dA
        dW = np.matmul(self.X.T, dZ)                     # dL/dW = dZ/dW * dL/dZ
        db = np.sum(1 * dZ, axis=0)                      # dL/db = dZ/db * dL/dZ
        dX = np.matmul(dZ, self.weights.T)               # dL/dX = dZ/dX * dL/dZ
        
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db
        
        return dX
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def mse_loss(self, output, target):
        return np.mean((output - target) ** 2)

    def mse_loss_derivative(self, predicted, targets):
        return predicted - targets

In [493]:
# Perform forward pass
X = np.random.randn(10, input_size)  # Example input
Y = np.random.randn(10, output_size)  # Example input

In [494]:
input_size = 3
output_size = 2

mlp = MLP(input_size, output_size)

In [495]:
A = mlp.forward(X)
A

array([[0.1972055 , 0.43293981],
       [0.16776044, 0.54691291],
       [0.25696774, 0.65363529],
       [0.48832962, 0.24477543],
       [0.05333731, 0.5989532 ],
       [0.6626236 , 0.78894806],
       [0.87031847, 0.17446047],
       [0.73584926, 0.49145121],
       [0.03747002, 0.48445427],
       [0.94109756, 0.28112193]])

In [496]:
loss = mlp.mse_loss(A, Y)
dA = mlp.mse_loss_derivative(A, Y)
dA

array([[-0.11374033,  1.70354183],
       [ 1.47290517,  0.44575059],
       [-0.28471107, -0.32983539],
       [ 0.67264599,  1.56807743],
       [-0.00514339,  1.61655007],
       [-0.43781747,  1.43233171],
       [ 0.06125633, -0.36610595],
       [-0.24974756, -0.21364742],
       [-0.26749673,  0.65657288],
       [ 1.32684866,  0.34042252]])

In [497]:
learning_rate = 0.1
dX = mlp.backward(dA, learning_rate)
dX

array([[-5.67141215e-01, -9.39809417e-01, -5.11400675e-01],
       [ 5.53874548e-01,  1.26617220e+01,  4.38169234e+00],
       [-1.20134852e-01, -1.30367383e+00, -4.73481706e-01],
       [-3.44965004e+00, -8.91144361e-01, -1.51548710e+00],
       [ 3.52219682e-01, -1.90467960e-02,  1.18360732e-01],
       [-5.45197210e-01,  4.43190500e-02, -1.78304690e-01],
       [ 1.34401952e+00,  5.88109539e-01,  6.70088819e-01],
       [ 6.44106402e-03, -1.09900415e-02, -1.35343434e-03],
       [-3.35005650e-01, -7.60710371e+00, -2.63329331e+00],
       [-4.80771580e-02,  1.32594982e+01,  4.36626101e+00]])

In [482]:
import numpy as np

class MLP:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        self.weights = np.random.randn(input_size, output_size)
        self.biases = np.zeros((1, output_size))
    
    def forward(self, X):
        self.X = X
        self.z = np.matmul(X, self.weights) + self.biases
        self.a = self.sigmoid(self.z)
        return self.a
    
    def backward(self, targets, learning_rate):
        dA = self.mse_loss_derivative(self.a, targets)
        dZ = self.sigmoid_derivative(self.z) * dA        # dL/dz = dA/dz * dL/dA
        dW = np.matmul(self.X.T, dZ)                     # dL/dW = dz/dW * dL/dz
        db = np.matmul(np.ones((1, dZ.shape[0])), dZ)    # dL/db = dz/db * dL/dz
        dX = np.matmul(dZ, self.weights.T)
        
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db
        
        return dX
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def mse_loss(self, output, target):
        return np.mean((output - target) ** 2)

    def mse_loss_derivative(self, predicted, targets):
        return predicted - targets

In [483]:
input_size = 3
output_size = 2
# Create MLP with input size 10 and output size 5
mlp = MLP(input_size, output_size)

In [484]:
output = mlp.forward(X)
output

array([[0.63462094, 0.39361231],
       [0.22967546, 0.24832513],
       [0.42406899, 0.03676409],
       [0.42979041, 0.01736074],
       [0.83922517, 0.68523752],
       [0.09887785, 0.99872105],
       [0.9090212 , 0.96892209],
       [0.09468154, 0.50754518],
       [0.16805481, 0.83964839],
       [0.14557361, 0.00283616]])

In [485]:
dX = mlp.backward(Y, learning_rate)
dX

array([[ -0.67011504,   0.9306511 ,   1.28544575],
       [  0.69752453,   3.07545823,  10.90657995],
       [  3.54412038,  -8.25456994, -16.88841197],
       [ -4.98914012,  12.8603824 ,  27.52927954],
       [  2.11983318,  -1.52224996,   0.23831816],
       [ 10.32569174,   7.28629802,  45.67176307],
       [  0.29188637,   6.82904833,  21.34383405],
       [  2.0996355 ,  -1.45020637,   0.41026106],
       [ -5.33149132,   6.45319285,   7.347319  ],
       [ -8.55133818,  11.95554139,  16.64428419]])

In [438]:
mlp.weights = t_mlp.fc1.weight.detach().numpy().T
mlp.biases = t_mlp.fc1.bias.detach().numpy()

In [374]:
print(mlp.weights.T)
print(mlp.biases.T)

[[ 0.2781462   0.4059974   0.42012388]
 [-0.4212348  -0.12950394  0.38296783]]
[-0.23468414  0.10032433]


In [375]:
output = mlp.forward(X)
output

array([[0.44722921, 0.40476478],
       [0.42615047, 0.2452808 ],
       [0.68599789, 0.36058244],
       [0.38489515, 0.43478932],
       [0.47692172, 0.43691784],
       [0.23973604, 0.25751712],
       [0.4638022 , 0.27973996],
       [0.50765635, 0.51952948],
       [0.38347513, 0.58258832],
       [0.60745182, 0.63928659]])

In [376]:
# Step 4: Forward pass
out = t_mlp(torch.tensor(X, dtype=torch.float32))
out

tensor([[0.4472, 0.4048],
        [0.4262, 0.2453],
        [0.6860, 0.3606],
        [0.3849, 0.4348],
        [0.4769, 0.4369],
        [0.2397, 0.2575],
        [0.4638, 0.2797],
        [0.5077, 0.5195],
        [0.3835, 0.5826],
        [0.6075, 0.6393]], grad_fn=<SigmoidBackward0>)

In [377]:
import torch.optim as optim
# Step 2: Define the loss function
criterion = nn.MSELoss()

# Step 3: Define the optimizer
learning_rate = 0.1
optimizer = optim.SGD(t_mlp.parameters(), lr=learning_rate)

# Step 5: Compute the loss
loss = criterion(out, torch.tensor(Y, dtype=torch.float32))
loss.retain_grad()

loss

tensor(1.3167, grad_fn=<MseLossBackward0>)

In [378]:
# Step 6: Backpropagation
optimizer.zero_grad()  # Clear gradients
loss.backward()  # Compute gradients
optimizer.step()  # Update model parameters

In [379]:
loss.grad

tensor(1.)

In [380]:
t_mlp.out.grad

tensor([[-0.0092, -0.0114],
        [ 0.1356,  0.0200],
        [ 0.1152,  0.0557],
        [ 0.0315,  0.1680],
        [ 0.0461,  0.0894],
        [-0.0040, -0.1300],
        [-0.1246,  0.1879],
        [ 0.2366,  0.1076],
        [ 0.0277,  0.1024],
        [ 0.1990,  0.0499]])

In [381]:
# Perform backward pass
learning_rate = 0.1
dX = mlp.backward(Y, learning_rate)

In [382]:
print(f"A={output.shape}")
print(f"X={X.shape}")
print(f"dA={dA.shape}")
print(f"dX={dX.shape}")
print(f"W={mlp.weights.shape}")
print(f"b={mlp.biases.shape}")
print(f"z={mlp.z.shape}")


A=(10, 2)
X=(10, 3)
dA=(10, 2)
dX=(10, 3)
W=(3, 2)
b=(2,)
z=(10, 2)


In [383]:
print(mlp.weights.T)
print(mlp.biases.T)

[[ 0.2866298   0.36066747  0.3000821 ]
 [-0.05485079  0.28255358  0.50762576]]
[-0.26092786  0.2804039 ]


In [384]:
print(t_mlp.fc1.weight)
print(t_mlp.fc1.bias)

Parameter containing:
tensor([[ 0.2866,  0.3607,  0.3001],
        [-0.0549,  0.2826,  0.5076]], requires_grad=True)
Parameter containing:
tensor([-0.2609,  0.2804], requires_grad=True)


In [254]:
mlp.mse_loss(Y, mlp.a)

0.66351810314361