In [11]:
# trying to create a small pytorch neural net with the new mat multi and relu operations made in metal shader

In [12]:
# the custom model
import torch
import torch.nn as nn
from my_extension import (
    CustomLinear, 
    CustomReLU
)

class CustomNeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CustomNeuralNet, self).__init__()
        self.layer1 = CustomLinear(input_size, hidden_size)
        self.relu = CustomReLU()
        self.layer2 = CustomLinear(hidden_size, output_size)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x



In [13]:
#3 comparing the custom net with a standard net
class StandardNeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(StandardNeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


In [14]:
# Create both models
input_size = 10
hidden_size = 5
output_size = 1

standard_model = StandardNeuralNet(input_size, hidden_size, output_size)
custom_model = CustomNeuralNet(input_size, hidden_size, output_size)

# Move models to MPS
standard_model.to('mps')
custom_model.to('mps')

# Create the input tensor and target on the MPS device
input_tensor = torch.randn(1, input_size, requires_grad=True).to('mps')
target = torch.randn(1, output_size).to('mps')


In [15]:
# try a forward and backward pass on both models

# Forward pass through the standard model
output_standard = standard_model(input_tensor)
# Compute loss
loss_standard = torch.mean((output_standard - target) ** 2)
# Backward pass
loss_standard.backward()

# Forward pass through the custom model
output_custom = custom_model(input_tensor)
# Compute the same loss
loss_custom = torch.mean((output_custom - target) ** 2)
# Backward pass
loss_custom.backward()


In [16]:
# Compare outputs
print("Output - Standard Model:", output_standard)
print("Output - Custom Model:", output_custom)

# Compare gradients
for (name1, param1), (name2, param2) in zip(standard_model.named_parameters(), custom_model.named_parameters()):
    print(f"Gradients comparison for {name1}: {torch.allclose(param1.grad, param2.grad, atol=1e-6)}")


Output - Standard Model: tensor([[0.2352]], device='mps:0', grad_fn=<LinearBackward0>)
Output - Custom Model: tensor([[0.]], device='mps:0', grad_fn=<CustomLinearFunctionBackward>)
Gradients comparison for fc1.weight: False
Gradients comparison for fc1.bias: False


In [17]:
# look for gradients in the custom model
for name, param in custom_model.named_parameters():
    print(f"Gradients of {name}: {param.grad}")


Gradients of layer1.weight: tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [-0.1570, -0.0344,  0.2542,  0.0716,  0.1698, -0.1733, -0.1381, -0.3383,
         -0.1458,  0.2149],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.1855,  0.0407, -0.3003, -0.0846, -0.2006,  0.2048,  0.1632,  0.3998,
          0.1723, -0.2539],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]], device='mps:0')
Gradients of layer2.weight: tensor([[ 0.0000, -0.3035, -0.1962, -0.5432, -0.0349]], device='mps:0')


In [18]:
for name, param in standard_model.named_parameters():
    print(f"Gradients of {name}: {param.grad}")

Gradients of fc1.weight: tensor([[-0.5128, -0.1124,  0.8300,  0.2338,  0.5545, -0.5660, -0.4510, -1.1050,
         -0.4761,  0.7017],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.5007,  0.1097, -0.8106, -0.2283, -0.5415,  0.5528,  0.4404,  1.0791,
          0.4649, -0.6853],
        [ 0.4101,  0.0899, -0.6639, -0.1870, -0.4435,  0.4528,  0.3607,  0.8839,
          0.3808, -0.5613],
        [-0.5645, -0.1237,  0.9137,  0.2574,  0.6104, -0.6231, -0.4964, -1.2164,
         -0.5241,  0.7725]], device='mps:0')
Gradients of fc1.bias: tensor([-0.4654,  0.0000,  0.4545,  0.3723, -0.5123], device='mps:0')
Gradients of fc2.weight: tensor([[-1.5046,  0.0000, -1.1051, -0.5423, -0.3172]], device='mps:0')
Gradients of fc2.bias: tensor([-1.3956], device='mps:0')


In [19]:
import torch
from torch.autograd import gradcheck
from my_extension import (
    CustomReLUFunction
)

# gradcheck requires double precision
inp = torch.randn(1, 5, dtype=torch.float, requires_grad=True).to('mps')
test = gradcheck(CustomReLUFunction.apply, inp, eps=1e-6, atol=1e-4)
print("Gradient check passed:", test)



GradcheckError: Jacobian mismatch for output 0 with respect to input 0,
numerical:tensor([[-2.6731e+05,  1.5175e+05,  3.2534e+05,  4.5771e+05,  2.3870e+05],
        [-5.0000e+05, -2.0117e+00, -1.4901e+00, -3.2187e+00, -6.7055e-01],
        [-2.3269e+05, -1.5175e+05, -3.2535e+05, -4.5771e+05, -2.3870e+05],
        [ 2.6731e+05, -1.5175e+05, -3.2534e+05, -4.5771e+05, -2.3870e+05],
        [ 5.0000e+05,  2.0117e+00,  1.4901e+00,  3.2187e+00,  6.7055e-01]],
       device='mps:0')
analytical:tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0.]], device='mps:0')


In [1]:
import torchviz

import torch
from torch.autograd import gradcheck
from my_extension import (
    CustomReLUFunction
)


# x = torch.tensor(1., requires_grad=True).to('mps')
# out = CustomReLUFunction.apply(x)
#grad_x, = torch.autograd.grad(out, x, create_graph=True)
#torchviz.make_dot((grad_x, x, out), {"grad_x": grad_x, "x": x, "out": out})

In [2]:
x = torch.randn(1, 5, requires_grad=True).to('mps')

t = CustomReLUFunction.apply(x)

In [6]:
t

tensor([[0., 0., 0., 0., 0.]], device='mps:0',
       grad_fn=<CustomReLUFunctionBackward>)

In [4]:
grad_x, = torch.autograd.grad(t, x, create_graph=True)

RuntimeError: grad can be implicitly created only for scalar outputs

In [10]:
import torchviz

torchviz.make_dot(standard_model)


AttributeError: 'StandardNeuralNet' object has no attribute 'size'

In [9]:
# try some peice of networks separately. 
fc1 = nn.Linear(5, 5)

In [3]:
input_tensor = torch.randn(1, 5, requires_grad=True).to('cpu')
fc1 = nn.Linear(5, 5)
x = fc1(input_tensor)

In [4]:
x

tensor([[ 0.2927, -0.2686, -1.0230,  1.0898,  0.0386]],
       grad_fn=<AddmmBackward0>)

In [16]:
import torch
from my_extension import CustomReLU

input_tensor = torch.randn(1, 5, requires_grad=True).to('mps')
cr = CustomReLU()
x = cr(input_tensor)

In [17]:
x

tensor([[-0.6064, -0.0156, -2.8058, -0.2572, -0.6120]], device='mps:0',
       grad_fn=<CustomReLUFunctionBackward>)

In [21]:
grad_output = torch.ones_like(x)
x.backward(grad_output, retain_graph=True)

In [23]:
x

tensor([[-0.6064, -0.0156, -2.8058, -0.2572, -0.6120]], device='mps:0',
       grad_fn=<CustomReLUFunctionBackward>)

In [2]:
import torch
from my_extension import CustomLinear

input_tensor = torch.randn(5, 5, requires_grad=True).to('mps')
cl = CustomLinear(5, 5).to('mps')
x = cl(input_tensor)

tensor([[ 0.2453, -0.1818,  0.1541, -0.0299, -0.0662],
        [ 0.1551,  0.2177, -0.0136,  0.0114, -0.1537],
        [ 0.9256, -1.1568,  0.7372,  0.2290,  0.4485],
        [ 1.0097, -0.9976,  0.9373,  0.9538,  0.3853],
        [-0.2037,  0.3990, -0.2366, -0.0811,  0.3480]], device='mps:0',
       grad_fn=<CustomLinearFunctionBackward>)