In [1]:
# the custom model
import torch
import torch.nn as nn
import custom_cpp
from custom import (
    CustomLinear, 
    CustomReLU
)

In [2]:
# test the matrix_multiply function
a = torch.tensor(
    [[1., 2., 3.],
     [2., 3., 4.]]
).to('mps')
b = torch.tensor(
    [[4., 5.],
     [6., 7.],
     [8., 9.]]
).to('mps')

result = custom_cpp.matrix_multiply(a, b)
result_2 = a @ b

result == result_2

tensor([[True, True],
        [True, True]], device='mps:0')

In [3]:
result

tensor([[40., 46.],
        [58., 67.]], device='mps:0')

In [4]:
# try the Relus

sr = nn.functional.relu(a)

cr = custom_cpp.relu(a)

In [5]:
cr

tensor([[1., 2., 3.],
        [2., 3., 4.]], device='mps:0')

In [6]:
(cr == sr).sum()

tensor(6, device='mps:0')

In [7]:
sr.numel()

6

In [9]:
a = torch.tensor(
    [[1., 2., 3.],
     [2., 3., 4.],
     [2., 3., 4.],
     [2., 3., 4.],
     [-2., -3., -4.],
     [2., 3., 4.]]
).to('mps')

b = torch.tensor(
    [[1., 2., 3.]]
).to('mps')

b = b.unsqueeze(0)

sr = a + b
test_length = 1000

wrong = []
right = []
for i in range(0, test_length):
    cr = custom_cpp.matrix_add(a, b)
    if (cr == sr).sum() != sr.numel():
        wrong.append(1)
    else: 
        right.append(1)

print(f"{(len(wrong)/test_length)*100}% wrong, and {(len(right)/test_length)*100}% right")

0.0% wrong, and 100.0% right


In [10]:
cr

tensor([[ 2.,  4.,  6.],
        [ 3.,  5.,  7.],
        [ 3.,  5.,  7.],
        [ 3.,  5.,  7.],
        [-1., -1., -1.],
        [ 3.,  5.,  7.]], device='mps:0')

In [12]:
a = torch.tensor([[1., 2., 3.]]).to('mps')
cr = custom_cpp.relu(a)


In [13]:
a.sum(0)

tensor([1., 2., 3.], device='mps:0')

In [15]:
with torch.autograd.profiler.profile() as prof:
    output = custom_cpp.relu(a)
print(prof.key_averages().table(sort_by="self_cpu_time_total"))

-----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
            aten::copy_        96.43%       8.218ms        97.02%       8.268ms       4.134ms             2  
           aten::select         0.93%      79.000us         1.00%      85.000us      42.500us             2  
         aten::_to_copy         0.69%      59.000us        98.06%       8.357ms       4.178ms             2  
        aten::expand_as         0.50%      43.000us         0.59%      50.000us      25.000us             2  
               aten::to         0.47%      40.000us        98.53%       8.397ms       2.099ms             4  
    aten::empty_strided         0.38%      32.000us         0.38%      32.000us      10.667us             3  
          

STAGE:2024-02-19 21:55:25 85160:7635471 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-19 21:55:25 85160:7635471 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-19 21:55:25 85160:7635471 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [16]:
prof

[<FunctionEvent id=1 name=aten::empty device_type=DeviceType.CPU node_id=-1 cpu_time=19.000us start_us=332 end_us=351 cpu_children=[] cuda_time=0.000us name=aten::empty thread=1 input_shapes=[] cpu_memory_usage=0 cuda_memory_usage=0 is_async=False is_remote=False seq_nr=-1 is_legacy=False>, <FunctionEvent id=2 name=aten::select device_type=DeviceType.CPU node_id=-1 cpu_time=71.000us start_us=393 end_us=464 cpu_children=[3] cuda_time=0.000us name=aten::select thread=1 input_shapes=[] cpu_memory_usage=0 cuda_memory_usage=0 is_async=False is_remote=False seq_nr=0 is_legacy=False>, <FunctionEvent id=3 name=aten::as_strided device_type=DeviceType.CPU node_id=-1 cpu_time=4.000us start_us=450 end_us=454 cpu_children=[] cuda_time=0.000us name=aten::as_strided thread=1 input_shapes=[] cpu_memory_usage=0 cuda_memory_usage=0 is_async=False is_remote=False seq_nr=-1 is_legacy=False>, <FunctionEvent id=4 name=aten::fill_ device_type=DeviceType.CPU node_id=-1 cpu_time=8.000us start_us=501 end_us=509

In [17]:
cr = custom_cpp.relu(a)

In [18]:
cr

tensor([[1., 2., 3.]], device='mps:0')

In [19]:
import torch

# Settings
num_features = 1
num_samples = 100  # Number of data points
noise_factor = 0.1  # Noise factor for output data

# Generate data for a single feature
inputs = torch.linspace(-1, 1, steps=num_samples).unsqueeze(1)  # Shape: [num_samples, 1]

# Add a little noise to inputs
inputs += torch.randn(inputs.shape) * noise_factor

# Normalize and center the input data
inputs_normalized = (inputs - inputs.mean()) / inputs.std()

# Create a simple linear relationship (y = mx + b) with some noise
m = torch.tensor([2.0])  # Slope
b = torch.tensor([1.0])  # Intercept

# Generate the target output with noise
targets = m * inputs_normalized + b
targets += torch.randn(targets.shape) * noise_factor  # Adding noise

target_mean = targets.mean()
shifted_targets = targets - target_mean

# Split into training and testing sets
train_inputs = inputs_normalized[:10]  # 80% for training
train_outputs = shifted_targets[:10]
test_inputs = inputs_normalized[90:]  # 20% for testing
test_outputs = shifted_targets[90:]

train_inputs = train_inputs.to('mps')
train_outputs = train_outputs.to('mps')


In [20]:
c = CustomReLU()

In [21]:
c.to('mps')

CustomReLU(MPS-based ReLU)

In [22]:
t = c(train_inputs)

In [23]:
train_inputs

tensor([[-1.6126],
        [-1.8104],
        [-1.4046],
        [-1.3656],
        [-1.4613],
        [-1.6235],
        [-1.4803],
        [-1.3303],
        [-1.2234],
        [-1.6529]], device='mps:0')

In [24]:
t

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], device='mps:0')

In [25]:
from torch.nn import ReLU

In [26]:
r = ReLU()
r.to('mps')

ReLU()

In [27]:
tt = r(train_inputs)

In [28]:
tt

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], device='mps:0')

In [29]:
tt.dtype

torch.float32

In [31]:
# the custom model
import torch
import torch.nn as nn
from custom import CustomReLU

cr = CustomReLU()
cr.to('mps')

sr = nn.LeakyReLU()
sr.to('mps')

test_input = torch.tensor([[-1., 1., -1., 1., -1]], requires_grad=True).to('mps')

cr_output = cr(test_input)

sr_output = sr(test_input)

cr_output == sr_output

tensor([[False,  True, False,  True, False]], device='mps:0')

In [32]:
sr_output

tensor([[-0.0100,  1.0000, -0.0100,  1.0000, -0.0100]], device='mps:0',
       grad_fn=<LeakyReluBackward0>)

In [33]:
import torch
import torch.nn as nn
from custom import CustomReLU

# Initialize both ReLU implementations and move them to the appropriate device
cr = CustomReLU().to('mps')
sr = nn.LeakyReLU().to('mps')

# Prepare a test input tensor with requires_grad=True to track gradients
test_input = torch.tensor([[-8000., 1000.6556, -.0005643, 8., 1.000]], requires_grad=True).to('mps')
test_input.retain_grad()
# Forward pass through CustomReLU
cr_output = cr(test_input)
# Perform a backward pass through CustomReLU
cr_output.sum().backward()  # Use sum() to ensure scalar output for backward

# Save the gradient of the input tensor after CustomReLU backward pass
cr_grad = test_input.grad.clone()

# Zero out gradients in test_input for a fresh backward pass
test_input.grad.zero_()

# Forward pass through PyTorch's LeakyReLU
sr_output = sr(test_input)
# Perform a backward pass through PyTorch's LeakyReLU
sr_output.sum().backward()  # Use sum() to ensure scalar output for backward

# Save the gradient of the input tensor after LeakyReLU backward pass
sr_grad = test_input.grad.clone()

# Compare the gradients from both backward passes
are_gradients_equal = torch.equal(cr_grad, sr_grad)
print(f"Are gradients equal? {are_gradients_equal}")


Are gradients equal? False


In [34]:
cr_grad

tensor([[0., 1., 0., 1., 1.]], device='mps:0')

In [35]:
import torch
from torch.autograd import gradcheck
from custom import CustomLinearFunction  # Assuming this is your custom function

# Convert to float32 for single precision
input_features = torch.randn((10, 3), dtype=torch.float64, requires_grad=True)
weight = torch.randn((2, 3), dtype=torch.float64, requires_grad=True)

# Move your tensors to the appropriate device
# = input_features#.to('mps')
#weight = weight#.to('mps')

# Use a higher epsilon and atol because float32 is less precise than float64
test = gradcheck(CustomLinearFunction.apply, (input_features, weight), eps=1e-3, atol=1e-2, raise_exception=True)
print(test)


RuntimeError: input must be a MPS tensor