In [1]:
import torch, casadi_on_gpu as cog

In [2]:
# Number of rows evaluated by a single fk_forward launch.
N = 80000

# Input buffer: N rows of cog.FK_DOF float32 zeros on the CUDA device.
q_all = torch.zeros(N, cog.FK_DOF, dtype=torch.float32, device="cuda")

# The remaining buffers inherit dtype and device from q_all.
# p1 / p2 are 6-element vectors passed as the 2nd and 3rd fk_forward arguments
# (their meaning is defined by casadi_on_gpu, not visible in this notebook).
p1 = q_all.new_zeros(6)
p2 = q_all.new_zeros(6)
out = q_all.new_zeros((N, cog.FK_OUT_DIM))

# Raw handle of PyTorch's current CUDA stream, handed to the launcher.
stream = torch.cuda.current_stream().cuda_stream

# One launch with sync=True (presumably blocks until the kernel has finished --
# confirm against the casadi_on_gpu docs) before the result is displayed.
cog.fk_forward(
    q_all.data_ptr(),
    p1.data_ptr(),
    p2.data_ptr(),
    out.data_ptr(),
    N,
    stream_ptr=stream,
    sync=True,
)

out


tensor([[-8.0000e-02,  5.3072e-08,  3.4450e-02,  0.0000e+00,  0.0000e+00,
         -1.0472e+00],
        [-8.0000e-02,  5.3072e-08,  3.4450e-02,  0.0000e+00,  0.0000e+00,
         -1.0472e+00],
        [-8.0000e-02,  5.3072e-08,  3.4450e-02,  0.0000e+00,  0.0000e+00,
         -1.0472e+00],
        ...,
        [-8.0000e-02,  5.3072e-08,  3.4450e-02,  0.0000e+00,  0.0000e+00,
         -1.0472e+00],
        [-8.0000e-02,  5.3072e-08,  3.4450e-02,  0.0000e+00,  0.0000e+00,
         -1.0472e+00],
        [-8.0000e-02,  5.3072e-08,  3.4450e-02,  0.0000e+00,  0.0000e+00,
         -1.0472e+00]], device='cuda:0')

In [3]:
# Benchmark fk_forward through PyTorch, timed with CUDA events.
# Relies on N, q_all, p1, p2, out and stream from the PyTorch setup cell.

# The raw device pointers do not change between launches, so resolve them
# once instead of on every iteration: the lookups are loop-invariant and
# would otherwise add host-side overhead inside the timed launch loop.
fk_args = (q_all.data_ptr(), p1.data_ptr(), p2.data_ptr(), out.data_ptr(), N)

# Warm up: untimed launches, drained before the timed region starts.
warmup = 50
for _ in range(warmup):
    cog.fk_forward(*fk_args, stream_ptr=stream, sync=False)
torch.cuda.synchronize()

# Time
reps = 500
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
for _ in range(reps):
    cog.fk_forward(*fk_args, stream_ptr=stream, sync=False)
end.record()
# Both events must have completed before elapsed_time() can be queried.
torch.cuda.synchronize()

ms_total = start.elapsed_time(end)  # milliseconds between the two events
ms_per = ms_total / reps
throughput = (N / ms_per) * 1000.0  # rows per second (ms -> s)
print("4 dof forward kinematic model [Pytorch]")
print(f"FK total: {ms_total:.3f} ms for {reps} calls")
print(f"FK per call: {ms_per:.4f} ms")
print(f"Throughput: {throughput:,.0f} eval/s (batch N={N})")


4 dof forward kinematic model [Pytorch]
FK total: 4.134 ms for 500 calls
FK per call: 0.0083 ms
Throughput: 9,676,644,916 eval/s (batch N=80000)


In [4]:
import cupy as cp
import casadi_on_gpu as cog

# CuPy variant of the FK call.
# NOTE: this cell rebinds N, q_all, p1, p2, out and stream, which the PyTorch
# cells earlier in the notebook also use. After running it, re-run the PyTorch
# setup cell before re-running the PyTorch timing cell.
N = 80000
q_all = cp.zeros((N, cog.FK_DOF), dtype=cp.float32)
# p1 / p2 are 6-element vectors passed as the 2nd and 3rd fk_forward arguments
# (their meaning is defined by casadi_on_gpu, not visible in this notebook).
p1 = cp.zeros((6,), dtype=cp.float32)
p2 = cp.zeros((6,), dtype=cp.float32)
out = cp.zeros((N, cog.FK_OUT_DIM), dtype=cp.float32)

# Raw handle of CuPy's current CUDA stream, handed to the launcher.
stream = cp.cuda.get_current_stream().ptr

# `out` is displayed immediately below, so launch with sync=True exactly as the
# PyTorch setup cell does, rather than relying on implicit stream ordering
# between the kernel and the device-to-host copy made for display.
cog.fk_forward(q_all.data.ptr, p1.data.ptr, p2.data.ptr, out.data.ptr,
               N, stream_ptr=stream, sync=True)
out

array([[-7.9999998e-02,  5.3071794e-08,  3.4449995e-02,  0.0000000e+00,
         0.0000000e+00, -1.0472052e+00],
       [-7.9999998e-02,  5.3071794e-08,  3.4449995e-02,  0.0000000e+00,
         0.0000000e+00, -1.0472052e+00],
       [-7.9999998e-02,  5.3071794e-08,  3.4449995e-02,  0.0000000e+00,
         0.0000000e+00, -1.0472052e+00],
       ...,
       [-7.9999998e-02,  5.3071794e-08,  3.4449995e-02,  0.0000000e+00,
         0.0000000e+00, -1.0472052e+00],
       [-7.9999998e-02,  5.3071794e-08,  3.4449995e-02,  0.0000000e+00,
         0.0000000e+00, -1.0472052e+00],
       [-7.9999998e-02,  5.3071794e-08,  3.4449995e-02,  0.0000000e+00,
         0.0000000e+00, -1.0472052e+00]], dtype=float32)

In [5]:
# Benchmark fk_forward through CuPy, timed with CUDA events.
# Relies on N, q_all, p1, p2 and out from the CuPy setup cell.
stream = cp.cuda.get_current_stream().ptr

# The raw device pointers do not change between launches, so resolve them
# once instead of on every iteration: the lookups are loop-invariant and
# would otherwise add host-side overhead inside the timed launch loop.
fk_args = (q_all.data.ptr, p1.data.ptr, p2.data.ptr, out.data.ptr, N)

# Warmup: untimed launches, drained before the timed region starts.
warmup = 50
for _ in range(warmup):
    cog.fk_forward(*fk_args, stream_ptr=stream, sync=False)
cp.cuda.runtime.deviceSynchronize()

# Timing
reps = 500
start = cp.cuda.Event()
end = cp.cuda.Event()

start.record()
for _ in range(reps):
    cog.fk_forward(*fk_args, stream_ptr=stream, sync=False)
end.record()

# Wait for the closing event so the elapsed time can be read.
end.synchronize()

ms_total = cp.cuda.get_elapsed_time(start, end)  # milliseconds between events
ms_per = ms_total / reps
throughput = (N / ms_per) * 1000.0  # rows per second (ms -> s)
print("4 dof forward kinematic model [Cupy]")
print(f"FK total: {ms_total:.3f} ms for {reps} calls")
print(f"FK per call: {ms_per:.4f} ms")
print(f"Throughput: {throughput:,.0f} eval/s (batch N={N})")


4 dof forward kinematic model [Cupy]
FK total: 4.127 ms for 500 calls
FK per call: 0.0083 ms
Throughput: 9,692,176,688 eval/s (batch N=80000)
