In [1]:
# All dependencies for the notebook, one per line: stdlib, numerics, then the GPU stack.
import os
import numpy as np
import torch
import casadi_on_gpu as cog

# Show which compiled extension was picked up and the launcher's signature/docstring.
print(cog.__file__)
print(cog.dynamics_forward.__doc__)

/home/mr-robot/.local/lib/python3.12/site-packages/casadi_on_gpu.cpython-312-x86_64-linux-gnu.so
dynamics_forward(sim_x_ptr: int, sim_u_ptr: int, sim_p_all_ptr: int, dt_ptr: int, f_ext_ptr: int, sim_x_next_all_ptr: int, n_candidates: int, threads_per_block: int = 128, stream_ptr: int = 0, sync: bool = True) -> None

Launch dynamics kernel. Pointers (including dt_ptr) must be GPU addresses.



In [2]:
# Location of the parameter samples. Set the COG_POSTERIOR_PATH environment
# variable to point elsewhere; the default is the original machine-specific path.
path = os.environ.get(
    "COG_POSTERIOR_PATH",
    "/home/mr-robot/sci_ws/casadi-on-gpu/src/posterior.bin",
)
param_dim = cog.DYNAMICS_PARAM_DIM

# The file is read as a flat float64 array; one row of `param_dim` values per candidate.
params = np.fromfile(path, dtype=np.float64)
if params.size == 0 or params.size % param_dim != 0:
    # Fail early with a clear message instead of launching on an empty batch
    # or hitting an opaque reshape error on a truncated file.
    raise ValueError(
        f"{path!r} holds {params.size} float64 values, which is not a positive "
        f"multiple of param_dim={param_dim}"
    )

# Cast to float32 for the GPU.
params = params.reshape(-1, param_dim).astype(np.float32)

In [3]:
# Move the parameter table to the GPU as one contiguous float32 tensor;
# its raw device pointer is handed to the kernel launcher later on.
device = "cuda"
sim_p_all = torch.from_numpy(params).to(device).contiguous()

# Number of parameter candidates (one per row).
N = params.shape[0]

# Sanity checks: tensor lives on the GPU as float32, and no parameter is NaN/inf.
assert sim_p_all.is_cuda and sim_p_all.dtype == torch.float32
assert torch.isfinite(sim_p_all).all()

In [4]:
# Initial conditions mirror the C++ demo: x[k] = 0.1 * (k + 1), u[k] = 0.05 * (k + 1).
sim_x = torch.tensor(
    [0.1 * k for k in range(1, cog.DYNAMICS_STATE_DIM + 1)],
    dtype=torch.float32,
    device=device,
)
sim_u = torch.tensor(
    [0.05 * k for k in range(1, cog.DYNAMICS_CONTROL_DIM + 1)],
    dtype=torch.float32,
    device=device,
)
# f_ext is all zeros, with DYNAMICS_CONTROL_DIM entries.
f_ext = torch.zeros(cog.DYNAMICS_CONTROL_DIM, dtype=torch.float32, device=device)

# Output buffer: N rows of DYNAMICS_OUT_DIM values, written by the kernel.
sim_x_next_all = torch.zeros(N, cog.DYNAMICS_OUT_DIM, dtype=torch.float32, device=device)

# dt is passed by device pointer, so it has to be a GPU tensor as well.
dt = torch.tensor([0.04], dtype=torch.float32, device=device)

# Raw handle of PyTorch's current CUDA stream, so the launch goes onto that stream.
stream = torch.cuda.current_stream().cuda_stream

# Positional arguments follow the dynamics_forward signature:
# sim_x, sim_u, sim_p_all, dt, f_ext, sim_x_next_all, n_candidates.
cog.dynamics_forward(
    sim_x.data_ptr(), sim_u.data_ptr(), sim_p_all.data_ptr(),
    dt.data_ptr(), f_ext.data_ptr(), sim_x_next_all.data_ptr(),
    N,
    threads_per_block=128,
    stream_ptr=stream,
    sync=True,  # block until the kernel is done before reading the results below
)

# Overall finiteness flag plus the first output row, then the full output tensor.
print(torch.isfinite(sim_x_next_all).all(), sim_x_next_all[0])
print(sim_x_next_all)

tensor(False, device='cuda:0') tensor([0.1262, 0.2371, 0.3277, 0.4756, 0.5371, 0.6842, 0.5045, 0.7987, 0.8608,
        0.7026, 1.9532, 1.3321], device='cuda:0')
tensor([[ 0.1262,  0.2371,  0.3277,  ...,  0.7026,  1.9532,  1.3321],
        [ 0.1263,  0.2387,  0.3274,  ..., -3.0556,  1.4371,  1.2419],
        [ 0.1246,  0.2367,  0.3289,  ..., -0.3697,  2.4083,  1.1635],
        ...,
        [ 0.1259,  0.2366,  0.3277,  ..., -1.2767,  2.0363,  1.2118],
        [ 0.1246,  0.2362,  0.3288,  ..., -0.3268,  2.8331,  1.2396],
        [ 0.1269,  0.2373,  0.3270,  ..., -1.0844,  1.5446,  1.2502]],
       device='cuda:0')


In [5]:
# Flag every non-finite entry (NaN or +/-inf) in the output.
bad_mask = ~torch.isfinite(sim_x_next_all)

# Indices of rows containing at least one non-finite value.
# flatten() always yields a 1-D index tensor; squeeze() would collapse a single
# hit to a 0-d tensor, making tolist() return a bare int instead of a list.
bad_rows = bad_mask.any(dim=1).nonzero().flatten()

print("num bad rows:", bad_rows.numel())
print("first bad rows:", bad_rows.tolist())

# Non-finite rows are attributed to invalid combinations of sim parameters.

num bad rows: 27
first bad rows: [5335, 14711, 14948, 20024, 23836, 26868, 27361, 30188, 30728, 31097, 32035, 32367, 33293, 34664, 38133, 38204, 41643, 49828, 54433, 55219, 56256, 57265, 58309, 59475, 71954, 76139, 77676]


In [6]:
import torch

# Benchmark configuration.
warmup = 20   # untimed launches, to avoid measuring first-call overhead
reps = 200    # timed launches

# Optional, lock GPU clocks can help on laptops, but skip if you do not want to change settings.

# Resolve the raw stream handle once, so the timed loop contains nothing but
# the kernel launches (no per-iteration Python-side stream lookup).
stream = torch.cuda.current_stream().cuda_stream


def _launch_torch() -> None:
    """Enqueue one dynamics_forward launch on `stream` using the PyTorch buffers.

    Passes sync=False so the call does not synchronize internally; the caller
    synchronizes around the timed region instead.
    """
    cog.dynamics_forward(
        sim_x.data_ptr(),
        sim_u.data_ptr(),
        sim_p_all.data_ptr(),
        dt.data_ptr(),
        f_ext.data_ptr(),
        sim_x_next_all.data_ptr(),
        N,
        threads_per_block=128,
        stream_ptr=stream,
        sync=False,
    )


# Warmup runs
for _ in range(warmup):
    _launch_torch()

# Drain the warmup work so it cannot leak into the timed region.
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

# Both events are recorded on the current stream, bracketing the launches.
start.record()
for _ in range(reps):
    _launch_torch()
end.record()

# elapsed_time() requires both events to have completed.
torch.cuda.synchronize()

ms = start.elapsed_time(end)
ms_per_call = ms / reps
evals_per_s = (N / ms_per_call) * 1000.0  # N evaluations per call, ms -> s
print("33 parameter forward dynamics model with 12 states [Pytorch]")
print(f"Total: {ms:.3f} ms for {reps} calls")
print(f"Per call: {ms_per_call:.4f} ms")
print(f"Throughput: {evals_per_s:,.0f} eval/s  (batch N={N})")


33 parameter forward dynamics model with 12 states [Pytorch]
Total: 1334.619 ms for 200 calls
Per call: 6.6731 ms
Throughput: 11,988,438 eval/s  (batch N=80000)


In [7]:
import numpy as np
import cupy as cp
import casadi_on_gpu as cog

# NOTE: this cell rebinds sim_p_all, sim_x, sim_u, dt, f_ext, sim_x_next_all, N
# and stream to CuPy objects. The PyTorch cells above call .data_ptr() on those
# names, so re-run the PyTorch setup cells before re-running them.

param_dim = cog.DYNAMICS_PARAM_DIM

# Load parameters from file (float64 on disk), cast to float32, move to GPU
params_cpu = np.fromfile(path, dtype=np.float64).reshape(-1, param_dim).astype(np.float32)
sim_p_all = cp.asarray(params_cpu)

# Choose how many to benchmark. Clamp to the rows actually available: slicing
# past the end silently yields fewer rows, and launching with a larger N would
# make the kernel read beyond the parameter buffer.
N = min(80000, int(sim_p_all.shape[0]))
sim_p_all = cp.ascontiguousarray(sim_p_all[:N, :])  # first N rows, contiguous

# Inputs
sim_x = cp.asarray([0.1 * (i + 1) for i in range(cog.DYNAMICS_STATE_DIM)], dtype=cp.float32)
sim_u = cp.asarray([0.05 * (i + 1) for i in range(cog.DYNAMICS_CONTROL_DIM)], dtype=cp.float32)
dt    = cp.asarray([0.04], dtype=cp.float32)
f_ext = cp.zeros((cog.DYNAMICS_CONTROL_DIM,), dtype=cp.float32)

# Output
sim_x_next_all = cp.zeros((N, cog.DYNAMICS_OUT_DIM), dtype=cp.float32)

# Raw handle of CuPy's current stream, resolved once outside the timed loop.
stream = cp.cuda.get_current_stream().ptr


def _launch_cupy() -> None:
    """Enqueue one dynamics_forward launch on `stream` using the CuPy buffers.

    Passes sync=False so the call does not synchronize internally; the caller
    synchronizes around the timed region instead.
    """
    cog.dynamics_forward(
        sim_x.data.ptr,
        sim_u.data.ptr,
        sim_p_all.data.ptr,
        dt.data.ptr,
        f_ext.data.ptr,
        sim_x_next_all.data.ptr,
        N,
        threads_per_block=128,
        stream_ptr=stream,
        sync=False,
    )


# Warmup
warmup = 20
for _ in range(warmup):
    _launch_cupy()

# Drain the warmup work so it cannot leak into the timed region.
cp.cuda.runtime.deviceSynchronize()

# Timing
reps = 200
start = cp.cuda.Event()
end = cp.cuda.Event()

# Both events are recorded on the current stream, bracketing the launches.
start.record()
for _ in range(reps):
    _launch_cupy()
end.record()
end.synchronize()  # wait for the end event before reading the elapsed time

ms = cp.cuda.get_elapsed_time(start, end)
ms_per_call = ms / reps
evals_per_s = (N / ms_per_call) * 1000.0  # N evaluations per call, ms -> s

print("33 parameter forward dynamics model with 12 states [CuPy]")
print(f"Total: {ms:.3f} ms for {reps} calls")
print(f"Per call: {ms_per_call:.4f} ms")
print(f"Throughput: {evals_per_s:,.0f} eval/s (batch N={N})")


33 parameter forward dynamics model with 12 states [CuPy]
Total: 1330.460 ms for 200 calls
Per call: 6.6523 ms
Throughput: 12,025,920 eval/s (batch N=80000)
