In [7]:
import cupy as cp
from cupyx.profiler import benchmark

# Define input arrays
a = cp.arange(10)
b = cp.arange(10, 20)
c = cp.arange(20, 30)

# Define an elementwise computation using @cp.fuse() decorator
@cp.fuse()
def elementwise_computation(x, y, z):
    return cp.sin(x) + cp.cos(y) / cp.sqrt(z)

# Invoke the elementwise computation
bench = benchmark(elementwise_computation, (a, b, c), n_repeat=10)

# Print the result
print(bench.to_str())


elementwise_computation:    CPU:    18.496 us   +/-  1.667 (min:    17.403 / max:    23.193) us


In [8]:
@cp.fuse(kernel_name='squared_diff')
def squared_diff(x, y):
    return (x - y) * (x - y)

bench = benchmark(squared_diff, (b, c), n_repeat=10)

print(bench.to_str())

squared_diff        :    CPU:    19.973 us   +/-  2.339 (min:    18.275 / max:    24.627) us


In [19]:
import cupy as cp
from cupyx import jit

@jit.rawkernel()
def elementwise_square(x, y, size):
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.gridDim.x * jit.blockDim.x
    for i in range(tid, size, ntid):
        y[i] = x[i] * x[i]

size = cp.uint32(2 ** 22)
x = cp.arange(size, dtype=cp.float32)
y = cp.empty((size,), dtype=cp.float32)

elementwise_square((128,), (1024,), (x, y, size))  # RawKernel style
assert (y == x * x).all()

elementwise_square[128, 1024](x, y, size)  # Numba style
assert (y == x * x).all()