In [6]:
import cupy as cp
from cupyx.profiler import benchmark

# Three consecutive 10-element CuPy ranges: a = [0, 10), b = [10, 20), c = [20, 30)
a, b, c = (cp.arange(start, start + 10) for start in (0, 10, 20))

# The @cp.fuse() decorator wraps the whole expression below as one fused function
@cp.fuse()
def elementwise_computation(x, y, z):
    """Return sin(x) + cos(y) / sqrt(z), evaluated elementwise."""
    sine_term = cp.sin(x)
    ratio_term = cp.cos(y) / cp.sqrt(z)
    return sine_term + ratio_term

# Time the fused function on (a, b, c) over 10 measured repetitions and print
# the timing summary (the computed values themselves are not displayed).
print(benchmark(elementwise_computation, args=(a, b, c), n_repeat=10))


elementwise_computation:    CPU:    69.067 us   +/-  0.907 (min:    68.239 / max:    70.953) us     GPU-0:    74.787 us   +/-  1.575 (min:    73.664 / max:    78.624) us


In [12]:
@cp.fuse(kernel_name='squared_diff')
def squared_diff(x, y):
    """Return the elementwise squared difference (x - y) ** 2."""
    delta = x - y
    return delta * delta

# Time squared_diff on the arrays b and c over 10 measured repetitions.
print(benchmark(squared_diff, args=(b, c), n_repeat=10))

squared_diff        :    CPU:    17.277 us   +/-  0.810 (min:    16.241 / max:    18.565) us     GPU-0:    22.150 us   +/-  1.233 (min:    20.704 / max:    24.576) us


In [19]:
import cupy as cp
from cupyx import jit

# JIT-compiled kernel: stores x[i] * x[i] into y[i] for every index i < size.
@jit.rawkernel()
def elementwise_square(x, y, size):
    # Global index of this thread across the whole 1-D launch.
    first = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    # Total number of launched threads; used as the loop step so that a launch
    # with fewer threads than `size` still visits every element.
    step = jit.gridDim.x * jit.blockDim.x
    for idx in range(first, size, step):
        y[idx] = x[idx] * x[idx]

# 2**22 elements: more than the 128 * 1024 threads launched below, so each
# thread processes several elements inside the kernel's loop.
size = cp.uint32(2 ** 22)
x = cp.arange(size, dtype=cp.float32)
# Start from NaN instead of uninitialised memory: NaN compares unequal to every
# value, so any element a launch fails to write makes the assertion fail.
y = cp.full((size,), float("nan"), dtype=cp.float32)

elementwise_square((128,), (1024,), (x, y, size))  # RawKernel style
assert (y == x * x).all()

# Wipe the previous results first; otherwise this second check would pass on
# the leftover output of the first launch even if this launch wrote nothing.
y.fill(float("nan"))
elementwise_square[128, 1024](x, y, size)  # Numba style
assert (y == x * x).all()