In [14]:
import numpy as np
import cupy as cp
npoints = int(1e7)
a = np.arange(npoints, dtype=np.float32)

In [15]:
x_gpu = cp.asarray(a)

In [19]:
x_gpu.max(), a.max()

(array(9999999., dtype=float32), 9999999.0)

In [20]:
x_gpu.min(), a.min()

(array(0., dtype=float32), 0.0)

In [23]:
cp.mean(x_gpu)

array(5000000., dtype=float32)

In [24]:
a.mean()

5000000.0

In [22]:
from cupyx import jit

@jit.rawkernel()
def elementwise_copy(x, y, size):
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.gridDim.x * jit.blockDim.x
    for i in range(tid, size, ntid):
        y[i] = x[i]

size = cp.uint32(2 ** 22)
x = cp.random.normal(size=(size,), dtype=cp.float32)
y = cp.empty((size,), dtype=cp.float32)

elementwise_copy((128,), (1024,), (x, y, size))  # RawKernel style
assert (x == y).all()

elementwise_copy[128, 1024](x, y, size)  #  Numba style
assert (x == y).all()

  cupy._util.experimental('cupyx.jit.rawkernel')


In [10]:
# Python program to
# demonstrate speed comparison
# between cupy and numpy
 
# Importing modules
import cupy as cp
import numpy as np

import time
 
# NumPy and CPU Runtime
cpus = time.perf_counter()
np.ones((1000, 1000, 200))
cpue = time.perf_counter()
print(f"Time consumed by numpy: {cpue - cpus}")
 
# CuPy and GPU Runtime
s = time.perf_counter()
cp.ones((1000, 1000, 200))
e = time.perf_counter()
print(f"\nTime consumed by cupy: {e - s}")

print(f"\nspeed-up is by a factor {(cpue-cpus)/(e-s)}")

Time consumed by numpy: 0.19518351601436734

Time consumed by cupy: 0.008497097995132208

speed-up is by a factor 22.97060903924887


In [12]:
from cupyx.profiler import benchmark
import cupy as cp
import numpy as np
 
# NumPy and CPU Runtime
def cpu_init():
    return np.ones((1000, 1000, 200))
 
# CuPy and GPU Runtime
def gpu_init():
    return cp.ones((1000, 1000, 200))

cpu_bench = benchmark(cpu_init, n_repeat=20)
gpu_bench = benchmark(gpu_init, n_repeat=20)

In [32]:
cpu_bench

cpu_init            :    CPU: 196290.390 us   +/- 1385.358 (min: 194292.732 / max: 199221.589) us     GPU-0: 196364.082 us   +/- 1413.739 (min: 194399.994 / max: 199336.700) us

In [33]:
gpu_bench

gpu_init            :    CPU:    21.039 us   +/-  5.942 (min:    13.415 / max:    40.005) us     GPU-0:  8385.842 us   +/- 597.780 (min:  7681.024 / max:  9244.768) us

In [4]:
import cupy as cp
import numpy as np
from cupyx.profiler import benchmark

# Stable implementation of log(1 + exp(x))
def softplus(x):
    xp = cp.get_array_module(x)
    print("Using:", xp.__name__)
    return xp.maximum(0, x) + xp.log1p(xp.exp(-abs(x)))

x = np.random.random(int(1e5))
x_gpu = cp.asarray(x)

cpu_bench = benchmark(softplus, (x,), n_repeat=10)
gpu_bench = benchmark(softplus, (x_gpu,), n_repeat=10)

Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: numpy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy
Using: cupy


In [6]:
cpu_bench.to_str()

'softplus            :    CPU:  2336.364 us   +/- 83.127 (min:  2089.450 / max:  2374.256) us'

In [7]:
gpu_bench.to_str()

'softplus            :    CPU:    56.202 us   +/-  1.779 (min:    54.793 / max:    60.534) us'