In [None]:
import numba
import cupy as cp
import numpy as np
from random import randint

In [None]:
# Build the benchmark inputs (2000 integers each): a plain Python list, a NumPy
# array, and two device-side copies of that NumPy array (CuPy and Numba CUDA).
a_py = [randint(1, 100) for _ in range(2000)]
a_np = np.random.randint(low=1, high=100, size=2000)
a_cp = cp.asarray(a_np)
a_cuda = numba.cuda.to_device(a_np)

In [None]:
## PURE PYTHON IMPLEMENTATION

In [None]:
def double_all_elements_pure(py_list):
  """Return a new list with every element of ``py_list`` doubled.

  The input list is left untouched. Doubling in place would make repeated
  calls (e.g. under ``%timeit``) operate on ever-growing values, so each run
  would measure different work than the previous one.

  Args:
    py_list: iterable of numbers (a plain Python list in this notebook).

  Returns:
    A new list where each element is twice the corresponding input element.
  """
  return [element * 2 for element in py_list]


In [None]:
%timeit double_all_elements_pure(a_py)

1000 loops, best of 5: 342 µs per loop


In [None]:
# NUMPY IMPLEMENTATION(s)

In [None]:
def double_all_elements_np(np_arr):
  """Return the result of multiplying ``np_arr`` by two (input is not modified)."""
  doubled = np_arr * 2
  return doubled

In [None]:
%timeit double_all_elements_np(a_np)

The slowest run took 47.07 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 1.77 µs per loop


In [None]:
@np.vectorize
def double_element_np(element):
  """Return twice a single element; ``np.vectorize`` maps this over an array."""
  doubled = element * 2
  return doubled

In [None]:
%timeit double_element_np(a_np)

1000 loops, best of 5: 262 µs per loop


In [None]:
# CUPY IMPLEMENTATION(s)

In [None]:
def double_all_elements_cp(cp_arr):
  """Return the result of multiplying ``cp_arr`` by two (input is not modified)."""
  doubled = cp_arr * 2
  return doubled

In [None]:
%timeit double_all_elements_cp(a_cp)

The slowest run took 11.66 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 20.9 µs per loop


In [None]:
# Scalar doubling function wrapped with cp.vectorize so that it can be called
# on a whole array (the notebook calls it with a_cp).
@cp.vectorize
def double_element_cp(element):
  # Per-element operation: multiply the value by two.
  return element * 2

In [None]:
%timeit double_element_cp(a_cp)

The slowest run took 136.11 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 5: 9.88 µs per loop


In [None]:
# NUMBA IMPLEMENTATION(s)

In [None]:
@numba.cuda.jit
def double_all_elements_cuda(cuda_arr):
    """CUDA kernel: each thread doubles, in place, the element at its own index."""
    idx = numba.cuda.grid(1)
    # Threads whose index falls past the end of the array have nothing to do.
    if idx >= cuda_arr.size:
        return
    cuda_arr[idx] *= 2

In [None]:
# Launch configuration: threads per block, and enough blocks so that the grid
# has at least one thread per element of a_cuda.
threadsperblock = 32
# Integer ceiling division keeps the block count exact and a plain Python int
# (no float round-trip through np.ceil, no NumPy scalar in the launch config).
blockspergrid = (a_cuda.shape[0] + threadsperblock - 1) // threadsperblock

%timeit double_all_elements_cuda[blockspergrid, threadsperblock](a_cuda)

The slowest run took 6.38 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 98.4 µs per loop


In [None]:
@numba.cuda.jit
def double_all_elements_cuda(cuda_arr):
    """CUDA kernel: double every element of ``cuda_arr`` in place.

    Uses a grid-stride loop: each thread starts at its own global index and
    advances by the total number of threads in the grid, so the whole array is
    processed even when the grid has fewer threads than there are elements.
    """
    start = numba.cuda.grid(1)
    stride = numba.cuda.gridsize(1)

    # range() already stops before cuda_arr.size, so no extra bounds check is
    # needed inside the loop.
    for i in range(start, cuda_arr.size, stride):
        cuda_arr[i] *= 2

In [None]:
%timeit double_all_elements_cuda[blockspergrid, threadsperblock](a_cuda)

The slowest run took 1576.09 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 101 µs per loop


In [None]:
# Much longer compilation time but the same speed afterwards, so not a good tradeoff here.

In [None]:
@numba.vectorize(['int64(int64)'], target='cuda')
def double_element_cuda(element):
  """Return twice a single int64 element; compiled by Numba for the CUDA target."""
  doubled = element * 2
  return doubled

In [None]:
%timeit double_element_cuda(a_cuda)

1000 loops, best of 5: 525 µs per loop


In [None]:
# Food for thought: when compiled for the CPU and run on the NumPy array,
# the same vectorized function performs much better.
@numba.vectorize(['int64(int64)'], target='cpu')
def double_element_cuda(element):
  """Return twice a single int64 element; compiled by Numba for the CPU target.

  Note: despite the ``_cuda`` suffix this version targets the CPU, and it
  rebinds the name previously used by the CUDA-target version.
  """
  doubled = element * 2
  return doubled

%timeit double_element_cuda(a_np)

The slowest run took 24.07 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 878 ns per loop


In [None]:
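# Numba running on the CPU on the NumPy array proves to be the fastest for this specific task (800-900 nanoseconds),
# followed by the simple NumPy implementation (1.8 microseconds),
# with CuPy's vectorized function third (10 microseconds).
# Caveat: the GPU timings above were taken without explicit device synchronization and on only
# 2000 elements, so they likely reflect launch overhead more than compute throughput (TODO confirm).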