In [None]:
from numba import cuda
import numpy as np
import numba

In [None]:
# Sanity check before any GPU work: show the CUDA devices exposed by numba.cuda.
print(cuda.gpus)

In [None]:
def test_function_squares_python(in_array):
    for i in range(len(in_array)):
        in_array[i] = in_array[i] * in_array[i]

    return in_array

In [None]:
# Benchmark input for the pure-Python run: 50 million ones
# (squaring leaves the values unchanged, so only the run time is of interest).
our_array = np.ones(50_000_000, dtype=int)

In [None]:
%%time 
# Time the uncompiled baseline. The function squares our_array in place and
# returns it, so the array is also shown as the cell output.
test_function_squares_python(our_array)

In [None]:
# Rebuild the 50-million-element input of ones for the next timing run.
our_array = np.ones(50_000_000, dtype=int)

In [None]:
@numba.jit
def test_function_squares_cpu(in_array):
    """Square every element of ``in_array`` in place, compiled with ``numba.jit``.

    The loop body matches the pure-Python baseline; only the decorator differs,
    so the timing difference reflects the effect of compilation.

    Args:
        in_array: Indexable array supporting ``len()``; modified in place.

    Returns:
        The same ``in_array``, now holding the squared values.
    """
    n_items = len(in_array)
    for idx in range(n_items):
        element = in_array[idx]
        in_array[idx] = element * element

    return in_array

In [None]:
%%time 
test_function_squares(our_array)

In [None]:
# Much larger input for the compiled CPU run: 5 billion ones.
# NOTE(review): if the platform's default integer is 8 bytes this allocates
# roughly 40 GB of host RAM -- confirm the machine can hold it.
our_array = np.ones(5_000_000_000, dtype=int)

In [None]:
%%time 
# Time the Numba CPU version on the current (larger) our_array; the array is
# squared in place and returned as the cell output.
test_function_squares_cpu(our_array)

In [None]:
@cuda.jit
def test_function_squares_gpu(in_array):
    """CUDA kernel: each thread squares one element of ``in_array`` in place.

    Meant to be launched on a 1D grid of 1D blocks, e.g.
    ``test_function_squares_gpu[blockspergrid, threadsperblock](arr)``.
    Threads whose flattened index is at or beyond ``in_array.size`` do nothing.
    """
    thread_in_block = cuda.threadIdx.x  # thread index within its 1D block
    block_in_grid = cuda.blockIdx.x     # block index within the 1D grid
    block_width = cuda.blockDim.x       # number of threads per block
    # Flattened index of the element handled by this thread.
    idx = block_in_grid * block_width + thread_in_block
    if idx >= in_array.size:
        # Surplus thread past the end of the array: nothing to do.
        return
    element = in_array[idx]
    in_array[idx] = element * element


In [None]:
# Input for the first GPU run: 5 billion ones.
# NOTE(review): with an 8-byte default integer this is roughly 40 GB; the GPU
# presumably needs at least that much device memory to receive it -- confirm
# this cell and the launch below actually succeed on the target hardware.
our_array = np.ones(5_000_000_000, dtype=int)

In [None]:
# Launch configuration for a 1D grid: fixed block width, and as many blocks
# as are needed to give every element of our_array its own thread.
threadsperblock = 32
# Ceiling division, so a partially filled final block is still launched.
blockspergrid = (our_array.size - 1) // threadsperblock + 1


In [None]:
%%time
# Launch the kernel over the whole array using the configuration computed above.
# NOTE(review): our_array is a host NumPy array, so this timing presumably
# includes host<->device copies and, on this first launch, kernel compilation
# -- confirm against the Numba CUDA documentation.
test_function_squares_gpu[blockspergrid, threadsperblock](our_array)

In [None]:
# Smaller input for the CPU-vs-GPU comparison: 500 million ones
# (about 4 GB if the platform's default integer is 8 bytes).
our_array = np.ones(500_000_000, dtype=int)

In [None]:
# Recompute the launch configuration, since our_array has a new size.
threadsperblock = 32
# Ceiling division, so a partially filled final block is still launched.
blockspergrid = (our_array.size - 1) // threadsperblock + 1


In [None]:
%%time
# Time the GPU kernel on the 500-million-element array.
# NOTE(review): the input is a host array, so the measurement presumably
# includes host<->device transfers, not just kernel execution -- confirm.
test_function_squares_gpu[blockspergrid, threadsperblock](our_array)

In [None]:
%%time
# Same launch repeated on the same array and configuration.
# NOTE(review): presumably repeated to get a timing without one-off
# start-up costs such as compilation -- confirm intent.
test_function_squares_gpu[blockspergrid, threadsperblock](our_array)

In [None]:
%%time
# CPU counterpart of the GPU timings above, run on the same our_array
# (squared in place and returned as the cell output).
test_function_squares_cpu(our_array)

In [None]:
@cuda.jit
def test_function_squares_long_gpu(in_array):
    """CUDA kernel: each thread squares its element of ``in_array`` 100 times.

    Same indexing as the single-pass kernel (1D grid of 1D blocks, one element
    per thread), but with 100 successive in-place squarings per element, so
    each thread does more arithmetic per launch.
    Threads whose flattened index is at or beyond ``in_array.size`` do nothing.
    """
    thread_in_block = cuda.threadIdx.x  # thread index within its 1D block
    block_in_grid = cuda.blockIdx.x     # block index within the 1D grid
    block_width = cuda.blockDim.x       # number of threads per block
    # Flattened index of the element handled by this thread.
    idx = block_in_grid * block_width + thread_in_block
    if idx >= in_array.size:
        # Surplus thread past the end of the array: nothing to do.
        return
    # Read and write the array on every pass, as the benchmark intends.
    for _ in range(100):
        element = in_array[idx]
        in_array[idx] = element * element


In [None]:
@numba.jit
def test_function_squares_long_cpu(in_array):
    """Square each element of ``in_array`` in place 100 times (``numba.jit``).

    CPU counterpart of ``test_function_squares_long_gpu``: for every index the
    element is replaced by its own square, repeated 100 times.

    Args:
        in_array: Indexable array supporting ``len()``; modified in place.

    Returns:
        The same ``in_array`` after the repeated squaring.
    """
    n_items = len(in_array)
    for idx in range(n_items):
        # Read and write the array on every pass, as the benchmark intends.
        for _ in range(100):
            element = in_array[idx]
            in_array[idx] = element * element

    return in_array

In [None]:
%%time
# Time the heavier CPU workload (100 squarings per element) on our_array.
# NOTE(review): first call of this function, so the time presumably includes
# JIT compilation -- confirm by re-running.
test_function_squares_long_cpu(our_array)

In [None]:
%%time
# Time the heavier GPU workload with the launch configuration computed earlier.
# NOTE(review): first launch of this kernel on a host array, so the time
# presumably includes compilation and host<->device transfers -- confirm.
test_function_squares_long_gpu[blockspergrid, threadsperblock](our_array)