## Download Example Data

In [1]:
# ! FILEID="1OO0tUguZMyQ1d37K7F9jiwV7mm_z2yuD" && FILENAME="example_data.npy" && wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$FILEID -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$FILEID" -O $FILENAME && rm -rf /tmp/cookies.txt

## Import Numba CUDA

In [2]:
from numba import cuda
import numpy as np
import math

In [3]:
data = np.load('example_data.npy')

## Numba (CUDA Python)

In [7]:
@cuda.jit
def cuda_ridge_detection(f, count, thres):
    start_i, start_j = cuda.grid(2)
    stride_i, stride_j = cuda.gridsize(2)
    for i in range(start_i, f.shape[0], stride_i):
        for j in range(start_j, f.shape[1], stride_j):
            if (
                i > 0
                and j > 0
                and i < (f.shape[0] - 1)
                and j < (f.shape[1] - 1)
                and f[i, j] > thres
                and ~math.isnan(f[i, j])
            ):
                step_i = i
                step_j = j
                for k in range(1000):
                    if (
                        step_i == 0
                        or step_j == 0
                        or step_i == (f.shape[0] - 1)
                        or step_j == (f.shape[1] - 1)
                    ):
                        break
                    index = 4
                    vmax = -np.inf
                    for ii in range(3):
                        for jj in range(3):
                            if f[step_i + ii - 1, step_j + jj - 1] > vmax:
                                vmax = f[step_i + ii - 1, step_j + jj - 1]
                                index = jj + 3 * ii
                    if index == 4 or vmax == f[step_i, step_j] or math.isnan(vmax):
                        break
                    row = int(index / 3)
                    col = index % 3
                    cuda.atomic.add(count, (step_i - 1 + row, step_j - 1 + col), 1)
                    step_i = step_i - 1 + row
                    step_j = step_j - 1 + col

In [8]:
def test_func(data):
    device_data = cuda.to_device(data)
    device_results = cuda.device_array_like(device_data)
    cuda_ridge_detection[(8, 8), (8, 32)](device_data, device_results, 0)
    cuda_results = device_results.copy_to_host()
    return cuda_results

In [9]:
cuda_results = test_func(data)
np.testing.assert_almost_equal(results, cuda_results)

In [10]:
%timeit -r 7 -n 1000 test_func(data)

1.67 ms ± 8.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Speedup by 200,000+ times!!!!!

- CPU-based solution cost 366 seconds (366000 ms)
- CUDA Python solution cost 0.00167 seconds (1.67 ms)

In [11]:
366000 / 1.67

219161.6766467066