In [1]:
import numpy as np
from numba import cuda
import numpy.typing as npt

In [2]:
@cuda.jit
def gather_kernel(src: npt.NDArray, dst: npt.NDArray, lookup: npt.NDArray):
    """Gather elements of ``src`` into ``dst``: ``dst[i] = src[lookup[i]]``.

    One thread handles one output slot. The kernel indexes every array with a
    single integer, so it is written for 1-D arrays.

    Args:
        src: Array the values are read from.
        dst: Array the gathered values are written to (modified in place).
        lookup: For each output position ``i``, the index into ``src`` whose
            value should be stored at ``dst[i]``.

    Output slots are left untouched when there is no ``lookup`` entry for them
    (``lookup`` shorter than ``dst``) or when the entry does not address an
    element of ``src``; this avoids out-of-bounds device memory accesses.
    """
    # Absolute 1-D index of this thread within the whole grid
    # (same value as threadIdx.x + blockIdx.x * blockDim.x).
    tid = cuda.grid(1)

    # The launch may round the thread count up past the data size, and
    # lookup is not guaranteed to be as long as dst, so check both.
    if tid < dst.size and tid < lookup.size:
        idx = lookup[tid]
        # Only dereference indices that fall inside src. Negative values down
        # to -src.size are passed to the indexing expression unchanged, as the
        # original code did; anything further out is skipped.
        if idx >= -src.size and idx < src.size:
            dst[tid] = src[idx]  # gather

In [3]:
if __name__ == "__main__":
    # Host-side inputs: the values to read from, and for every output slot
    # the position in `src` whose value should end up there.
    src = np.array([10, 20, 30, 40, 50, 60], dtype=np.int32)
    lookup = np.array([5, 2, 0, 3, 1, 4], dtype=np.int32)
    dst = np.zeros(src.shape, dtype=src.dtype)

    # Stage all three arrays on the device (same order: src, dst, lookup)
    d_src, d_dst, d_lookup = (
        cuda.to_device(host_array) for host_array in (src, dst, lookup)
    )

    # Launch geometry: as many 256-thread blocks as it takes to give every
    # element of dst its own thread (ceiling division).
    block_size = 256
    grid_size = -(-dst.size // block_size)

    # Launch the gather
    gather_kernel[grid_size, block_size](d_src, d_dst, d_lookup)

    # Bring the gathered values back to host memory
    result = d_dst.copy_to_host()

    # Report inputs and output
    print("Source array:", src)
    print("Lookup table (values to gather):", lookup)
    print("Result array:", result)



Source array: [10 20 30 40 50 60]
Lookup table (values to gather): [5 2 0 3 1 4]
Result array: [60 30 10 40 20 50]
