## Example Julia and CUDA Code

In [1]:
using Test
using CUDA
using CUDA: i32

"""
This is a CUDA kernel function.
`a`, `b`, and `c` are `CuArray`s, which are CUDA device arrays.

Each thread will simultaneously run from the start of the kernel.
Using their ids, we can parallelize work amongst the various threads.
An example for vector addition is to give each thread a set of elements to add.
We can do this by creating a step range based on the number of threads assigned and thread blocks.
"""
function vadd!(a, b, c)
    # Kernel contract: writes `a[i] + b[i]` into `c[i]` for every index `i` of `c`
    # and returns `nothing`. Indexing uses `@inbounds`, so `a` and `b` must be at
    # least as long as `c`.

    # 1-based global index of this thread across all blocks. `1i32` keeps the
    # arithmetic in Int32 instead of promoting to Int64.
    index = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
    # Total number of threads in the launch; stepping by it gives each thread a
    # disjoint set of elements regardless of how many blocks were launched.
    stride = gridDim().x * blockDim().x
    # Loop-invariant, so read it once.
    n = length(c)

    i = index
    # Julia arrays are 1-based, so `n` itself is a valid index: the bound must be
    # inclusive or the last element of `c` would never be written.
    while i <= n
        # `@inbounds` skips bounds checking, which is slow inside a kernel.
        @inbounds c[i] = a[i] + b[i]
        i += stride
    end

    return nothing
end

vadd!

In [2]:
"""
    VaddExample()

Run `vadd!` on the GPU over two `Float32` vectors of length `2^20` (one filled with
`0f0`, the other with `1f0`) and return the result copied into a host `Array`.
"""
function VaddExample()
    len = 2^20
    # Threads per block.
    threads_per_block = 256

    # Device-side inputs and an output buffer of matching type and size.
    lhs = CUDA.fill(0f0, len)
    rhs = CUDA.fill(1f0, len)
    out = similar(rhs)

    # Enough blocks that each of the `len` elements gets its own thread.
    block_count = cld(len, threads_per_block)

    # Kernel launches are asynchronous; `CUDA.@sync` waits for completion.
    CUDA.@sync @cuda threads=threads_per_block blocks=block_count vadd!(lhs, rhs, out)

    # Copy the result from GPU memory back to main memory.
    return Array(out)
end

VaddExample (generic function with 1 method)

In [5]:
# Time one full run of the example: device allocation, kernel launch + sync, and
# the copy back to host memory. NOTE: the first call in a session presumably also
# includes compilation time, so re-run the cell for a representative figure.
@time result = VaddExample();

  0.004380 seconds (83 allocations: 4.005 MiB)


In [7]:
# Spot-check the first element: the inputs are filled with 0f0 and 1f0, so the sum is 1f0.
result[1]

1.0f0