Some combinations of GPU, Windows, and current CUDA.jl versions have a known bug with dynamic shared memory allocation.

Therefore, this notebook uses static shared memory to run the computation under Windows 10.

In [1]:
using CUDA

"""
    kernel_blelloch_scan_static!(data, n)

Device kernel: overwrite the first `n` elements of `data` with their exclusive prefix sum,
using the up-sweep / down-sweep (Blelloch) scheme on a statically sized shared buffer.

Requirements visible from the code:
- only `threadIdx().x` is used, so the scan is performed by the threads of one block, and
  thread `i` loads and stores element `i` (the launch needs at least `n` threads);
- the shared buffer holds 1024 elements, so `n` must not exceed 1024;
- the pair arithmetic assumes `n` is a power of two (the host wrapper checks this).

Threads with an index above `n` do no loads or stores but still take part in every barrier.
"""
function kernel_blelloch_scan_static!(data, n)
    T = eltype(data)
    sdata = @cuStaticSharedMem(T, 1024)

    tid = threadIdx().x
    t = tid - 1  # 0-based thread index used by the pair arithmetic below

    # Threads beyond `n` own no element. They must NOT return early: every thread of the
    # block has to reach each `sync_threads()` below, so they only skip the memory traffic.
    active = tid <= n
    if active
        sdata[tid] = data[tid]
    end
    sync_threads()

    # upsweep: build partial sums in place; after the last step sdata[n] holds the total
    offset = 1
    while offset < n
        paircount = n ÷ (2 * offset)
        # paircount <= n ÷ 2, so only threads that own an element ever enter this branch
        if t < paircount
            ai = offset * (2t + 1)
            bi = offset * (2t + 2)
            sdata[bi] += sdata[ai]
        end
        offset *= 2
        sync_threads()
    end

    # Replace the total with the identity so the downsweep produces an exclusive scan.
    # `active` keeps thread 1 from writing sdata[0] when n == 0.
    if active && tid == 1
        sdata[n] = zero(T)
    end
    sync_threads()

    # downsweep: push the prefix sums back down the tree
    offset = n ÷ 2
    while offset ≥ 1
        paircount = n ÷ (2 * offset)
        if t < paircount
            ai = offset * (2t + 1)
            bi = offset * (2t + 2)
            tmp = sdata[ai]
            sdata[ai] = sdata[bi]
            sdata[bi] += tmp
        end
        offset ÷= 2
        sync_threads()
    end

    if active
        data[tid] = sdata[tid]
    end
    return
end

"""
    gpu_scan_static!(x::CuArray)

Replace the contents of `x` with its exclusive prefix sum by launching
`kernel_blelloch_scan_static!` with one thread per element.

The length of `x` must be a power of two and at most 1024 (the size of the kernel's
static shared buffer). The launch is not followed by a synchronization here; the value
of the `@cuda` launch expression is returned.

# Throws
- `ArgumentError` if `length(x)` is not a power of two or exceeds 1024.
"""
function gpu_scan_static!(x::CuArray)
    n = length(x)
    max_n = 1024  # length of the static shared buffer declared in the kernel

    # Explicit checks instead of `@assert`: assertions are meant for internal invariants
    # and may be disabled, whereas these conditions guard against invalid caller input.
    ispow2(n) ||
        throw(ArgumentError("length(x) must be a power of two, got $n"))
    n <= max_n ||
        throw(ArgumentError("length(x) must be at most $max_n (single-block scan), got $n"))

    return @cuda threads=n kernel_blelloch_scan_static!(x, n)
end

gpu_scan_static! (generic function with 1 method)

In [2]:
# Quick check: upload 1024 random Float32 values, scan them in place on the GPU,
# then copy the result back to the host and display its first ten entries.
N = 1024
x = rand(Float32, N) |> CuArray
gpu_scan_static!(x)
first(collect(x), 10)

10-element Vector{Float32}:
 0.0
 0.32711983
 0.41333407
 1.3656528
 1.4945819
 2.4301267
 2.891673
 3.301309
 4.133217
 4.390769

CPU serial version for checking and comparison

In [3]:
using BenchmarkTools

# CPU reference exclusive scan (Blelloch-compatible)
"""
    cpu_serial_scan(x::Vector{T}) where T

Return a new `Vector{T}` holding the exclusive prefix sum of `x`: element `i` of the
result is the sum of `x[1]` through `x[i - 1]`, and the first element is `zero(T)`.
An empty input yields an empty output. `x` is not modified.
"""
function cpu_serial_scan(x::Vector{T}) where T
    n = length(x)
    out = Vector{T}(undef, n)
    # Nothing to scan; without this guard the write to out[1] below would be out of bounds.
    n == 0 && return out

    out[1] = zero(T)
    for i in 2:n
        out[i] = out[i - 1] + x[i - 1]
    end
    return out
end

cpu_serial_scan (generic function with 1 method)

Compare

In [7]:
N = 1024           # must be a power of 2; at most 1024 because this is the single-block version
h_input = rand(Float32, N)

# CPU serial scan (reference result)
cpu_ref = cpu_serial_scan(h_input)

# GPU scan: runs in place on the device copy, then the result is copied back to the host
d_input = CuArray(h_input)
gpu_scan_static!(d_input)
gpu_result = collect(d_input)

println("First 10 CPU: ", cpu_ref[1:10])
println("First 10 GPU: ", gpu_result[1:10])
# `≈` rather than `==`: the two versions add the Float32 values in a different order
println("Correct? ", cpu_ref ≈ gpu_result)

println("\n=== Timing ===")

println("CPU:")
@btime cpu_serial_scan($h_input);
println("GPU:")
# Globals are interpolated with `$`, as in the CPU benchmark above, so that the
# measurement does not include the overhead of accessing untyped global variables.
# Note that the timed region also contains the host-to-device copy that resets the input.
@btime begin
    copyto!($d_input, $h_input)   # reset input on GPU
    gpu_scan_static!($d_input)
    synchronize()                 # wait for the kernel to finish before the timer stops
end;

First 10 CPU: Float32[0.0, 0.597388, 1.1095417, 1.9304504, 2.0989053, 2.233097, 2.3040352, 2.3629696, 3.33138, 4.0705714]
First 10 GPU: Float32[0.0, 0.597388, 1.1095417, 1.9304504, 2.0989056, 2.233097, 2.3040352, 2.3629696, 3.33138, 4.0705714]
Correct? true

=== Timing ===
CPU:
  638.922 ns (3 allocations: 4.09 KiB)
GPU:
  26.700 μs (43 allocations: 720 bytes)
