# Setup

In [1]:
using Pkg

In [2]:
Pkg.activate("..")

[32m[1m  Activating[22m[39m environment at `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`


In [3]:
Pkg.add("BenchmarkTools")
Pkg.add("CUDA")
Pkg.add("SortingAlgorithms")
Pkg.add("Flux")

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Workspace/Re

In [4]:
using BenchmarkTools
using CUDA
using SortingAlgorithms
using Flux
using Test

In [5]:
CUDA.version()

v"11.3.0"

# CUDA example

## Benchmark

In [27]:
N = 10^9

1000000000

In [28]:
x_d = CUDA.fill(1.0f0, N);  # a vector stored on the GPU filled with 1.0 (Float32)
y_d = CUDA.fill(2.0f0, N); # a vector stored on the GPU filled with 2.0

In [29]:
x = fill(1.0f0, N);  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N);

In [30]:
"""
    sequential_add!(y, x)

Add every element of `x` to the matching element of `y`, in place, with one
plain loop over the indices shared by both arrays.

The result is written into `y`; the function itself returns `nothing`.
"""
function sequential_add!(y, x)
    for idx in eachindex(y, x)
        # Indices come from `eachindex(y, x)`, so bounds checks are skipped.
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return
end

# Reset `y`, run the sequential version, and check that every element is now 3.
fill!(y, 2)
sequential_add!(y, x)
# `y` already lives on the host, so it is checked directly: wrapping it in
# `Array(...)` would copy the whole vector first, and the broadcasted `.==`
# would allocate a second, mask-sized temporary.
@test all(==(3.0f0), y)

[32m[1mTest Passed[22m[39m

In [31]:
@benchmark sequential_add!($y, $x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     537.816 ms (0.00% GC)
  median time:      596.160 ms (0.00% GC)
  mean time:        595.937 ms (0.00% GC)
  maximum time:     659.410 ms (0.00% GC)
  --------------
  samples:          9
  evals/sample:     1

In [45]:
Threads.nthreads()

1

In [32]:
"""
    parallel_add!(y, x)

Add every element of `x` to the matching element of `y`, in place, with the
index range split across the available Julia threads by `Threads.@threads`.

The result is written into `y`; the function itself returns `nothing`.
"""
function parallel_add!(y, x)
    Threads.@threads for idx in eachindex(y, x)
        # Each iteration touches a distinct index, so iterations are independent.
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return
end

fill!(y, 2)
parallel_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [33]:
@benchmark parallel_add!($y, $x)

BenchmarkTools.Trial: 
  memory estimate:  576 bytes
  allocs estimate:  8
  --------------
  minimum time:     526.176 ms (0.00% GC)
  median time:      538.213 ms (0.00% GC)
  mean time:        541.256 ms (0.00% GC)
  maximum time:     564.816 ms (0.00% GC)
  --------------
  samples:          10
  evals/sample:     1

In [34]:
"""
    gpu_add1!(y, x)

Kernel body that adds `x` to `y` element by element, in place, walking the
indices `1` through `length(y)` in a single loop.

It contains no thread or block indexing, so whoever executes it performs the
whole loop. Returns `nothing`.
"""
function gpu_add1!(y, x)
    for idx in 1:length(y)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return
end

fill!(y_d, 2)
@cuda gpu_add1!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [35]:
# Launch `gpu_add1!` on `y` and `x` with the default launch configuration and
# wait for it to finish (`CUDA.@sync`), so that a benchmark measures the kernel
# run rather than just the launch. Returns the value produced by `@cuda`.
function bench_gpu1!(y, x)
    return CUDA.@sync @cuda gpu_add1!(y, x)
end

bench_gpu1! (generic function with 1 method)

In [36]:
bench_gpu1!(y_d, x_d)  # run it once to force compilation
CUDA.@profile bench_gpu1!(y_d, x_d)

CUDA.HostKernel{gpu_add1!, Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(CuContext(0x0000000004e9fe50, instance 1ece2041d2db82b3), CuModule(Ptr{Nothing} @0x0000000003cf1200, CuContext(0x0000000004e9fe50, instance 1ece2041d2db82b3)), CuFunction(Ptr{Nothing} @0x0000000007572c80, CuModule(Ptr{Nothing} @0x0000000003cf1200, CuContext(0x0000000004e9fe50, instance 1ece2041d2db82b3))))

In [46]:
@benchmark bench_gpu1!($y_d, $x_d)

BenchmarkTools.Trial: 
  memory estimate:  432 bytes
  allocs estimate:  18
  --------------
  minimum time:     45.170 s (0.00% GC)
  median time:      45.170 s (0.00% GC)
  mean time:        45.170 s (0.00% GC)
  maximum time:     45.170 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

In [38]:
"""
    gpu_add2!(y, x)

Kernel that adds `x` to `y` in place, with the work divided among the threads
of a block: each thread starts at its own `threadIdx().x` and then advances by
`blockDim().x` until it passes `length(y)`.

The block index is not consulted, so the split is per block only.
Returns `nothing`.
"""
function gpu_add2!(y, x)
    # Linear indexing is all this example needs, hence only the `x` component.
    first_i = threadIdx().x
    step = blockDim().x
    for i in first_i:step:length(y)
        @inbounds y[i] = y[i] + x[i]
    end
    return
end

fill!(y_d, 2)
@cuda threads=256 gpu_add2!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [50]:
# Launch `gpu_add2!` on `y` and `x` with 1024 threads and wait for it to finish
# (`CUDA.@sync`), so that a benchmark measures the kernel run rather than just
# the launch. Returns the value produced by `@cuda`.
function bench_gpu2!(y, x)
    return CUDA.@sync @cuda threads=1024 gpu_add2!(y, x)
end

bench_gpu2! (generic function with 1 method)

In [51]:
@benchmark bench_gpu2!($y_d, $x_d)

BenchmarkTools.Trial: 
  memory estimate:  432 bytes
  allocs estimate:  18
  --------------
  minimum time:     344.468 ms (0.00% GC)
  median time:      345.136 ms (0.00% GC)
  mean time:        345.222 ms (0.00% GC)
  maximum time:     346.363 ms (0.00% GC)
  --------------
  samples:          15
  evals/sample:     1

In [60]:
"""
    gpu_add3!(y, x)

Kernel that adds `x` to `y` in place, with the work divided among every thread
of every block in the launch.

Each thread starts at its global position, computed from its block index,
the block size and its thread index, and then advances by the total number of
launched threads (`blockDim().x * gridDim().x`) until it passes `length(y)`.
Returns `nothing`.
"""
function gpu_add3!(y, x)
    threads_per_block = blockDim().x
    # 1-based global position of this thread across all blocks.
    first_i = (blockIdx().x - 1) * threads_per_block + threadIdx().x
    # Total number of threads in the launch.
    step = threads_per_block * gridDim().x
    for i in first_i:step:length(y)
        @inbounds y[i] = y[i] + x[i]
    end
    return
end

numblocks = ceil(Int, N/256)

fill!(y_d, 2)
@cuda threads=256 blocks=numblocks gpu_add3!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [61]:
# Launch `gpu_add3!` on `y` and `x` with enough blocks of `threads` threads to
# give every element of `y` its own thread, and wait for the kernel to finish
# (`CUDA.@sync`) so that a benchmark measures the kernel run rather than just
# the launch.
#
# Keywords:
#   threads - number of threads per block (default 256, the value that was
#             previously hard-coded here).
#
# Returns the value produced by `@cuda`.
function bench_gpu3!(y, x; threads::Integer=256)
    # Integer ceiling division avoids the round trip through floating point.
    # Clamp to one block: an empty `y` would otherwise request zero blocks,
    # which is not a valid launch configuration.
    numblocks = max(1, cld(length(y), threads))
    return CUDA.@sync begin
        @cuda threads=threads blocks=numblocks gpu_add3!(y, x)
    end
end

bench_gpu3! (generic function with 1 method)

In [62]:
@benchmark bench_gpu3!($y_d, $x_d)

BenchmarkTools.Trial: 
  memory estimate:  432 bytes
  allocs estimate:  18
  --------------
  minimum time:     22.365 ms (0.00% GC)
  median time:      24.588 ms (0.00% GC)
  mean time:        24.585 ms (0.00% GC)
  maximum time:     25.374 ms (0.00% GC)
  --------------
  samples:          204
  evals/sample:     1

In [63]:
595.937/24.585

24.239861704291233