In [None]:
using Pkg

In [2]:
Pkg.activate("..")

[32m[1m  Activating[22m[39m environment at `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`


In [9]:
# Pkg.add("BinomialGPU")
Pkg.add("Distributions")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Project.toml`
 [90m [31c24e10] [39m[92m+ Distributions v0.24.18[39m
[32m[1m  No Changes[22m[39m to `~/Workspace/Repo/Code/SpinGlassExhaustive.jl/Manifest.toml`


In [3]:
using BenchmarkTools
using CUDA
using SortingAlgorithms
using Flux
using Test
using BinomialGPU
using Distributions
using AbstractPlotting
using Makie
using LinearAlgebra



# GPU Binomial disitribution

In [26]:
A = CUDA.zeros(Int, 2^28)
@benchmark rand_binomial!(A, count = 10, prob = 0.5)

BenchmarkTools.Trial: 
  memory estimate:  3.27 KiB
  allocs estimate:  99
  --------------
  minimum time:     2.771 ms (0.00% GC)
  median time:      5.361 ms (0.00% GC)
  mean time:        238.118 ms (0.31% GC)
  maximum time:     497.405 ms (0.30% GC)
  --------------
  samples:          23
  evals/sample:     1

In [21]:
A = Binomial(10, 0.5)
@benchmark rand(A, 2^28)

BenchmarkTools.Trial: 
  memory estimate:  2.00 GiB
  allocs estimate:  3
  --------------
  minimum time:     20.675 s (0.00% GC)
  median time:      20.675 s (0.00% GC)
  mean time:        20.675 s (0.00% GC)
  maximum time:     20.675 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1

In [25]:
@benchmark rand(2^28)

BenchmarkTools.Trial: 
  memory estimate:  2.00 GiB
  allocs estimate:  2
  --------------
  minimum time:     544.070 ms (0.00% GC)
  median time:      628.633 ms (9.14% GC)
  mean time:        653.855 ms (14.24% GC)
  maximum time:     838.481 ms (31.57% GC)
  --------------
  samples:          8
  evals/sample:     1

In [24]:
@benchmark CUDA.rand(2^28)

BenchmarkTools.Trial: 
  memory estimate:  192 bytes
  allocs estimate:  7
  --------------
  minimum time:     683.966 μs (0.00% GC)
  median time:      15.237 ms (0.00% GC)
  mean time:        21.373 ms (0.95% GC)
  maximum time:     83.311 ms (3.89% GC)
  --------------
  samples:          234
  evals/sample:     1

In [27]:
653/21

31.095238095238095

# Collatz Problem

In [35]:
# we need to define zero on tuples
import Base.zero
function zero(::Type{Tuple{X, Y}}) where {X, Y}
    (zero(X), zero(Y))
end

n = Int(1e6)
a = collect(1:n)

# this can run on cpu or gpu!
function collatz_length(x)
    out = 0
    while x > 1
      if x % 2 == 0
          x = x ÷ 2
      else
          x = 3 * x + 1
      end
      out += 1
    end
    out
end

# compute the collatz lengths, sort by them, extract just the starting numbers
# we sort tuples like (key, value) so that key is explicity computed only once
function rank_by_collatz(A)
    B = map(x -> (collatz_length(x), x), A)
    return map(x->x[2], sort!(B, rev=true))
end


rank_by_collatz (generic function with 1 method)

In [36]:
# CPU: 
@benchmark begin
    result = rank_by_collatz(a)
end samples=10 evals=1

BenchmarkTools.Trial: 
  memory estimate:  30.52 MiB
  allocs estimate:  6
  --------------
  minimum time:     243.223 ms (0.00% GC)
  median time:      245.986 ms (0.00% GC)
  mean time:        248.211 ms (0.00% GC)
  maximum time:     269.930 ms (0.00% GC)
  --------------
  samples:          10
  evals/sample:     1

In [37]:
# GPU: copy a to GPU, do equivalent work in parallel, copy result back from GPU
@benchmark begin
    c = CuArray(a)
    gpu_result = rank_by_collatz(c)
    result = Array(gpu_result)
end samples=10 evals=1

BenchmarkTools.Trial: 
  memory estimate:  7.63 MiB
  allocs estimate:  99
  --------------
  minimum time:     62.148 ms (0.00% GC)
  median time:      62.536 ms (0.00% GC)
  mean time:        69.392 ms (0.00% GC)
  maximum time:     89.197 ms (0.00% GC)
  --------------
  samples:          10
  evals/sample:     1

# Partial Sort

In [4]:
arr = rand(Float32, 2^12)
cuarr = CuArray(arr);

In [29]:
@benchmark partialsort!(arr, 1:10, rev=true)

BenchmarkTools.Trial: 
  memory estimate:  208 bytes
  allocs estimate:  5
  --------------
  minimum time:     15.074 μs (0.00% GC)
  median time:      15.188 μs (0.00% GC)
  mean time:        15.304 μs (0.00% GC)
  maximum time:     27.958 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [5]:
# CUDA.allowscalar(false)
# @benchmark partialsort!(cuarr, 1:10, rev=true)
@benchmark CUDA.@sync(partialsort!(cuarr, 1:10))

└ @ GPUArrays /home/dk/.julia/packages/GPUArrays/Z5nPF/src/host/indexing.jl:64


BenchmarkTools.Trial: 
  memory estimate:  777.56 KiB
  allocs estimate:  8297
  --------------
  minimum time:     42.614 ms (0.00% GC)
  median time:      44.128 ms (0.00% GC)
  mean time:        44.731 ms (0.10% GC)
  maximum time:     50.668 ms (0.00% GC)
  --------------
  samples:          112
  evals/sample:     1

In [48]:
# sort!(a)

In [None]:
arr = rand(Float32, n)
cushared = cu(shared)

@benchmark CUDA.@sync(partialsort!(cushared, 10))

In [65]:
AbstractPlotting.inline!(true)

a = rand(Float32, 40000)
c = CuArray(a)
partialsort!(c, 1:1000)
lines(c)

FigureAxisPlot()

In [None]:
function quicksort_kernel(A, lo, hi)
    # blockDim().x is the size of the thread block
    # threadIdx().x returns the id of the thread that calls it
    swap = @cuDynamicSharedMem(T, blockDim().x)
    stride = L <= blockDim().x ? Int32(1) : Int32(L ÷ blockDim().x)
    brick_sort(A, lo, hi, swap, stride)
    if hi - lo <= blockDim().x
        return
    end
    pivot = A[lo + stride * blockDim().x ÷ 2]
    # TODO: partition
    # TODO: launch left/right quicksort_kernel
end


arr = rand(Float32, n)
cuarr = cu(arr)

@benchmark quicksort_kernel(cuarr, 0, length(cuarr))

# Eigenvalues

In [9]:
A_full = Symmetric(rand(3000, 3000));

In [11]:
@benchmark eigen(A_full)

BenchmarkTools.Trial: 
  memory estimate:  207.07 MiB
  allocs estimate:  17
  --------------
  minimum time:     3.866 s (0.27% GC)
  median time:      3.884 s (0.15% GC)
  mean time:        3.884 s (0.15% GC)
  maximum time:     3.902 s (0.03% GC)
  --------------
  samples:          2
  evals/sample:     1

In [13]:
A = Symmetric(rand(3000,3000));
A_cuda = CuArray{Float64}(A);
@benchmark CUDA.@sync CUDA.CUSOLVER.syevd!('V','U', A_cuda)

BenchmarkTools.Trial: 
  memory estimate:  1.09 KiB
  allocs estimate:  28
  --------------
  minimum time:     721.569 ms (0.00% GC)
  median time:      724.177 ms (0.00% GC)
  mean time:        723.902 ms (0.00% GC)
  maximum time:     725.986 ms (0.00% GC)
  --------------
  samples:          7
  evals/sample:     1

In [18]:
CUDA.rand(2)

2-element CuArray{Float32, 1}:
 0.7219676
 0.8175036