## CUDA Kernels
### Example from CUDA.jl

In [None]:
using CUDA
using KernelAbstractions, CUDAKernels

In [None]:
# Problem size: 2^20 elements per vector.
N = 1 << 20
x_d = CUDA.ones(Float32, N)   # device vector with every entry 1.0f0
y_d = CUDA.fill(2.0f0, N)     # device vector with every entry 2.0f0

# CUDA kernel: add x[i] to y[i] in place, one element per thread.
#
# The 1-based global index is built from the block index, block size and
# thread index. Threads whose index lies past `length(y)` do nothing, so the
# launch may use more threads than elements (e.g. when the length is not a
# multiple of the block size and the block count was rounded up).
function gpu_add!(y, x)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # `@inbounds` removes the bounds check, so surplus threads must be
    # filtered out explicitly or they would write outside the array.
    if index <= length(y)
        @inbounds y[index] += x[index]
    end
    return
end

using Test  # provides `@test`, which is used below before the later `using Test` cell

# One thread per element with 256 threads per block; round the block count up.
numblocks = cld(N, 256)

# Reset y to 2 so the check below does not depend on earlier runs.
fill!(y_d, 2)
# Launch the kernel defined above (the previous name `gpu_add3!` does not exist).
@cuda threads=256 blocks=numblocks gpu_add!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

In [None]:
# Add `x` to `y` element-wise in place via broadcasting, wrapped in
# `CUDA.@sync`. Returns `nothing`.
function add_broadcast!(y, x)
    CUDA.@sync begin
        y .= y .+ x
    end
    return nothing
end
# `@benchmark` returns a `Trial` that can be displayed afterwards; `@btime`
# returns the value of the benchmarked call, which is `nothing` for
# `add_broadcast!`, so `b_base` would not hold any timing information.
b_base = @benchmark add_broadcast!($y_d, $x_d)


# Launch `gpu_add!` over `y` and `x` with 256 threads per block and enough
# blocks to cover `length(y)`; the launch is wrapped in `CUDA.@sync`.
# The value of the `CUDA.@sync` block is returned.
function add_kernel!(y, x)
    threads_per_block = 256
    nblocks = cld(length(y), threads_per_block)
    return CUDA.@sync begin
        @cuda threads=threads_per_block blocks=nblocks gpu_add!(y, x)
    end
end

# `@benchmark` yields a `Trial` object; `@btime` would instead hand back the
# return value of `add_kernel!`, which carries no timing information.
b_ker = @benchmark add_kernel!($y_d, $x_d)

# Show both results for comparison.
display(b_base)
display(b_ker)


In [None]:

# Small problem size so the per-index output below stays readable.
N = 2^4

# KernelAbstractions kernel: add xs[I] to ys[I] in place and print the global
# index handled by each work item.
@kernel function add_kernel_abs!(ys, xs)
    I = @index(Global)
    ys[I] += xs[I]
    # `@print` is KernelAbstractions' backend-independent print. A plain
    # `println` only works on the CPU backend and prevents compilation of the
    # kernel for GPU backends.
    @print(I, "\n")
end

# Run the KernelAbstractions kernel on host arrays with the CPU backend and
# check that every entry of y became 3.
x = ones(Float32, N)
y = fill(2.0f0, N)
kernel = add_kernel_abs!(CPU(), 16)
event = kernel(y, x; ndrange = size(x))
wait(event)
@test y ≈ fill(3.0f0, N)

In [None]:
# Re-create the device vectors with the current value of N.
x_d = CUDA.fill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
y_d = CUDA.fill(2.0f0, N)  # a vector stored on the GPU filled with 2.0

# Instantiate `mul2` for the CUDA backend, run it over every index of A,
# wait for the event, and compare all entries with 2.0.
# NOTE(review): `mul2` is not defined in the visible part of this file; the
# comparison with 2.0 suggests it doubles each element — confirm.
A = CUDA.ones(1024, 1024)
kernel = mul2(CUDADevice(), 16)
event = kernel(A, ndrange=size(A))
wait(event)
all(A .== 2.0)

# NOTE(review): `input` and `output` are not defined before this line in the
# visible file, and `kernel` here is the single-argument `mul2` instance —
# this looks like a pasted template line; confirm before running.
 ev = kernel(input, output, ndrange=size(output))

using CUDAdrv, CUDAnative

# CUDA kernel: increment one element of the 3-d array `x` per thread.
# The first index comes from threadIdx().x, the second from blockIdx().y and
# the third from blockIdx().x.
# NOTE(review): earlier cells also assign a variable called `kernel`; running
# them in the same session as this definition may conflict — confirm.
function kernel(x)
    a, b, c = threadIdx().x, blockIdx().y, blockIdx().x
    x[a, b, c] += 1
    return
end

# Current CUDA.jl requires `undef` for an uninitialised allocation and keyword
# arguments for the launch configuration; the positional `(blocks, threads)`
# tuple is the retired CUDAnative syntax.
# 1024^3 Float32 values need 4 GiB of device memory, and the contents are
# uninitialised, so the kernel increments arbitrary values.
dx = CuArray{Float32,3}(undef, 1024, 1024, 1024)
@cuda threads=1024 blocks=(1024, 1024) kernel(dx)

In [None]:
# Inspect type inference for both host-side wrappers.
@code_warntype add_broadcast!(y_d, x_d)
@code_warntype add_kernel!(y_d, x_d)

# NOTE(review): no type named `Test` is defined in the visible file; after
# `using Test` the name refers to the Test module — confirm what this
# assertion was meant to check.
@assert isbitstype(Test) == true

# Device grid-dimension limits queried through CUDAdrv.
# The bare integers look like pasted REPL output for the preceding call —
# presumably the values returned on the author's device; confirm.
# NOTE(review): `dev` is not defined in the visible part of this file.
using CUDAdrv
attribute(dev, CUDAdrv.MAX_GRID_DIM_X)
2147483647

attribute(dev, CUDAdrv.MAX_GRID_DIM_Y)
65535

attribute(dev, CUDAdrv.MAX_GRID_DIM_Z)

# Launch limits of the active device, queried through CUDA.jl.
# The `julia>` prompts of the pasted REPL transcript were removed so the cell
# parses; trailing comments keep the values recorded in that transcript.
attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)          # 1024
attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)           # 2147483647
attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)

# When a CUDA GPU is available, disallow scalar indexing of GPU arrays.
has_cuda_gpu() && CUDA.allowscalar(false)

# Allocate `output` with the same element type and shape as `input`.
# NOTE(review): `input` is not defined in the visible part of this file.
output = similar(input)

In [None]:
using FoldsCUDA, CUDA, FLoops
using GPUArrays: @allowscalar

# Back to the large problem size: 2^20 elements per vector.
N = 1 << 20
x_d = CUDA.ones(Float32, N)   # device vector with every entry 1.0f0
y_d = CUDA.fill(2.0f0, N)     # device vector with every entry 2.0f0

# Add `xs` to `ys` element-wise in place using an FLoops loop run by the
# executor `ex` (a `CUDAEx()` by default). Returns `ys`.
function floop_map!(ys, xs, ex = CUDAEx())
    # Iterate over the indices of the arguments themselves; the previous
    # version used the globals `x_d` and `y_d`, which silently ignored the
    # arrays passed in.
    @floop ex for i in eachindex(xs, ys)
        @inbounds ys[i] += xs[i]
    end
    return ys
end


# Time `floop_map!` on the device vectors; `$` interpolates the globals into
# the benchmark expression.
b_floops = @btime floop_map!($y_d, $x_d)

In [None]:
using Cuba, Distributions
using BenchmarkTools, Test, Traceur
using HCubature, CUDA
using SpecialFunctions

# Environment checks: the session must have more than one thread; then report
# the thread count and whether a CUDA GPU is available.
@test Threads.nthreads()>1
@show Threads.nthreads()
#@show CUDA.versioninfo()
has_cuda_gpu()




In [None]:
# Settings for the integration runs below.
M = 5                 # number of independent beta random variables
atol = 1e-6           # absolute tolerance passed to the integrator
rtol = 1e-3           # relative tolerance passed to the integrator
nvec = 10_000_000     # passed as `nvec` to cuhre and as `maxevals` to hcubature

# multithread and loop to create product distribution
# Integrand in the in-place `(x, f)` form that is passed to `cuhre` below.
# For every column j of `x`, the first row of `f` is set to the product over
# the rows i of the Beta(1, 2) density evaluated at x[i, j]. Columns are
# processed in parallel with `Threads.@threads`. Returns `nothing`.
function int_thread_el(x, f)
    # Build the distribution once instead of once per element.
    d = Beta(1.0, 2.0)
    f[1, :] .= 1.0
    Threads.@threads for j in axes(x, 2)
        for i in axes(x, 1)
            # Scalar indexing: a 0-dimensional view is unnecessary here and
            # routed the call through the deprecated array method of `pdf`.
            f[1, j] *= pdf(d, x[i, j])
        end
    end
    return nothing
end

In [None]:
# Integrate the threaded integrand with cuhre over M dimensions and one
# component, and print the assigned result.
@show result, err = cuhre(int_thread_el, M, 1, atol=atol, rtol=rtol,nvec=nvec);

In [None]:

# Report how many threads the session is running with.
@show Threads.nthreads()

In [None]:

# NOTE(review): the only `int` method visible in this file takes a single
# argument, so this two-argument call would fail — confirm the intended target.
inth(x)=int(x,zeros(M))

In [None]:
# Time the same integral with hcubature over the unit hypercube [0, 1]^M.
@btime hcubature(int, zeros(M),ones(M); rtol=1e-3, atol=1e-3, maxevals=nvec, initdiv=1)

In [None]:

  # Float32 shape parameters and the constant gamma(a+b) / (gamma(a) * gamma(b)),
  # i.e. the normalisation factor of the Beta(a, b) density.
  a = 1.0f0
  b = 2.0f0
  k = gamma(a+b)/(gamma(a)*gamma(b))
# Upload the scalar as a one-element Float32 device vector. `CuArray(Float32, k)`
# is not a valid constructor call; this matches the form used further down.
k_d = CuArray{Float32}([k])


In [None]:
# Evaluate the density product on a host matrix and on a device matrix.
# NOTE(review): the `beta_pdf_d` method visible in this file only accepts a
# two-dimensional CuArray, so the host-array call would not dispatch — confirm.
display(beta_pdf_d(rand(Float32, 2, 3)))
display(beta_pdf_d(CUDA.rand(Float32, 2, 3)))

In [None]:
# Compare several ways of computing the product of Beta(1, 2) densities at the
# same point: at zeros, and at a random point `yy`.
# NOTE(review): `beta_pdf_d` is called with vectors here, which the visible
# method signature does not accept — confirm.
display(int(zeros(M)))
display(beta_pdf_d(zeros(M)))
yy=rand(M)
# Hand-written density product using k, a and b.
display(prod(k*yy.^(a-1.0f0) .* (1.0f0 .- yy).^(b-1.0f0)))
# The same product via Distributions' `pdf`.
display(prod(pdf(Beta(1.0,2.0),yy)))

display(beta_pdf_d(yy))
display(int(yy))

In [None]:


# Problem constants, declared `const`.
const M = 3                          # number of independent beta random variables
const atol = 1e-6                    # Float64 tolerances
const rtol = 1e-3
const atolf = 1f-6                   # Float32 tolerances
const rtolf = 1f-3
const maxevals = 10_000_000
# Integration bounds on the host ...
const z = zeros(M)
const o = ones(M)
# ... and on the device, as Float32.
const z_d = CUDA.zeros(Float32, M)
const o_d = CUDA.ones(Float32, M)

In [None]:
# Beta shape parameters and the constant gamma(a + b) / (gamma(a) * gamma(b)).
const a = 1.0
const b = 2.0
const k = gamma(a + b) / (gamma(a) * gamma(b))

In [None]:
# One-element Float32 device vector holding k.
k_d=CuArray{Float32}([k])

In [None]:
# Minimal method restricted to two-dimensional CuArrays; the following cells
# use it as a target for the code-inspection macros.
# The body rebinds the local `x` to 1, so a call evaluates to 1.
function m(x::CuArray{T, 2}) where T
   x=1
end

In [None]:
# Random 10×10 Float32 matrix on the GPU, used as the argument for `m`.
yy=CUDA.rand(Float32,10,10)
# `@trace` presumably comes from Traceur, which is loaded above — confirm.
@trace m(yy)

In [None]:
# Type-inferred code for this call.
@code_typed m(yy)

In [None]:
 # Lowered form and generated LLVM IR for the same call.
 @code_lowered m(yy)
@code_llvm m(yy)

In [None]:
# Product over each column of the Beta(a, b) density evaluated element-wise on
# the device matrix `x`, using the globals `k`, `a` and `b`. Reducing with
# `dims=1` gives one value per column.
function beta_pdf_d(x::CuArray{T, 2}) where T
    # A scalar one replaces the three 0-dimensional device arrays that
    # `CUDA.ones(T)` allocated on every call; the element-wise values are
    # unchanged and the whole expression fuses into a single broadcast.
    u = one(T)
    return prod(k .* x .^ (a - u) .* (u .- x) .^ (b - u); dims=1)
end

# Host counterpart of `beta_pdf_d`: product over each column of the Beta(a, b)
# density evaluated element-wise on the matrix `x`, using the globals `k`,
# `a` and `b`. Reducing with `dims=1` gives one value per column.
function beta_pdf(x::Array{T, 2}) where T
    # Use a plain scalar one: the previous `CUDA.ones(T)` put device arrays
    # into a broadcast over a host array, which a CPU-only method must not do.
    u = one(T)
    return prod(k .* x .^ (a - u) .* (u .- x) .^ (b - u); dims=1)
end

In [None]:
# Trace, inspect and benchmark `beta_pdf_d` on a random 10×10 device matrix.
@trace beta_pdf_d( CUDA.rand(Float32,10,10) )

In [None]:
# Type-inference report for the same call.
@code_warntype beta_pdf_d( CUDA.rand(Float32,10,10) )

In [None]:
# Benchmark including synchronisation; the random input is generated inside
# the timed expression.
@benchmark CUDA.@sync  beta_pdf_d( CUDA.rand(Float32,10,10) )

In [None]:
# Same benchmark with the `blocking=false` option of `CUDA.@sync`.
# NOTE(review): confirm that the installed CUDA.jl version accepts this option.
@benchmark CUDA.@sync blocking=false beta_pdf_d( CUDA.rand(Float32,10,10) )

In [None]:
# Type of a host random matrix, for reference.
typeof(rand(Float32,10,10))

In [None]:
# Result types for host inputs of several shapes.
# NOTE(review): the visible `beta_pdf_d` method only accepts CuArray input;
# `beta_pdf` is the method declared for `Array` — confirm which was intended.
@show typeof(beta_pdf_d(rand(Float32,10,10)))
@show typeof(beta_pdf_d(rand(Float32,14,8)))
@show typeof(beta_pdf_d(rand(Float32,1,1)))

# Result types for device inputs of the same shapes.
@show typeof(CUDA.rand(Float32,10,10))
@show typeof(beta_pdf_d(CUDA.rand(Float32,10,10)))
@show typeof(beta_pdf_d(CUDA.rand(Float32,14,8)))
@show typeof(beta_pdf_d(CUDA.rand(Float32,1,1)))

In [None]:
# Joint density at `x` of M independent Beta(1, 2) variables, built as a
# `Product` distribution (M is read from the enclosing scope).
function int(x)
    marginals = fill(Beta(1.0, 2.0), M)
    return pdf(Product(marginals), x)
end

# Benchmark both density implementations at the same random point.
# NOTE(review): `yy` is a host vector, which the visible `beta_pdf_d`
# signature (two-dimensional CuArray) does not accept — confirm.
yy=rand(M)
q1 = @benchmark beta_pdf_d(yy)
q2 = @benchmark int(yy)

In [None]:
# Check the type of the normalisation constant.
typeof(k)

In [None]:
# Integrate `int` over the box [z, o] with hcubature and return only the first
# element of the result (the integral estimate).
function f1()
    integral, _ = hcubature(int, z, o; rtol=rtol, atol=atol, maxevals=maxevals, initdiv=1)
    return integral
end

# Integrate `beta_pdf_d` over the host bounds [z, o] with the Float64
# tolerances; returns the full hcubature result.
function f2()
    opts = (rtol=rtol, atol=atol, maxevals=maxevals, initdiv=1)
    return hcubature(beta_pdf_d, z, o; opts...)
end

# Integrate `beta_pdf_d` over freshly allocated Float32 device bounds with the
# Float32 tolerances; returns the full hcubature result.
function f3()
    lower = CUDA.zeros(Float32, M)
    upper = CUDA.ones(Float32, M)
    return hcubature(
        beta_pdf_d, lower, upper;
        rtol=rtolf, atol=atolf, maxevals=maxevals, initdiv=1,
    )
end

# Run the three variants once and check that each integral is close to 1.
sol1 = f1()
sol2 = f2()
sol3 = f3()
@test sol1 ≈ 1.0
@test first(first(sol2)) ≈ 1.0
@test first(first(sol3)) ≈ 1.0f0

In [None]:
# Benchmark the three integration variants and print the trials together.
b1= @benchmark f1()
b2= @benchmark f2()
b3= @benchmark f3()

@show b1,b2,b3

In [None]:
(b1, b2, b3) = (Trial(37.877 μs), Trial(9.625 μs), Trial(115.358 μs))

In [None]:
passing: 
consts: (b1, b2, b3) = (Trial(39.496 μs), Trial(10.695 μs), Trial(86.517 μs))
globals: (b1, b2, b3) = (Trial(68.935 μs), Trial(91.038 μs), Trial(172.503 μs))

In [None]:
BayesianIntegral
QuasiMonteCarlo
PolyChaos
QuadratureRules
MonteCarloIntegration
Surrogates