In [1]:
using QuantEcon, CairoMakie, CUDA

In [2]:
CUDA.name(CUDA.device())

"NVIDIA GeForce RTX 3080 Ti Laptop GPU"

In [50]:
max_threads_per_block = CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)

1024

In [58]:
function params(;nk = 101)
    p = (α = 0.33, # Capital share
         β = 0.99, # Discount factor
         A = 10.0, # TFP
         δ = 0.1, # Depreciation rate
         kmin = 1e-4, # Minimum capital
         kmax = 100, # Maximum capital
         θ = 3.0, # Grid expansion parameter
         γ = 2.0, # Risk aversion
         kgrid = zeros(Float32, nk), # Capital grid
         maxiter = 100000, # Maximum number of iterations
         tol = 1e-6) # Tolerance

    # Create capital grid
    kgrid = LinRange(p.kmin, p.kmax, nk)
    temp_grid = LinRange(0, 1, nk)
    kgrid = p.kmin .+ (p.kmax - p.kmin) * temp_grid.^p.θ
    
    #Return new NamedTuple
    p = (p..., kgrid = kgrid)
    
    return p
end

params (generic function with 1 method)

In [59]:
function gpu_test(grid)
    return grid .+ 1 
end

p = params(nk = 101)
kgrid_d = CuArray(p.kgrid)
@time gpu_test(kgrid_d)

101-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
   1.0001
   1.0001999999
   1.0008999992
   1.0027999973
   1.0064999936
   1.0125999875
   1.0216999784
   1.0343999657
   1.0512999488
   1.0729999271
   ⋮
  78.8688221312
  81.43571956430002
  84.05841694159999
  86.73751426249999
  89.47361152639999
  92.2673087327
  95.1192058808
  98.0299029701
 101.0

In [65]:
function gpu_unpack(p::NamedTuple)

    # Create CuArrays
    V_d = CuArray{Float64}(zeros(length(p.kgrid)))
    Vnew_d = CuArray{Float64}(zeros(length(p.kgrid),length(p.kgrid)))
    Vmax_d = CuArray{Float64}(zeros(length(p.kgrid)))
    pol_d = CuArray{Int64}(undef, length(p.kgrid))

    # Get params
    maxiter = p.maxiter
    tol = p.tol
    γ = p.γ
    β = p.β
    nk = length(p.kgrid)
    
    # Initialise consumption
    cons_d = p.A .* kgrid_d.^p.α .+ (1 - p.δ) .* kgrid_d .- kgrid_d'
    pos_c_d = cons_d .> 0
    cons_d .= cons_d .* pos_c_d

    # Run VFI
    gpu_vfi!(V_d, Vnew_d, Vmax_d, cons_d, pol_d, kgrid_d; maxiter = maxiter, tol = tol, γ = γ, β = β, nk = nk)

    return V_d, pol_d
end

gpu_unpack (generic function with 1 method)

In [98]:
function gpu_vfi!(V, Vnew, Vmax, cons, pol, grid; maxiter, tol, γ, β, nk)
    error = tol + 1.0
    iter = 0
    while ((error > tol) && (iter < maxiter))
        Vnew = (γ == 1.0 ? log.(cons) : (cons.^(1 - γ) .- 1) ./ (1 - γ)) .+ β .* V
        @show Vnew
        Vnew = reshape(Vnew, nk*nk)
        block_size = 1024
        num_blocks = ceil(Int, nk / block_size)
        @cuda threads=block_size blocks=num_blocks max_kernel!(Vnew, Vmax, pol, nk)
        error = maximum(abs.(Vmax .- V))
        @show error
        iter = maxiter
    end
    return V, pol
end

gpu_vfi! (generic function with 1 method)

In [99]:
function max_kernel!(Vnew, Vmax, pol, nk)
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if idx <= nk
        row_start = (idx - 1) * nk
        max_val = Vnew[row_start + 1]
        max_pos = 1
        for j in 2:nk
            if Vnew[row_start + j] > max_val
                max_val = Vnew[row_start + j]
                max_pos = j
            end
        end
        Vmax[idx] = max_val
        pol[idx] = max_pos
    end
    return
end

max_kernel! (generic function with 1 method)

In [100]:
gpu_unpack(p)

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [101, 101, 101, 101, 101, 101, 101, 101, 101, 101  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1])