In [3]:
using Random, Distributions
using CUDA
using BenchmarkTools, Base.Threads
using PyPlot

In [1]:
#Initialize Kernels

#Method 1 of Vd
#A += (1-ϕ)* Vd0
function sumdef1(sumdef,Vd,Vd0,V0,ϕ,β,P)
    #sumdef = CUDA.zeros(Ny)
    A = ϕ* V0[:,1]
    A += (1-ϕ)* Vd0
    A.= ϕ.* V0[:,1] .+ (1-ϕ).* Vd0
    temp = P
    temp .*= CUDA.transpose(A)
    temp .*= β
    #temp = β* P .* CUDA.transpose(A)
    sumdef += reduce(+, temp, dims=2) #This gives Vd
    #Then do a value transport to Vd
    Vd = sumdef
end

#line 7.1 Intitializing U((1-τ)iy) to each Vd[iy]
function def_init(sumdef,τ,Y,α)
    iy = threadIdx().x
    stride = blockDim().x
    for i = iy:stride:length(sumdef)
        sumdef[i] = CUDA.pow(exp((1-τ)*Y[i]),(1-α))/(1-α)
    end
    return
end

#adding expected value to sumdef
function def_add(matrix, P, β, V0, Vd0, ϕ, Ny)
    y = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (iy <= Ny && y <= Ny)
        matrix[iy,y] = β* P[iy,y]* (ϕ* V0[y,1] + (1-ϕ)* Vd0[y])
        #Note memory transfer of matrices of P and Vd0 are not optimal
    end
    return
end

#Method 2 of Vd
function sumdef2(sumdef) #Calculate sumdef in a kernel
    @cuda threads=threadcount blocks=blockcount def_init(sumdef,τ,Y,α)
    temp = CUDA.zeros(Ny,Ny)
    blockcount = (ceil(Int,Ny/10),ceil(Int,Ny/10))
    @cuda threads=threadcount blocks=blockcount def_add(temp, P, β, V0, Vd0, ϕ, Ny)
    sumdef += reduce(+, temp, dims=2)
end

#@benchmark sumdef1(sumdef) #240.6 μs
#@benchmark sumdef2(sumdef)
#----

#Calculate Cost Matrix C
function vr_C(Ny,Nb,Y,B,Price0,P,C)
    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)
        for b in 1:Nb
            C[iy,ib,b] = -Price0[iy,b]*B[b] + CUDA.exp(Y[iy]) + B[ib]
        end
    end
end

#map C -> U(C), then add β*sumret
function vr_C2(Ny,Nb,Vr,V0,Y,B,Price0,P,C,C2,sumret,α)
    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)
        for b in 1:Nb
            if C[iy,ib,b] > 0
                c = C[iy,ib,b]
                C2[iy,ib,b] = CUDA.pow(c,(1-α)) / (1-α) + B[ib] - Price0[iy,b]*B[b] #Note CUDA.pow only support certain types, need to cast constant to Float32 instead of Float64
            end
        end
    end
end

#----
#Calcuate sumret[iy,ib,b]
function vr_sumret(Ny,Nb,V0,P,sumret)
    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)
        for b in 1:Nb
            sumret[iy,ib,b] = 0
            for y in 1:Ny
                sumret[iy,ib,b] += P[iy,b]*V0[y,b]
            end
        end
    end
end


#---
#write into decision function
function decide(Ny,Nb,Vd,Vr,V,decision)

    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)

        if (Vd[iy] < Vr[iy,ib])
            V[iy,ib] = Vr[iy,ib]
            decision[iy,ib] = 0
        else
            V[iy,ib] = Vd[iy]
            decision[iy,ib] = 1
        end
    end
    return
end

function prob_calc(Ny,Nb,prob,P,decision)
    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)
        #prob[iy,ib] = P[iy,:]'decision[:,ib]
        for y in Ny
            prob[iy,ib] += P[iy,y]*decision[y,ib]
        end
    end
    return
end


Price_calc(x, rstar) = (1-x) / (1+rstar)
#@benchmark Price = Price_calc.(prob, rstar)


#line 7.1 Intitializing U((1-τ)iy) to each Vd[iy] #BATCH UPDATE
function def_init_old(sumdef,τ,Y,α)
    iy = threadIdx().x
    stride = blockDim().x
    for i = iy:stride:length(sumdef)
        sumdef[i] = exp((1-τ)*Y[i])/(1-α)
    end
    return
end

#line 7.2 adding second expected part to calcualte Vd[iy]
function def_add_old(matrix, P, β, V0, Vd0, ϕ, Ny)
    y = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (iy <= Ny && y <= Ny)
        matrix[iy,y] = β* P[iy,y]* (ϕ* V0[y,1] + (1-ϕ)* Vd0[y])
    end
    return
end

function vr_old(Nb,Ny,α,β,τ,Vr,V0,Y,B,Price0,P)

    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)

        Max = -Inf
        for b in 1:Nb
            c = Float32(CUDA.exp(Y[iy]) + B[ib] - Price0[iy,b]*B[b])
            if c > 0 #If consumption positive, calculate value of return
                sumret = 0
                for y in 1:Ny
                    sumret += V0[y,b]*P[iy,y]
                end
                Max = CUDA.max(Max, CUDA.pow(c,(1-α))/(1-α) + β * sumret)
            end
        end
        Vr[iy,ib] = Max
    end
    return
end


#line 9-14 debt price update
function Decide_old(Nb,Ny,Vd,Vr,V,decision,decision0,prob,P,Price,rstar)

    ib = (blockIdx().x-1)*blockDim().x + threadIdx().x
    iy = (blockIdx().y-1)*blockDim().y + threadIdx().y

    if (ib <= Nb && iy <= Ny)

        if (Vd[iy] < Vr[iy,ib])
            V[iy,ib] = Vr[iy,ib]
            decision[iy,ib] = 0
        else
            V[iy,ib] = Vd[iy]
            decision[iy,ib] = 1
        end

        for y in 1:Ny
            prob[iy,ib] += P[iy,y] * decision[y,ib]
        end

        Price[iy,ib] = (1-prob[iy,ib]) / (1+rstar)

    end
    return
end


Decide_old (generic function with 1 method)

In [15]:
#Benchmark on old implementation
#Using @benchmark
#add grid_space element N to test on N*N endowment*bond matrix
function bench_old_version()

    Grid_space = [100] #[50 100 150 200 300 400]# 450 500 550 600]
    global BenchResultsMedian = zeros(10,length(Grid_space))

    global iter=1

    for i in Grid_space
        println("round $iter")
        
        #Initialize varaibles
        Ny = i
        Nb = i
        maxInd = Ny * Nb #total grid points
        rstar = Float32(0.017) #r* used in price calculation
        α = Float32(0.5) #α used in utility function

        #lower bound and upper bound for bond initialization
        lbd = -1
        ubd = 0

        #β,ϕ,τ used as in part 4 of original paper
        β = Float32(0.953)
        ϕ = Float32(0.282)
        τ = Float32(0.5)

        δ = Float32(0.8) #weighting average of new and old matrixs

        #ρ,σ For tauchen method
        ρ = Float32(0.9)
        σ = Float32(0.025)


        #Initializing Bond matrix
        minB = lbd
        maxB = ubd
        step = (maxB-minB) / (Nb-1)
        B = CuArray(minB:step:maxB) #Bond

        #Intitializing Endowment matrix
        σ_z = sqrt((σ^2)/(1-ρ^2))
        Step = 10*σ_z/(Ny-1)
        Y = CuArray(-5*σ_z:Step:5*σ_z) #Endowment

        Pcpu = zeros(Ny,Ny)  #Conditional probability matrix
        V = CUDA.fill(1/((1-β)*(1-α)), Ny, Nb) #Value
        Price = CUDA.fill(1/(1+rstar), Ny, Nb) #Debt price
        Vr = CUDA.zeros(Ny, Nb) #Value of good standing
        Vd = CUDA.zeros(Ny) #Value of default
        C = CUDA.zeros(Ny,Nb,Nb)
        VR = CUDA.zeros(Ny,Nb,Nb)
        sumret = CUDA.zeros(Ny,Nb,Nb)
        V0 = CUDA.deepcopy(V)
        Vd0 = CUDA.deepcopy(Vd)
        Price0 = CUDA.deepcopy(Price)
        prob = CUDA.zeros(Ny,Nb)
        decision = CUDA.ones(Ny,Nb)
        decision0 = CUDA.deepcopy(decision)
        sumdef = CUDA.zeros(Ny)
        C2 = CUDA.zeros(Ny,Nb,Nb)
        global vr
        elem = 1
        global temp = CUDA.zeros(Ny,Ny)
        tauchen(ρ, σ, Ny, Pcpu)
        P = CuArray(Pcpu)

        global threadcount = (16,16) #set up defualt thread numbers per block
        global blockcount = (ceil(Int,Ny/10),ceil(Int,Ny/10))

        println("begin benchmark")
    
        
        #Part 1, get total time for value of default calculation, t0+t1+t2+t3
        t0 = @benchmark @cuda threads=50 def_init_old(sumdef,τ,Y,α)
        BenchResultsMedian[elem,iter] = time(median(t0))
        elem += 1
        
        t1 = @benchmark @cuda threads=threadcount blocks=blockcount def_add_old(temp, P, β, V0, Vd0, ϕ, Ny)
        BenchResultsMedian[elem,iter] = time(median(t1))
        elem += 1
        
        t2 = @benchmark temp2 = sum(temp,dims=2)
        global temp2 = sum(temp,dims=2)
        t3 = @benchmark sumdef2 = sumdef + temp2
        BenchResultsMedian[elem,iter] = time(median(t0)) + time(median(t1)) + time(median(t2)) + time(median(t3))
        elem += 1

        #Part 2, get total time for value of repayment calculation
        t = @benchmark @cuda threads=threadcount blocks=blockcount vr_old(Nb,Ny,α,β,τ,Vr,V0,Y,B,Price0,P)
        BenchResultsMedian[elem,iter] = time(median(t))
        elem += 1
        
        #Part 3, get total time for decision calculation
        t = @benchmark @cuda threads=threadcount blocks=blockcount Decide_old(Nb,Ny,Vd,Vr,V,decision,decision0,prob,P,Price,rstar)
        BenchResultsMedian[elem,iter] = time(median(t))
        elem += 1

        println("iter $iter over")
        iter+=1
        display(BenchResultsMedian)

    end

end

bench_old_version (generic function with 1 method)

In [16]:
bench_old_version()

round 1
begin benchmark


UndefVarError: UndefVarError: temp not defined

In [5]:
const plt = PyPlot;