# Setup

In [1]:
import Pkg

In [2]:
using CUDA
using Distributions
using BenchmarkTools

# Create graph

In [3]:
N = 16

# graph = zeros(2^N)
graph = rand(Uniform(-5,5),N, N)
graph = graph * graph'

qubo = zeros(N,N)
constant = 0

for i in 1:N
    qubo[i,i] = 2 * graph[i,i]
    constant += graph[i,i]
    
    for j in 1:N
        if i!=j
            low, high = sort!([i, j])
            constant -= graph[low,high]*0.5
            qubo[i,i] -= 2*graph[low,high]
            qubo[low,high] += graph[low,high]*2
        end
    end
end


cu_graph = graph |> cu 
cu_qubo = qubo |> cu 

16×16 CuArray{Float32, 2}:
 437.291  -153.178   67.6214     7.36081  …   -43.0893   -33.5755   110.051
   0.0     324.474   44.3128  -109.389        -56.6543   -11.5727  -165.295
   0.0       0.0    -31.4193    -5.50019      -33.946    -57.6708   115.111
   0.0       0.0      0.0      290.238        108.296     23.2164   -12.7123
   0.0       0.0      0.0        0.0          182.239      6.0594   -92.7182
   0.0       0.0      0.0        0.0      …  -122.536    -96.7812   -18.5453
   0.0       0.0      0.0        0.0           19.2072   220.204     76.8722
   0.0       0.0      0.0        0.0           43.9545    66.9508   -14.5232
   0.0       0.0      0.0        0.0         -183.674    -22.5728    15.7184
   0.0       0.0      0.0        0.0          165.841     64.7047    -8.24424
   0.0       0.0      0.0        0.0      …  -206.106   -157.78     -92.4672
   0.0       0.0      0.0        0.0          -14.6739  -202.072     53.0884
   0.0       0.0      0.0        0.0          -57.5

# Brute forece

In [4]:
function energy(graph, state_code)
    F = 0
    N = size(graph)[1]
    q = digits(state_code, base=2, pad=N) #|> reverse

    for i in 1:N
        F -= graph[i,i]*q[i]  
        for j in 1:N
            low, high = sort!([i, j])
            F -= graph[low,high]*q[i]*q[j]
        end
    end
    return F
end

@btime begin
    res = zeros(2^N)

    for k in 1:2^N
        F = energy(qubo, k)
        res[k] = F
    end

    sort!(res)
end

  844.075 ms (17103879 allocations: 1.52 GiB)


65536-element Vector{Float64}:
 -9016.135356216362
 -8961.867715646391
 -8907.711229468185
 -8893.618220835628
 -8886.340112942977
 -8882.983865296797
 -8825.551337531875
 -8770.563401209605
 -8769.833410807401
 -8755.147452825902
 -8731.853599300772
 -8699.991979226768
 -8685.128969104973
     ⋮
  -575.5108830493963
  -563.888666218201
  -563.8456838554023
  -536.2803312779048
  -506.63636568895583
  -452.9298969851137
  -421.91484546074537
  -386.32488460640445
  -368.89648537844505
  -255.5943536095814
     0.0
    62.8386052479541

# Brute-force GPU

In [5]:
function _energy(state_code, graph)
    F = 0
    N = size(graph)[1]
    
    sc1 = state_code    
    for i in 1:N
        qi = sc1%2
        sc1 = div(sc1,2)
        
        F -= graph[i,i]*qi  
        
        sc2 = state_code
        for j in 1:N
            qj = sc2%2
            sc2 = div(sc2,2)
                       
            low, high = i < j ? (i, j) : (j, i) 
            F -= graph[low,high]*qi*qj
        end
    end
    return F
end

function kernel(qubo, energies)
    threadsPerBlock = blockDim().x

    N = size(qubo)[1]

    i = blockIdx().x
    j = threadIdx().x

    state_code = (i - 1) * blockDim().x + j 

    F = _energy(state_code, qubo) |> Float32
    
    energies[state_code] = F
          
    return
end

function main()
    k = 2
    
    energies = CUDA.zeros(2^N) #CUDA.zeros(2*2^k)
    
    threadsPerBlock::Int64 = 2^k
    blocksPerGrid::Int64 = 2^(N-k)

    @cuda blocks=(blocksPerGrid) threads=(threadsPerBlock) kernel(cu_qubo, energies)
   
#     states = sortperm(energies)
#     energies[states]
    
    sort!(energies)
    
end

@btime begin
    main()
end

main()

  13.349 μs (81 allocations: 2.86 KiB)


65536-element CuArray{Float32, 1}:
 -9016.139
 -8961.868
 -8907.712
 -8893.62
 -8886.341
 -8882.984
 -8825.553
 -8770.564
 -8769.834
 -8755.148
 -8731.855
 -8699.993
 -8685.13
     ⋮
  -575.51086
  -563.8887
  -563.8457
  -536.28033
  -506.63635
  -452.9299
  -421.9149
  -386.3249
  -368.89648
  -255.59436
     0.0
    62.838604

In [7]:
function _energy(state_code, graph)
    F = 0
    N = size(graph)[1]
    
    sc1 = state_code    
    for i in 1:N
        qi = sc1%2
        sc1 = div(sc1,2)
        
        F -= graph[i,i]*qi  
        
        sc2 = state_code
        for j in 1:N
            qj = sc2%2
            sc2 = div(sc2,2)
                       
            low, high = i < j ? (i, j) : (j, i) 
            F -= graph[low,high]*qi*qj
        end
    end
    return F
end

function kernel2(qubo, energies, part_lst, part_st)
    threadsPerBlock = blockDim().x

    N = size(qubo)[1]

    i = blockIdx().x
    j = threadIdx().x

    state_code = (i - 1) * blockDim().x + j 

    F = _energy(state_code, qubo) |> Float32
    
    energies[state_code] = F
    
    sync_threads()
    
    if j == 1
        k = (i - 1) * blockDim().x + 1
        n = blockDim().x
        nr_of_block = gridDim().x
        
#         for ii in k:k+n-1
#             value = low_en[ii+1]
#             jj = ii
#             while jj > 0 && low_en[jj] > value
#                 low_en[jj+1] = low_en[jj]
#                 jj -= 1
#             end
#             low_en[i] = value
#         end 
        
        value=energies[k]
        st=k
        for ii in k:k+n-1
            if value > energies[ii]
                value = energies[ii]
                st=k
            end
        end 
               
        sync_threads()
        
        part_lst[i] = value # low_en[k]
        part_st[i] = st
    end
    
    return
end

function main()
    k = 2
    
    energies = CUDA.zeros(2^N) #CUDA.zeros(2*2^k)
    
    part_st = CUDA.zeros(2^(N-k))
    part_lst = CUDA.zeros(2^(N-k))
    
    threadsPerBlock::Int64 = 2^k
    blocksPerGrid::Int64 = 2^(N-k)

    @cuda blocks=(blocksPerGrid) threads=(threadsPerBlock) kernel2(cu_qubo, energies, part_lst, part_st)
   
#     states = sortperm(energies)
#     energies[states]
    
    sort!(part_lst)
    
end

@btime begin
    main()
end

  25.359 μs (107 allocations: 3.59 KiB)


16384-element CuArray{Float32, 1}:
 -9016.139
 -8961.868
 -8907.712
 -8893.62
 -8886.341
 -8770.564
 -8769.834
 -8731.855
 -8699.993
 -8668.459
 -8628.413
 -8591.251
 -8564.142
     ⋮
 -1743.6213
 -1733.8799
 -1638.6465
 -1620.257
 -1593.592
 -1525.581
 -1497.1493
 -1475.5824
 -1413.394
 -1404.0112
 -1378.2034
 -1217.1733