In [9]:
using SIMD

In [43]:
using BenchmarkTools
using LoopVectorization

n_clusters = 32
n_examples = 1_000_000
n_features = 128

PQ = Int8.(rand(1:n_clusters, n_features, n_examples))   # PQcodes
y =  Int8.(rand(1:n_clusters, n_features));


@inline function abs_dist(y::Array{T}, X::Array{T}, j) where T
    res = Int32(0)
    @inbounds @simd  for k in eachindex(y)
        res += abs(X[k, j] - y[k])
    end
    return res
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Int32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)    
    end
    return distances
end

PQ = Int8.(rand(1:n_clusters, n_features, n_examples))   # PQcodes
y =  Int8.(rand(1:n_clusters, n_features));
PQcols = [Array(x) for x in eachcol(PQ)];

@benchmark linear_scann_exact(abs_dist, y, PQ)

BenchmarkTools.Trial: 374 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m10.479 ms[22m[39m … [35m46.009 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 68.15%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m13.209 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m13.358 ms[22m[39m ± [32m 2.984 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m1.80% ±  6.13%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▇[39m▇[39m▅[39m▃[39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m▆[39m▁[34m▂[39m[39m▄[32m▅[39m[39m▂[39m▃[39m▂[39m█[39m [39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▄[39m▃[39m▁[39m▅[39m▃[39m▆[3

In [44]:
PQ = Int8.(rand(1:n_clusters, n_features, n_examples))   # PQcodes
y =  Int8.(rand(1:n_clusters, n_features));
PQcols = [Array(x) for x in eachcol(PQ)];

In [45]:
x_vec = vload(Vec{128,Int8}, PQcols[1], 1);
y_vec = vload(Vec{128,Int8}, y, 1)

@show abs(x_vec - y_vec)
@show sum(abs(x_vec - y_vec))

abs(x_vec - y_vec) = <128 x Int8>[3, 6, 6, 13, 11, 0, 5, 1, 15, 1, 4, 16, 0, 2, 8, 18, 2, 7, 15, 19, 5, 22, 2, 1, 16, 3, 7, 0, 14, 8, 17, 17, 5, 7, 16, 16, 11, 2, 4, 20, 15, 19, 2, 2, 2, 11, 10, 14, 7, 10, 6, 5, 8, 9, 13, 16, 3, 7, 14, 11, 15, 5, 22, 10, 10, 13, 13, 9, 6, 24, 3, 4, 22, 5, 25, 7, 12, 5, 17, 4, 20, 12, 8, 18, 9, 14, 4, 8, 25, 11, 5, 4, 8, 23, 9, 2, 4, 4, 22, 10, 26, 3, 11, 27, 1, 12, 17, 16, 20, 8, 2, 3, 4, 25, 26, 0, 7, 14, 25, 13, 3, 23, 1, 9, 2, 4, 7, 8]
sum(abs(x_vec - y_vec)) = 17


17

In [49]:
PQ_16 = Int16.(rand(1:n_clusters, n_features, n_examples))   # PQcodes
y_16 =  Int16.(rand(1:n_clusters, n_features));
PQcols_16 = [Array(x) for x in eachcol(PQ_16)];

In [50]:
x_vec = vload(Vec{128,Int16}, PQcols_16[1], 1);
y_vec = vload(Vec{128,Int16}, y, 1)

@show abs(x_vec - y_vec)
@show sum(abs(x_vec - y_vec))

abs(x_vec - y_vec) = <128 x Int16>[14, 9, 13, 3, 22, 18, 11, 1, 12, 0, 0, 10, 4, 18, 17, 19, 9, 3, 26, 1, 3, 4, 1, 5, 26, 9, 6, 1, 10, 20, 9, 11, 19, 26, 3, 6, 5, 19, 2, 10, 11, 7, 22, 19, 13, 3, 4, 4, 4, 9, 17, 5, 1, 1, 0, 21, 2, 1, 23, 26, 0, 7, 14, 13, 0, 11, 13, 1, 10, 21, 2, 6, 3, 18, 8, 23, 7, 11, 6, 4, 8, 6, 12, 6, 4, 3, 19, 21, 20, 19, 10, 4, 17, 14, 17, 24, 12, 6, 8, 1, 10, 3, 5, 1, 9, 4, 1, 7, 20, 4, 28, 0, 21, 11, 30, 15, 9, 5, 20, 10, 5, 20, 12, 18, 9, 5, 8, 1]
sum(abs(x_vec - y_vec)) = 1298


1298

In [51]:

@inline function abs_dist_vec(y, X, j) where T

    x_vec = vload(Vec{128,Int16}, X[j], 1);
    return sum(abs(x_vec - y_vec))
end

abs_dist_vec (generic function with 1 method)

In [54]:
y_vec = vload(Vec{128,Int16}, y, 1)

@benchmark abs_dist_vec(y_vec, PQcols_16, 1)

BenchmarkTools.Trial: 10000 samples with 939 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 99.111 ns[22m[39m … [35m129.707 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 99.84%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m152.240 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m268.637 ns[22m[39m ± [32m  3.397 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m37.79% ±  2.99%

  [39m▄[39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m█[34m▇[39m[39m▄[39m▄[39m▆[39m▃[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m▂
  [39m█[39m█

In [66]:
@benchmark abs_dist($y, $PQ, 2) 

BenchmarkTools.Trial: 10000 samples with 999 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 9.571 ns[22m[39m … [35m171.335 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m 9.651 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m10.195 ns[22m[39m ± [32m  3.516 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▆[39m█[34m▄[39m[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m▃[39m▇[39m▅[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[34m█[39m[39

In [62]:
@show abs_dist(y, PQ, 1) 
@show abs_dist_vec(y_vec, PQcols_16, 1)

abs_dist(y, PQ, 1) = 1324
abs_dist_vec(y_vec, PQcols_16, 1) = 1298


1298

In [None]:
@inline function abs_dist(y::Array{T}, X::Array{T}, j) where T
    res = Int32(0)
    @inbounds @simd  for k in eachindex(y)
        res += abs(X[k, j] - y[k])
    end
    return res
end



## Problem: vgather does not work in int32

In [3]:
reals = rand(32);
idx = rand(1:32, 1000);
idx_slice_int64 = vload(Vec{4,Int64}, idx, 1)
vgather(reals, idx_slice_int64)

<4 x Float64>[0.8823653383057072, 0.8665864500605303, 0.2618289858895918, 0.20873656011552566]

In [8]:
reals = rand(32);
idx = Int32.(rand(1:32, 1000));
idx_slice_int32 = vload(Vec{4,Int32}, idx, 1)
# This does not work
#vgather(reals, idx_slice_int32)

<4 x Int32>[23, 19, 22, 25]

In [5]:
using BenchmarkTools
using LoopVectorization

n_clusters = 32
n_examples = 1_000_000
n_features = 128

T = Float32.(rand(n_clusters, n_features));                   # ADC table
PQ = Int8.(rand(1:n_clusters, n_features, n_examples))    # PQcodes
y = Float32.(rand(n_features));

function lsh(PQ, T)
    
    n_features, n_examples = size(PQ)
    d = zeros(eltype(T), n_examples)
    
    @inbounds @fastmath for n in 1:n_examples
        res = zero(eltype(T))

        for j in 1:n_features
            res += T[PQ[j,n],j]    
        end
        d[n] = res
    end
    return d
end



lsh (generic function with 1 method)

In [6]:
@benchmark lsh($PQ, $T)

BenchmarkTools.Trial: 107 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m40.741 ms[22m[39m … [35m57.702 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 12.99%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m47.008 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m46.982 ms[22m[39m ± [32m 2.794 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.56% ±  2.41%

  [39m [39m [39m [39m [39m [39m▃[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m▆[39m▁[39m [39m [39m▆[39m▆[39m▁[34m▆[39m[39m▁[39m▃[39m█[39m [39m [39m▆[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▄[39m▁[39m▄[39m▄[39m▁[39m█[39m▁[

In [17]:

n_clusters = 32
n_examples = 1_000_000
n_features = 128

T = Float32.(rand(n_clusters, n_features));                   # ADC table
PQ = Int8.(rand(1:n_clusters, n_features, n_examples))    # PQcodes
y = Float32.(rand(n_features));

function lsh(PQ, T)
    
    n_features, n_examples = size(PQ)
    d = zeros(eltype(T), n_examples)
    
    @inbounds @fastmath for n in 1:n_examples
        res = zero(eltype(T))

        for j in 1:n_features
            res += T[PQ[j,n],j]    
        end
        d[n] = res
    end
    return d
end






lsh (generic function with 1 method)

In [18]:
T

32×128 Matrix{Float32}:
 0.261508   0.119709   0.494325    …  0.199487   0.504201   0.0128225
 0.878577   0.155021   0.672723       0.833461   0.0218257  0.346586
 0.595066   0.143906   0.554177       0.216249   0.219969   0.385533
 0.635224   0.937321   0.607402       0.688434   0.444384   0.53211
 0.968566   0.0841903  0.70269        0.719249   0.951361   0.0479293
 0.273914   0.109733   0.366922    …  0.130734   0.96038    0.906284
 0.173745   0.62694    0.0451187      0.400976   0.417098   0.158388
 0.147494   0.762203   0.14395        0.192883   0.889719   0.257744
 0.276965   0.922209   0.521877       0.539143   0.619699   0.268971
 0.377723   0.951114   0.675585       0.585497   0.384385   0.861055
 0.650519   0.300227   0.0162584   …  0.521274   0.0160092  0.704236
 0.459765   0.153149   0.587107       0.0908395  0.983381   0.790655
 0.225926   0.281678   0.412811       0.460725   0.545153   0.880115
 ⋮                                 ⋱  ⋮                     
 0.282486   0.901

In [16]:
@benchmark lsh($PQ, $T)

LoadError: UndefVarError: j not defined

In [None]:
using BenchmarkTools
using LoopVectorization

n_clusters = 32
n_examples = 1_000_000
n_features = 128

T = Float32.(rand(n_clusters, n_features));                   # ADC table
T_int = Int8.(round.(T *100));
PQ = Int8.(rand(1:n_clusters, n_features, n_examples))    # PQcodes
y = Float32.(rand(n_features));

function lsh_int(PQ, T)
    
    n_features, n_examples = size(PQ)
    d = zeros(eltype(T), n_examples)
    
    @inbounds @fastmath for n in 1:n_examples
        res = zero(eltype(T))

        for j in 1:n_features
            res += T[PQ[j,n],j]    
        end
        d[n] = res
    end
    return d
end



In [None]:
@benchmark lsh_int($PQ, $T_int)

In [None]:
using BenchmarkTools
using LoopVectorization

n_clusters = 32
n_examples = 1_000_000
n_features = 128

T = Float32.(rand(n_clusters, n_features));                   # ADC table
PQ = Int8.(rand(1:n_clusters, n_features, n_examples))    # PQcodes
y = Float32.(rand(n_features));

function lsh_unrolled(PQ, T)
    
    n_features, n_examples = size(PQ)
    d = zeros(eltype(T), n_examples)
    
    @inbounds @fastmath for n in 1:n_examples
        res1 = zero(eltype(T))
        res2 = zero(eltype(T))
        res3 = zero(eltype(T))
        res4 = zero(eltype(T))
    
        for j in 1:4:n_features
            res1 += T[PQ[j,n],j] 
            res2 += T[PQ[j+1,n],j+1] 
            res3 += T[PQ[j+2,n],j+2] 
            res4 += T[PQ[j+3,n],j+3] 
            
        end
        d[n] = res1 + res2 + res3 + res4
    end
    return d
end

In [None]:
@benchmark lsh_unrolled($PQ, $T)

In [None]:
T_int = UInt8.(round.(T *100));

PQ_cols = [Vector(col) for col in eachrow(PQ)];
T_cols_int = [Vector(col) for col in eachcol(T_int)];
typemax(UInt16) > maximum(T *100)

In [None]:
function process_N_pqcodes(j, PQ_cols64, T_cols)
    acc = Vec{8, UInt8}((0,0,0,0,0,0,0,0))
    @inbounds for j in 1:128
        pq_j_idx_vec = vload(Vec{8,Int64}, PQ_cols64[j], 1)
        acc .=  acc + vgather(T_cols[j], pq_j_idx_vec)
        #println(pq_j_idx_vec)
    end
    return acc
end

In [None]:
T_cols[1][[1,2,3]]

In [None]:

println(process_N_pqcodes(1, PQ_cols, T_cols_int) )
#println(lsh(PQ_16, T))

In [None]:
@benchmark process_N_pqcodes(1, PQ_cols, T_cols_int)