In [7]:
using BenchmarkTools
using LoopVectorization

# Problem size used by every benchmark in this notebook.
n_clusters = 32
n_examples = 1_000_000
n_features = 128

# T[c, j] is the value the kernels below look up for code `c` at feature position `j`.
T = rand(n_clusters, n_features);                   # ADC table
# One column of integer codes (each in 1:n_clusters) per example.
PQ = rand(1:n_clusters, n_features, n_examples)    # PQcodes
# Materialized transpose of PQ: one row per example, one column per feature.
PQ_trans =  Matrix(PQ');
# Random vector of length n_features; not referenced by the visible cells.
y = rand(n_features);


"""
    lsh_transposed(PQ_trans, T)

Return a vector with one entry per row of `PQ_trans`, where entry `n` is the sum over
columns `j` of `T[PQ_trans[n, j], j]`.

The outer loop runs over columns (features) and the inner loop over rows (examples), so
`PQ_trans` is traversed in column-major order. The result has element type `eltype(T)`.
Bounds checks are disabled: every code in `PQ_trans` must be a valid row index of `T`.
"""
function lsh_transposed(PQ_trans, T)
    (nrows, ncols) = size(PQ_trans)
    acc = zeros(eltype(T), nrows)
    @inbounds @fastmath for col in 1:ncols
        for row in 1:nrows
            acc[row] += T[PQ_trans[row, col], col]
        end
    end
    return acc
end


"""
    lsh(PQ, T)

Return a vector with one entry per column of `PQ`, where entry `n` is the sum over rows
`j` of `T[PQ[j, n], j]`.

Each column (example) is reduced into a scalar accumulator before being stored, so `PQ`
is read in column-major order. The result has element type `eltype(T)`. Bounds checks
are disabled: every code in `PQ` must be a valid row index of `T`.
"""
function lsh(PQ, T)
    (nfeat, nex) = size(PQ)
    S = eltype(T)
    out = zeros(S, nex)

    @inbounds for ex in eachindex(out)
        acc = zero(S)
        @fastmath for f in 1:nfeat
            acc += T[PQ[f, ex], f]
        end
        out[ex] = acc
    end
    return out
end

# Run both kernels once on the same codes, stored in the two memory layouts.
res_lsh_transposed = lsh_transposed(PQ_trans, T);
res_lsh = lsh(PQ, T);

# The two layouts must agree up to floating-point tolerance before timing anything.
@assert isapprox(res_lsh_transposed, res_lsh)
# `$` interpolates the globals so the benchmark does not measure global-variable access.
@benchmark lsh($PQ, $T)


BenchmarkTools.Trial: 117 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m42.106 ms[22m[39m … [35m 44.896 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m42.641 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m42.731 ms[22m[39m ± [32m285.077 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▄[39m [39m█[39m█[39m▁[34m [39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▁[39m▁[39m▁[39m▁[

In [8]:
@benchmark lsh_transposed($PQ_trans, $T)

BenchmarkTools.Trial: 64 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m77.849 ms[22m[39m … [35m 81.274 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m79.327 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m79.302 ms[22m[39m ± [32m465.395 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m [39m▁[39m [34m█[39m[39m█[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▄[39m▁[39m▁[39m▁[39m▁[39m▄[

## Other tests

In [140]:
using StaticArrays

# Rebinds the global `PQ`: the codes become a Vector holding one
# SVector{n_features,UInt8} per example, with values drawn from 1:n_clusters.
# NOTE(review): after this cell `size(PQ)` has a single entry, so functions that
# destructure it into two values (`lsh`, `lsh_dict`) no longer accept `PQ` — confirm intent.
PQ = [SVector{n_features,UInt8}(rand(1:n_clusters, n_features)) for i in 1:n_examples]

"""
    lsh_static(PQ, T)

Return a vector with one entry per element of `PQ`, where entry `n` is the sum over `j`
of `T[PQ[n][j], j]`.

`PQ` is an indexable collection of per-example code vectors (for instance a `Vector` of
`SVector`s). The code length is taken from `PQ[1]`; every element is assumed to have
that same length. An empty `PQ` yields an empty `Vector{eltype(T)}`. Bounds checks are
disabled in the loops, so every code must be a valid row index of `T`.
"""
function lsh_static(PQ, T)
    S = eltype(T)
    n_examples = length(PQ)
    d = Array{S}(undef, n_examples)
    # Nothing to accumulate, and `PQ[1]` would throw a BoundsError on an empty collection.
    n_examples == 0 && return d

    n_features = length(PQ[1])
    @inbounds for n in 1:n_examples
        res = zero(S)
        p = PQ[n]
        @simd for j in 1:n_features
            res += T[p[j], j]
        end
        d[n] = res
    end
    return d
end

lsh_static (generic function with 1 method)

In [141]:
@benchmark lsh_static($PQ, $T)

BenchmarkTools.Trial: 113 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m43.712 ms[22m[39m … [35m 46.491 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m44.183 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m44.406 ms[22m[39m ± [32m520.554 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m▃[39m█[39m [34m [39m[39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▄[39m▁[39m▁[39m▁[39m▁[

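Next experiment: store the table `T` in a `Dict` keyed by `(cluster, feature)` and look values up there instead of indexing the matrix.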

In [128]:
# Lookup table keyed by (cluster, feature) holding the same values as the matrix `T`.
# The key and value types are spelled out so the Dict is concretely typed: a bare
# `Dict()` is a `Dict{Any,Any}`, which boxes every value and makes lookups type-unstable.
T_dict = Dict{Tuple{Int,Int},eltype(T)}()
# The final number of entries is known, so reserve space once instead of rehashing.
sizehint!(T_dict, n_clusters * n_features)

for n in 1:n_clusters
    for j in 1:n_features
        T_dict[(n, j)] = T[n, j]
    end
end

In [131]:

"""
    lsh_dict(PQ, T_dict::Dict)

Return a vector with one entry per column of `PQ`, where entry `n` is the sum over rows
`j` of `T_dict[(PQ[j, n], j)]`.

`PQ` is a two-dimensional array of codes and `T_dict` maps `(code, feature)` tuples to
values. The element type of the result is derived from `T_dict` alone: from its declared
value type when that is concrete, otherwise from the entry stored under `(1, 1)`.
Throws `KeyError` if a required `(code, feature)` pair is missing from `T_dict`.
"""
function lsh_dict(PQ, T_dict::Dict)

    n_features, n_examples = size(PQ)
    # Take the accumulator type from the argument rather than from a global table, so the
    # function works on its own and the accumulator matches the values being summed.
    V = valtype(T_dict)
    S = isconcretetype(V) ? eltype(V) : eltype(T_dict[1, 1])
    d = zeros(S, n_examples)

    @inbounds @fastmath for n in 1:n_examples
        res = zero(S)
        for j in 1:n_features
            res += T_dict[PQ[j, n], j]
        end
        d[n] = res
    end
    return d
end

lsh_dict (generic function with 1 method)

In [None]:
# `lsh_dict` destructures `size(...)` into (n_features, n_examples), so it needs the codes
# as a matrix. The global `PQ` is rebound to a Vector of SVectors by the StaticArrays
# cell, which would make that destructuring fail; `PQ_trans'` is the lazy adjoint of the
# n_examples × n_features matrix built from the original codes and has the needed layout.
res_lsh_dict = lsh_dict(PQ_trans', T_dict)