# Read SIFT 1 million 

In [1]:
using HDF5
using BenchmarkTools
using Distances
using LoopVectorization
using SIMD
using Clustering
using ProgressMeter
using StaticArrays

In [2]:
Sys.cpu_info()[1].model

"Intel(R) Core(TM) i5-1038NG7 CPU @ 2.00GHz"

In [3]:
path = joinpath(homedir(), "Datasets", "SIFT1M",
                "sift-128-euclidean.hdf5")

"/Users/davidbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5"

In [4]:
f = h5open(path, "r")

🗂️ HDF5.File: (read-only) /Users/davidbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5
├─ 🏷️ distance
├─ 🔢 distances
├─ 🔢 neighbors
├─ 🔢 test
└─ 🔢 train

In [5]:
X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
neighbors = read(f["neighbors"])
distances = read(f["distances"])

@show size(X_tr_vecs)
@show size(X_te_vecs)
@show size(neighbors)
@show size(distances)

size(X_tr_vecs) = (128, 1000000)
size(X_te_vecs) = (128, 10000)
size(neighbors) = (100, 10000)
size(distances) = (100, 10000)


(100, 10000)

## Clustering.jl

Create the pqcodes for the training points

In [6]:
n_features = size(X_tr_vecs,1)
n_features

128

In [7]:
unique_values = sort(unique(X_tr_vecs))

213-element Vector{Float32}:
   0.0
   1.0
   2.0
   3.0
   4.0
   5.0
   6.0
   7.0
   8.0
   9.0
  10.0
  11.0
  12.0
   ⋮
 202.0
 204.0
 205.0
 206.0
 207.0
 208.0
 210.0
 211.0
 212.0
 214.0
 216.0
 218.0

In [8]:
n_clusters = 16
R_per_feature = []
@showprogress for j in  1:n_features
    R = kmeans(X_tr_vecs[[j],:], n_clusters; maxiter=200)
    push!(R_per_feature, R)
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:41[39m


In [9]:
P = hcat([r.centers[:] for r in R_per_feature]...);
println(P[:,1])
println(size(P))

Float32[111.930756, 12.4460125, 65.22103, 3.7663112, 136.06526, 31.82116, 88.83238, 53.24114, 23.867374, 123.2678, 8.419607, 99.81901, 0.22927776, 17.379864, 77.30371, 41.767414]
(16, 128)


In [10]:
P = hcat([r.centers[:] for r in R_per_feature]...);
println(P[:,1])
println(size(P))

Float32[111.930756, 12.4460125, 65.22103, 3.7663112, 136.06526, 31.82116, 88.83238, 53.24114, 23.867374, 123.2678, 8.419607, 99.81901, 0.22927776, 17.379864, 77.30371, 41.767414]
(16, 128)


In [11]:
@benchmark P = hcat([r.centers[:] for r in R_per_feature]...)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m11.193 μs[22m[39m … [35m94.351 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m12.668 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m13.548 μs[22m[39m ± [32m 3.716 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m▅[39m█[39m█[39m▆[39m▅[34m▂[39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▆[39m█[39m█[39m█[39m█[3

Another way to construct the matrix would be

In [12]:
function build_prototype_matrix(R, n_clusters, n_features)
    
    prototypes = Array{Float32}(undef, n_clusters, n_features);
    for j in 1:n_features
        prototypes[:,j] .= vec(R_per_feature[j].centers)
    end
    return prototypes
end

build_prototype_matrix (generic function with 1 method)

In [13]:
@benchmark P = build_prototype_matrix(R_per_feature, n_clusters, n_features)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m58.192 μs[22m[39m … [35m696.073 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m66.397 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m82.068 μs[22m[39m ± [32m 32.333 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m█[39m▇[39m▁[34m [39m[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m█[39m█[39m█[34m▅

In [14]:
P2 = build_prototype_matrix(R_per_feature, n_clusters, n_features);

In [15]:
isapprox(P, P2)

true

We can generate the PQcodes for the data as follows:

In [16]:
PQcodes = hcat([Int32.(r.assignments[:]) for r in R_per_feature]...)';

In [17]:
size(PQcodes)

(128, 1000000)

Here each vector prototype is in fact a single scalar (vecause subvectors have a single coordinate)

In [18]:
size(P)

(16, 128)

We need a method to, given a vector and a collection of prototypes per feature, find which are the closest

In [19]:
query = X_te_vecs[:,1];

In [20]:
closest_prototypes = Array{UInt8}(undef, n_features, 1);

In [21]:
typeof(typemax(Float32))

Float32

In [22]:
@inline function seuclidean(x::T,y::T) where {T}
    aux::T = (x -y)
    return aux * aux
end

function find_closest_coordinate(dist, 
                                 value::T,
                                 vector) where T
    best_coordinate = 1
    min_distance::T = typemax(T)
    for (j,x) in enumerate(vector)
        current_dist = dist(x, value)
        if current_dist < min_distance
            best_coordinate = j
            min_distance = current_dist
        end
    end
    return best_coordinate
end

find_closest_coordinate (generic function with 1 method)

In [23]:
find_closest_coordinate(seuclidean, Float32(48.), query)
print(query)

Float32[1.0, 3.0, 11.0, 110.0, 62.0, 22.0, 4.0, 0.0, 43.0, 21.0, 22.0, 18.0, 6.0, 28.0, 64.0, 9.0, 11.0, 1.0, 0.0, 0.0, 1.0, 40.0, 101.0, 21.0, 20.0, 2.0, 4.0, 2.0, 2.0, 9.0, 18.0, 35.0, 1.0, 1.0, 7.0, 25.0, 108.0, 116.0, 63.0, 2.0, 0.0, 0.0, 11.0, 74.0, 40.0, 101.0, 116.0, 3.0, 33.0, 1.0, 1.0, 11.0, 14.0, 18.0, 116.0, 116.0, 68.0, 12.0, 5.0, 4.0, 2.0, 2.0, 9.0, 102.0, 17.0, 3.0, 10.0, 18.0, 8.0, 15.0, 67.0, 63.0, 15.0, 0.0, 14.0, 116.0, 80.0, 0.0, 2.0, 22.0, 96.0, 37.0, 28.0, 88.0, 43.0, 1.0, 4.0, 18.0, 116.0, 51.0, 5.0, 11.0, 32.0, 14.0, 8.0, 23.0, 44.0, 17.0, 12.0, 9.0, 0.0, 0.0, 19.0, 37.0, 85.0, 18.0, 16.0, 104.0, 22.0, 6.0, 2.0, 26.0, 12.0, 58.0, 67.0, 82.0, 25.0, 12.0, 2.0, 2.0, 25.0, 18.0, 8.0, 2.0, 19.0, 42.0, 48.0, 11.0]

In [24]:
@benchmark find_closest_coordinate(seuclidean, Float32(48.), query)


BenchmarkTools.Trial: 10000 samples with 750 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m161.549 ns[22m[39m … [35m665.689 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m170.799 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m175.221 ns[22m[39m ± [32m 21.567 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▄[39m▄[39m▂[39m▅[39m█[34m█[39m[39m█[39m▃[32m▂[39m[39m▁[39m▁[39m▁[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█

In [25]:
@assert 3 == find_closest_coordinate(seuclidean, 2., [3.,5,2,1])

@assert 4 == find_closest_coordinate(seuclidean, 1., [3.,5,2,1])

In [26]:
function encode(dist, vector, prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    @inbounds for (j,x) in enumerate(vector)
            closest_prototypes[j] = find_closest_coordinate(dist, x, prototypes[:,j])
    end
    return closest_prototypes
end

encode (generic function with 1 method)

In [27]:
find_closest_coordinate(sqeuclidean, query[1], P[:,1])

13

In [28]:
encode(sqeuclidean, query, P)

128×1 Matrix{Int8}:
 13
 10
  6
 11
 15
 15
  1
  1
  1
  4
  4
 11
 13
  ⋮
  8
  6
  4
  8
  8
 13
  7
  9
  7
  8
  9
  6

In [29]:
@benchmark encode(sqeuclidean, query, P)

BenchmarkTools.Trial: 10000 samples with 5 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m6.752 μs[22m[39m … [35m438.091 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 96.39%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m7.023 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m8.410 μs[22m[39m ± [32m 16.152 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m7.83% ±  4.04%

  [39m█[34m█[39m[39m▆[39m▅[39m▂[39m [39m [39m▃[32m▂[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m█[39m█[39m█

In [30]:
function encode_fast(dist, vector::Array{T}, prototypes::Array{T}) where T
    n_clusters, n_features = size(prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k,j],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

encode_fast (generic function with 1 method)

In [31]:
(query[1] .- P[:,1]) .* (query[1] .- P[:,1])

16-element Vector{Float32}:
 12305.633
   131.0112
  4124.341
     7.6524773
 18242.625
   949.94385
  7714.5273
  2729.1365
   522.9168
 14949.415
    55.05057
  9765.196
     0.5940128
   268.29993
  5822.2563
  1661.982

In [32]:
println(find_closest_coordinate(sqeuclidean, query[1], P[:,1]))
println(find_closest_coordinate(sqeuclidean, query[2], P[:,2]))

13
10


In [33]:
encode_fast(sqeuclidean, query, P)[1:4]

4-element Vector{Int8}:
 13
 10
  6
 11

In [34]:
@assert isapprox(encode(sqeuclidean, query, P),
                 encode_fast(sqeuclidean, query, P))

In [35]:
@benchmark encode_fast(sqeuclidean, query, P)

BenchmarkTools.Trial: 10000 samples with 9 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.121 μs[22m[39m … [35m121.612 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.170 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.445 μs[22m[39m ± [32m  1.443 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m█[34m▆[39m[39m▄[39m▄[39m▃[32m▂[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m█[39m█[39m█[3

In [36]:
function compute_ADC(query, prototypes, dist)
    n_clusters, n_features = size(P)
    
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            ADC_table[p,j] = dist(query[j], @view prototypes[p,:] )
        end
    end
    return  ADC_table
end

compute_ADC (generic function with 1 method)

In [37]:
size(P)

(16, 128)

In [38]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

SEuclidean0 (generic function with 1 method)

In [39]:
@benchmark compute_ADC(query, P, SEuclidean0)

BenchmarkTools.Trial: 4722 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m649.287 μs[22m[39m … [35m 11.942 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 87.04%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m895.685 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m  1.039 ms[22m[39m ± [32m670.127 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m3.79% ±  6.49%

  [39m▇[39m█[39m▃[39m▂[39m [39m [39m [39m [34m [39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[

In [40]:
adc_table = compute_ADC(query, P, SEuclidean0);

In [41]:
function compute_ADC_fast(query, prototypes, dist)
    n_clusters, n_features = size(P)
    
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            
            ADC_table[p,j] = dist(query[j],  prototypes[p,:] )
        end
    end
    return  ADC_table
end

compute_ADC_fast (generic function with 1 method)

In [42]:
@benchmark compute_ADC_fast(query, P, SEuclidean0)

BenchmarkTools.Trial: 4628 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m750.469 μs[22m[39m … [35m  6.588 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 72.73%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m911.789 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m  1.073 ms[22m[39m ± [32m671.277 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m11.17% ± 14.03%

  [39m▅[39m▆[39m█[34m▆[39m[39m▅[32m▄[39m[39m▃[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[39m

In [43]:
function adc_dist(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @fastmath for j in eachindex(x_code)
        res+= adc_table[x_code[j], j]
    end
    return res
end

adc_dist (generic function with 1 method)

In [44]:
query_code = encode_fast(sqeuclidean, query, P)
x_code = PQcodes[:,1];
adc_table = compute_ADC(query, P, SEuclidean0);
adc_dist(query_code, x_code,  adc_table)

462355.06f0

In [45]:
@benchmark adc_dist(query_code, x_code,  adc_table)

BenchmarkTools.Trial: 10000 samples with 982 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m53.150 ns[22m[39m … [35m 3.430 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 96.70%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m62.236 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m64.059 ns[22m[39m ± [32m56.726 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m1.47% ±  1.67%

  [39m▄[39m▇[39m▇[39m█[39m▅[39m▂[39m▁[39m▇[34m█[39m[32m▇[39m[39m▆[39m▅[39m▃[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[39m█[39m█[39

In [46]:
x = X_tr_vecs[:,1];
@benchmark SEuclidean0(x, query)

BenchmarkTools.Trial: 10000 samples with 807 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m134.072 ns[22m[39m … [35m 4.207 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 95.48%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m155.967 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m159.366 ns[22m[39m ± [32m61.801 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.50% ±  1.34%

  [39m▅[39m▄[39m [39m▆[39m▇[39m▁[39m [39m [39m [39m█[34m█[39m[32m▃[39m[39m▂[39m▂[39m▁[39m [39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m [39m [39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m▄[39m█

##  Share prototypes across features

https://groups.google.com/g/julia-users/c/xBcQRebyi_o



In [47]:
n_clusters = 32
#R_shared = kmeans(Matrix(vec(X_tr_vecs)'), n_clusters; maxiter=200)
R_shared = kmeans(X_tr_vecs[[1],:], n_clusters; maxiter=200)

KmeansResult{Matrix{Float32}, Float32, Int64}(Float32[25.465425 119.0213 … 137.81308 17.97515], [5, 13, 5, 22, 26, 12, 5, 13, 29, 9  …  8, 22, 29, 18, 18, 7, 17, 27, 5, 23], Float32[0.0, 0.19229126, 0.0, 1.0778503, 0.19084167, 2.025879, 0.0, 2.438202, 0.22853088, 0.21289062  …  2.3737793, 0.92507935, 0.27244568, 0.0034179688, 4.2390137, 9.437988, 0.8599529, 1.0335693, 0.0, 0.6191406], [37570, 11830, 43568, 14460, 156099, 22456, 30253, 26481, 42826, 1812  …  13505, 22525, 8159, 82460, 25125, 18530, 38804, 14622, 6238, 36177], [37570, 11830, 43568, 14460, 156099, 22456, 30253, 26481, 42826, 1812  …  13505, 22525, 8159, 82460, 25125, 18530, 38804, 14622, 6238, 36177], 1.2947946f6, 7, true)

In [123]:
function encode_shared(dist, vector::Array{T}, shared_prototypes::Array{T}) where T
    n_clusters = length(shared_prototypes)
    
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(shared_prototypes[k,j],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

encode_shared (generic function with 1 method)

In [116]:
maximum(R_shared.assignments)

32

In [147]:
P_shared = R_shared.centers

1×32 Matrix{Float32}:
 25.4654  119.021  6.48306  83.4358  …  8.47804  72.9639  137.813  17.9751

In [160]:
j=3
encode_shared(sqeuclidean, X_tr_vecs[:,j] ,P_shared) 

128×1 Matrix{Int8}:
  5
 25
 15
 14
  2
 23
  3
  6
 14
  6
 14
 18
 16
  ⋮
 13
 12
 11
 10
  1
  8
  7
  6
  5
  4
  3
  2

In [157]:
size(x), size(X_tr_vecs[:,1])

((128,), (128,))

In [164]:
PQcodes_shared = Array{Int16}(undef, n_features, n_examples);

for j in 1:n_examples
    PQcodes_shared[:,j] = encode_shared(sqeuclidean, X_tr_vecs[:,j] ,P_shared)  
end

In [165]:
maximum(PQcodes), maximum(PQcodes_shared)

(16, 32)

Now we need a new encoding function that uses a single vector of prototypes

In [132]:
function encode_fast_shared(dist, vector::Array{T}, prototypes::Array{T}) where T
    
    n_features = length(vector)
    n_clusters = length(prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds @fastmath for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

encode_fast_shared (generic function with 1 method)

In [55]:
@benchmark encode_fast_shared(sqeuclidean, query, P_shared)

BenchmarkTools.Trial: 10000 samples with 8 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m3.608 μs[22m[39m … [35m 19.210 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m3.981 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m4.032 μs[22m[39m ± [32m428.495 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▅[39m▄[39m▂[39m▁[39m [39m▁[39m▁[39m [39m█[34m█[39m[32m▆[39m[39m▄[39m▃[39m▃[39m▅[39m▄[39m▃[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[39m█[39m▇[39m█[

In [56]:
function compute_ADC_shared(query, prototypes, dist)
    n_clusters = length(prototypes)
    ADC_table = Array{Float32}(undef, n_clusters)
    
    @inbounds  for p in 1:n_clusters            
        ADC_table[p] = dist(query[p],  prototypes[p] )
    end
    return  ADC_table
end

compute_ADC_shared (generic function with 1 method)

In [57]:
query_code = encode_fast_shared(sqeuclidean, query, P_shared)
x_code = PQcodes[:,1];
adc_table_shared = Float32.(compute_ADC_shared(query, P_shared, SEuclidean0));
#adc_dist(query_code, x_code,  adc_table)

In [58]:
@benchmark adc_table = compute_ADC_shared(query, P_shared, SEuclidean0)

BenchmarkTools.Trial: 10000 samples with 988 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m50.287 ns[22m[39m … [35m  2.538 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 94.32%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m52.753 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m64.790 ns[22m[39m ± [32m124.279 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m11.83% ±  6.02%

  [39m▅[39m█[34m▆[39m[39m▆[39m▄[39m▂[39m▁[39m [39m [39m [39m▁[39m▂[32m▁[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[34m█[39

## Improving distance between vectors with ADC_distance

In [59]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

SEuclidean0 (generic function with 1 method)

In [60]:
@benchmark SEuclidean0($x, $query)

BenchmarkTools.Trial: 10000 samples with 883 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m106.514 ns[22m[39m … [35m396.851 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m130.601 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m127.724 ns[22m[39m ± [32m 17.072 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▆[39m▁[39m▂[39m▂[39m▇[39m▆[39m [39m [39m▁[39m▁[39m▄[39m▃[39m▂[39m▂[32m▃[39m[39m▇[34m▇[39m[39m█[39m▃[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█

In [61]:
function adc_dist_shared(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(x_code)
        res+= adc_table[x_code[j]]
    end
    return res
end

adc_dist_shared (generic function with 1 method)

In [62]:
@benchmark adc_dist_shared($query_code, $x_code, $adc_table_shared)

BenchmarkTools.Trial: 10000 samples with 993 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m34.695 ns[22m[39m … [35m177.225 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m39.201 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m38.665 ns[22m[39m ± [32m  6.592 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▇[39m [39m▄[39m [39m [39m [39m [32m█[39m[34m▁[39m[39m▂[39m▅[39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m▃[39m█[39m▆[3

In [63]:
function adc_dist_shared_unrolled(query_code, x_code,  adc_table)
    res1 = zero(eltype(adc_table))
    res2 = zero(eltype(adc_table))
    res3 = zero(eltype(adc_table))
    res4 = zero(eltype(adc_table))
    
    @inbounds @fastmath for j in 1:4:length(x_code)
        res1 += adc_table[x_code[j]]
        res2 += adc_table[x_code[j+1]]
        res3 += adc_table[x_code[j+2]]
        res4 += adc_table[x_code[j+3]]
    end
    
    return res1 + res2 + res3 + res4
end

adc_dist_shared_unrolled (generic function with 1 method)

In [64]:
@benchmark adc_dist_shared_unrolled($query_code, $x_code,  $adc_table_shared)

BenchmarkTools.Trial: 10000 samples with 986 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m39.119 ns[22m[39m … [35m378.519 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m51.881 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m50.858 ns[22m[39m ± [32m  9.715 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m█[34m▂[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m▁[39m▃[39m▁[3

In [65]:
adc_dist_shared(query_code, x_code,  adc_table_shared)

409149.06f0

In [66]:
adc_dist_shared_unrolled(query_code, x_code,  adc_table_shared)

409149.1f0

Note that euclidean squared distance with `@fastmath` is still faster

In [67]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

SEuclidean0 (generic function with 1 method)

In [68]:
@benchmark SEuclidean0($x, $query)

BenchmarkTools.Trial: 10000 samples with 999 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m7.959 ns[22m[39m … [35m68.835 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m8.024 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m8.264 ns[22m[39m ± [32m 1.849 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m█[34m▆[39m[39m▃[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m█[32m█[39m[39m▇[39m

What if we use an adctable cotaining only integers?

In [69]:
function adc_dist_shared(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(x_code)
        res+= adc_table[x_code[j]]
    end
    return res
end

adc_dist_shared (generic function with 1 method)

In [70]:
adc_table_shared_int = Int32.(round.(adc_table_shared));

In [71]:
@benchmark adc_dist_shared($query_code, $x_code,  $adc_table_shared_int)

BenchmarkTools.Trial: 10000 samples with 992 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m36.757 ns[22m[39m … [35m168.388 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m37.901 ns               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m40.045 ns[22m[39m ± [32m  8.070 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▄[39m█[34m▁[39m[39m▄[32m▄[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[34m█[39m[39

## BWcode

Since different the same value on a pqcode found in different coordinates, refers to the same real number  when using a SharedQuantizer, we can iterate over the 128 vector coordinates of a pqcode and add up the value sin the `adc_table_shared`  or.... we can iterate over a `K`-dimensional vector that we call the BW code and add the values in `adc_table_shared` weighted by the amount of each prototype has been seen in a pqcode, which is precisely what the BW code stores.

In [79]:
function pq_code_to_bw_code(pqcode, n_clusters)
    bw_code = zeros(eltype(pqcode), n_clusters)
    for c in pqcode
        bw_code[c] += 1
    end
    return bw_code
end

pq_code_to_bw_code (generic function with 1 method)

In [80]:
n_examples = size(X_tr_vecs,2)
BWcodes = Array{Int16}(undef, n_clusters, n_examples);
bw_code = pq_code_to_bw_code(x_code, n_clusters);

Now let´s verify that the distance between a query

In [81]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

bw_adc_dist_shared (generic function with 1 method)

In [82]:
@benchmark bw_adc_dist_shared($query_code, $bw_code, $adc_table_shared)

BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m5.378 ns[22m[39m … [35m56.360 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m5.407 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m5.660 ns[22m[39m ± [32m 2.440 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m█[34m▁[39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m▃[39m▃[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m▁[39m▁[39m▁[39m▁[3

In [83]:
bw_adc_dist_shared(query_code, bw_code, adc_table_shared)

409149.06f0

In [84]:
adc_dist_shared(query_code, x_code, adc_table_shared)

409149.06f0

## Linear scann study

### Exact version

In [85]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath  for j in 1:n_examples
        distances[j] = SEuclidean0(query, X[:,j])    
    end
    return distances
end

@benchmark linear_scann_exact(query, X_tr_vecs)

BenchmarkTools.Trial: 30 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m141.676 ms[22m[39m … [35m309.229 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m14.21% … 54.46%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m161.113 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m14.54%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m169.089 ms[22m[39m ± [32m 38.592 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m19.17% ± 10.17%

  [39m [39m [39m [39m [39m [39m [39m [34m█[39m[39m [39m▄[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▆[39m▄[39m▆

In [86]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = SEuclidean0(query, X[:,j])    
    end
    return distances
end

@benchmark linear_scann_exact(query, X_tr_vecs)

BenchmarkTools.Trial: 33 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m101.794 ms[22m[39m … [35m186.096 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m41.21% … 57.87%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m167.326 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m58.84%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m153.642 ms[22m[39m ± [32m 26.907 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m54.62% ±  8.17%

  [39m [39m [39m [39m [39m [39m [39m▁[39m [39m▁[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [34m▁[39m[39m█[39m [39m▄[39m [39m [39m▄[39m [39m [39m▁[39m [39m [39m [39m [39m [39m 
  [39m▆[39m▁[39m▁

In [87]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = SEuclidean0(query, view(X,:,j))    
    end
    return distances
end

@benchmark linear_scann_exact(query, X_tr_vecs)

BenchmarkTools.Trial: 166 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m27.191 ms[22m[39m … [35m38.141 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m29.508 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m30.159 ms[22m[39m ± [32m 2.148 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.82% ± 2.60%

  [39m [39m [39m [39m [39m [39m [39m▆[39m [39m▂[39m [39m [39m [39m [39m█[34m▃[39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▆[39m▅[39m▆[39m▄[39m▆[39m▇[39m

### ADC version

In [88]:
function linear_scann(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath for j in 1:n_examples
        distances[j] = adc_dist_shared(query_code,  view(PQcodes,:,j) ,  adc_table_shared)    
    end
    return distances
end

@benchmark distances = linear_scann($query, $PQcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 35 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m144.335 ms[22m[39m … [35m158.341 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 2.30%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m145.778 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m146.941 ms[22m[39m ± [32m  3.356 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.13% ± 0.51%

  [39m█[39m▄[39m [39m▄[39m█[39m [34m█[39m[39m▁[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m▆[39m█

In [89]:
function linear_scann(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = adc_dist_shared(query_code,  PQcodes[:,j] ,  adc_table_shared)    
    end
    return distances
end

linear_scann (generic function with 1 method)

In [90]:
function linear_scann_unrolled(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = adc_dist_shared_unrolled(query_code,  PQcodes[:,j] ,  adc_table_shared)    
    end
    return distances
end

@benchmark distances = linear_scann_unrolled($query, $PQcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 34 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m112.797 ms[22m[39m … [35m179.750 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m37.09% … 59.89%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m159.331 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m54.84%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m149.065 ms[22m[39m ± [32m 22.592 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m51.25% ±  8.27%

  [39m▄[39m [39m [39m [39m [39m [39m▁[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m█[34m▄[39m[39m▄[39m [39m▁[39m [39m [39m▁[39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m▆[39m▁

### ADC version with BW codes

In [166]:
n_examples = size(X_tr_vecs,2)
BWcodes = Array{Int8}(undef, n_clusters, n_examples);
bw_code = pq_code_to_bw_code(x_code, n_clusters);

In [167]:

for j in 1:n_examples
   BWcodes[:,j] .= pq_code_to_bw_code(PQcodes_shared[:,j], n_clusters)
end

In [169]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

function linear_scann_bw(query, BWcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(BWcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    n_clusters = size(BWcodes,1)
    #@show n_clusters
    @inbounds for i in 1:n_examples
        
        # Following code is equivalent to bw_adc_dist_shared(query_code, BWcodes[:,i],  adc_table_shared)
        res = zero(eltype(adc_table_shared))
        @simd for j in 1:n_clusters
            res+= adc_table_shared[j] * BWcodes[j,i]
        end        
        distances[i] = res  
    
    end
    return distances
end

#@benchmark distances = linear_scann($query, $PQcodes, $adc_table_shared, $P_shared)

linear_scann_bw (generic function with 1 method)

In [170]:
@benchmark linear_scann_bw($query, $BWcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 1198 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m3.313 ms[22m[39m … [35m9.752 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 58.32%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m3.736 ms             [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m4.163 ms[22m[39m ± [32m1.321 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m9.05% ± 14.85%

  [39m [39m [39m▅[39m█[34m▄[39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m▅[39m█[39m█[34m█[39m[39m█[39m▅[39m▇[3

In [97]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

function linear_scann_bw_multithreaded(query, BWcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(BWcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    n_clusters = size(BWcodes,1)
    #@show n_clusters
    @inbounds @fastmath Threads.@threads  for i in 1:n_examples
       distances[i] = bw_adc_dist_shared(query_code, view(BWcodes,:,i),  adc_table_shared)
    end    
    
    return distances
end

@benchmark distances = linear_scann_bw_multithreaded($query, $BWcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 2430 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.101 ms[22m[39m … [35m8.669 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 72.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.849 ms             [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.032 ms[22m[39m ± [32m1.200 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m16.38% ± 19.19%

  [39m▆[39m█[39m▄[39m▅[39m▄[39m▃[39m▄[39m▆[34m█[39m[39m▇[32m▅[39m[39m▃[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[39m█[39m█[39m█[39m█[39m█[39m█[34m

In [98]:
distances = linear_scann_bw_multithreaded(query, BWcodes, adc_table_shared, P_shared)
distances

1000000-element Vector{Float32}:
 409149.06
 419367.38
 466659.5
 440893.12
 420404.66
 348275.0
 417521.94
 419140.44
 465665.88
 500682.62
 523472.6
 500980.62
 405562.62
      ⋮
 357651.12
 450247.84
 405551.75
 437638.4
 472304.38
 432268.03
 308139.88
 423069.38
 353762.25
 409128.7
 481940.75
 438060.0

# Quality verification

We want to see with respect to exact search what we get

## Euclidean  Squared Distance

In [440]:
function SEuclidean(X, query)
    d = (query .- X) .* (query .- X)
    return sum(d, dims=1)
end

SEuclidean (generic function with 1 method)

In [441]:
function SEuclidean_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res
    end
    return result
end

SEuclidean_2 (generic function with 1 method)

In [442]:
query = X_te_vecs[:,1];

In [443]:
@benchmark SEuclidean(X_te_vecs, query)

BenchmarkTools.Trial: 3753 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m928.708 μs[22m[39m … [35m32.129 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 96.23%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m  1.211 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m  1.328 ms[22m[39m ± [32m 1.582 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m7.66% ±  6.17%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▅[39m▇[39m█[39m█[39m▇[39m▆[39m▆[39m▅[39m▅[34m▅[39m[39m▄[39m▂[39m [39m [39m [39m [39m [39m▂[39m▃[39m▃[39m▂[39m▂[32m▁[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m▅[39m▄[39m▃[39m

In [444]:
@benchmark SEuclidean_2(X_te_vecs, query)

BenchmarkTools.Trial: 6369 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m769.583 μs[22m[39m … [35m 2.304 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m780.625 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m783.792 μs[22m[39m ± [32m22.694 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m▂[39m▂[39m▄[39m▅[39m▆[39m█[39m▆[34m▇[39m[39m▆[39m▃[32m▁[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▃[39m▃[39m█[39m█

In [445]:
@benchmark SEuclidean(X_tr_vecs, query)

BenchmarkTools.Trial: 39 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m115.288 ms[22m[39m … [35m213.858 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m121.950 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m131.411 ms[22m[39m ± [32m 21.901 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m5.18% ± 6.91%

  [39m▄[39m█[39m [34m [39m[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m▇[34m▆

In [446]:
@benchmark SEuclidean_2(X_tr_vecs, query)

BenchmarkTools.Trial: 65 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m77.253 ms[22m[39m … [35m 79.056 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m77.805 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m77.872 ms[22m[39m ± [32m390.832 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m▆[39m█[39m [39m▆[39m [39m [39m [39m [34m [39m[39m [39m [39m [32m [39m[39m█[39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▇[39m▁[39m▁[39m▁[39m▁[3

## Finding top k distances (and their ids)


The first naive thing we can do consist on computing all distances and then sorting them to get the top k closest vectors to the query vector

In [447]:
function top_k_ids(X, query)
    distances = SEuclidean_2(X, query)
    top_k_indices = sortperm(distances)
    return top_k_indices
end

top_k_ids (generic function with 1 method)

In [448]:
@benchmark top_k_ids(X_te_vecs, query)[1:10]

BenchmarkTools.Trial: 3988 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.228 ms[22m[39m … [35m 1.796 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.247 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.252 ms[22m[39m ± [32m20.688 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m▁[39m▅[39m▅[39m▇[39m█[39m▅[39m▆[34m▃[39m[39m▂[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m▃[39m▃[39m▅[39m▆[39m█[39m█[39m█

A slightly better approach consist on using `partialsortperm` to simply sort a subset of the distances vector.

In [449]:
function top_k_ids_2(X, query, k)
    distances = SEuclidean_2(X, query)
    top_k_indices = partialsortperm(distances, 1:k)
    return top_k_indices
end

top_k_ids_2 (generic function with 1 method)

In [450]:
@benchmark top_k_ids_2(X_te_vecs, query, 10)

BenchmarkTools.Trial: 6056 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m785.709 μs[22m[39m … [35m 1.346 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m808.042 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m822.902 μs[22m[39m ± [32m37.672 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m▁[39m▇[39m█[39m▄[39m▁[34m [39m[39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▃[39m▃[39m█[39m█

### Storing top k distances in a priority queue

A better alternative consists on using a priority queue. This. queue will keep only k distances in memory (no need to store all distances between the query point and all possible candidates).

In [451]:
sort!([1,54,3,24,10])

5-element Vector{Int64}:
  1
  3
 10
 24
 54

In [452]:
a = [1,2,3,4,5]

5-element Vector{Int64}:
 1
 2
 3
 4
 5

In [453]:
function SEuclidean_3(X, query, top_k)
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        dist = res/n_features
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end
            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end

    end
    return result
end

SEuclidean_3 (generic function with 1 method)

In [454]:
@benchmark SEuclidean_3(X_tr_vecs, query, 10) 

BenchmarkTools.Trial: 62 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m79.984 ms[22m[39m … [35m90.506 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m80.591 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m81.169 ms[22m[39m ± [32m 1.771 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m█[34m▂[39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▆[39m▇[39m▇[39m▆[39m▇[39m█[34m█

In [455]:
@benchmark top_k_ids_2(X_tr_vecs, query, 10)

BenchmarkTools.Trial: 54 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m86.799 ms[22m[39m … [35m150.780 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m89.566 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m93.604 ms[22m[39m ± [32m 11.673 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m▃[39m [39m▃[34m█[39m[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▄[39m█[39m█[39m█[34m█[3

In [456]:
function SEuclidean_4(X, query, top_k)
    
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    @inbounds @fastmath for m in top_k:n_examples
        res = zero(eltype(X))
        @simd for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        dist = res
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end


SEuclidean_4 (generic function with 1 method)

In [462]:
@benchmark SEuclidean_4(X_tr_vecs, query, 10) 

BenchmarkTools.Trial: 416 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m11.872 ms[22m[39m … [35m 13.198 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m11.996 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m12.029 ms[22m[39m ± [32m161.525 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▄[39m█[39m [39m [39m [39m [39m [39m [39m [34m [39m[39m [39m [32m [39m[39m [39m [39m [39m▁[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m▆[39m▅[

In [458]:
X_tr_200k = X_tr_vecs[:,1:200_000]
@benchmark SEuclidean_4(X_tr_200k, query, 10) 

BenchmarkTools.Trial: 2022 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.424 ms[22m[39m … [35m 3.359 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.448 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.463 ms[22m[39m ± [32m52.802 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▃[39m█[39m▅[39m▄[39m▁[39m [34m [39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m█[39m█[39m█[34m▇[39m[

In [None]:
SEuclidean_4(X_tr_vecs, query, 10)

In [459]:
function SEuclidean_5(X, query, top_k)
    
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        res = zero(eltype(X))
        @turbo  for j in 1:n_features
            aux = (query[j] - X[j,m])
            res += aux * aux
        end
        dist = res
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end

SEuclidean_5 (generic function with 1 method)

In [460]:
@benchmark SEuclidean_5(X_tr_200k, query, 10) 

BenchmarkTools.Trial: 2251 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.148 ms[22m[39m … [35m 2.755 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.197 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.211 ms[22m[39m ± [32m58.715 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m▄[39m▇[39m█[39m█[39m▇[39m▆[34m▆[39m[39m▇[32m▇[39m[39m▆[39m▅[39m▄[39m▃[39m▂[39m▂[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m▄[39m▄[39m█[39m█[39m█[39m█[39m█[39m█

# Comparison with Distances.jl


Distances.jl provides the method 'pairwise' to compute pairwise distances between two matrices.

In [None]:
@show typeof(query)
@show size(query)
@show typeof(X_tr_vecs)
@show size(X_tr_vecs)

In [None]:
v1 = [1,2,3]
v2 = [0,0,1]

y = [0,0,2]
X = [v1 v2]

In [None]:
function SEuclidean_5(X, query, top_k)
    
    result = sort(SEuclidean_5(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        dist = zero(eltype(X))
        @turbo  for j in 1:n_features
            dist = (query[j] - X[j,m])
            dist += aux * aux
        end
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end

In [None]:
function SEuclidean_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(eltype(X), n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res
    end
    return result
end

In [None]:
y_colvec = [y y][:,[1]]
query_mat = [query query][:,[1]];

In [None]:
SEuclidean_2(X, y)

In [None]:
pairwise(SqEuclidean(), y_colvec, X)

In [None]:
@benchmark pairwise(SqEuclidean(), query_mat, X_tr_vecs)

In [None]:
@benchmark SEuclidean_2(X_tr_vecs, query)

In [None]:
@benchmark SEuclidean_4(X_tr_vecs, query, 10)

We can verify that the sorted distances by `SEuclidean_4` are the same as the results provided with `pairwise` from `Distances.jl`

In [None]:
res2 = SEuclidean_4(X_tr_vecs, query, 10);
res = pairwise(SqEuclidean(), query_mat, X_tr_vecs)
res = sort(res, dims=2)[1:10]
res == res2

## Parallel implementation

# SIMD tests

In [None]:

function find_val_in_array_simd(x::Array{T}, val::T) where {T}
    n_simd = 64
    last_pos_simd_chunk = length(x)-n_simd
    @inbounds for i in 1:n_simd:last_pos_simd_chunk
        vec_i = vload(Vec{n_simd, T}, x, i)
        sum_equality = sum(vec_i == val)
        if sum_equality >0
            return true
        end
    end

    @inbounds for i in last_pos_simd_chunk:length(x)
        if x[i] == val
            return true
        end
    end

    return false
end

In [None]:
function jdotavx(a, b)
    s = zero(eltype(a))
    @turbo for i ∈ eachindex(a, b)
        s += a[i] * b[i]
    end
    s
end