# Read SIFT 1 million 

In [1]:
using HDF5
using BenchmarkTools
using Distances
using LoopVectorization
using SIMD
using Clustering
using ProgressMeter
using StaticArrays

In [2]:
Sys.cpu_info()[1].model

"Apple M1 Pro"

In [3]:
path = joinpath(homedir(), "Datasets", "SIFT1M",
                "sift-128-euclidean.hdf5")

"/Users/dbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5"

In [4]:
f = h5open(path, "r")

üóÇÔ∏è HDF5.File: (read-only) /Users/dbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5
‚îú‚îÄ üè∑Ô∏è distance
‚îú‚îÄ üî¢ distances
‚îú‚îÄ üî¢ neighbors
‚îú‚îÄ üî¢ test
‚îî‚îÄ üî¢ train

In [5]:
X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
neighbors = read(f["neighbors"])
distances = read(f["distances"])

@show size(X_tr_vecs)
@show size(X_te_vecs)
@show size(neighbors)
@show size(distances)

size(X_tr_vecs) = (128, 1000000)
size(X_te_vecs) = (128, 10000)
size(neighbors) = (100, 10000)
size(distances) = (100, 10000)


(100, 10000)

## Clustering.jl

Create the pqcodes for the training points

In [6]:
n_features = size(X_tr_vecs,1)
n_features

128

In [7]:
unique_values = sort(unique(X_tr_vecs))

213-element Vector{Float32}:
   0.0
   1.0
   2.0
   3.0
   4.0
   5.0
   6.0
   7.0
   8.0
   9.0
  10.0
  11.0
  12.0
   ‚ãÆ
 202.0
 204.0
 205.0
 206.0
 207.0
 208.0
 210.0
 211.0
 212.0
 214.0
 216.0
 218.0

In [8]:
n_clusters = 16
R_per_feature = []
@showprogress for j in  1:n_features
    R = kmeans(X_tr_vecs[[j],:], n_clusters; maxiter=200)
    push!(R_per_feature, R)
end

[32mProgress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Time: 0:01:32[39m


In [9]:
P = hcat([r.centers[:] for r in R_per_feature]...);
println(P[:,1])
println(size(P))

Float32[99.267654, 5.2500305, 47.840622, 88.36083, 28.804195, 38.28978, 123.6862, 0.49666122, 12.244287, 111.930756, 20.320831, 64.371445, 79.39354, 136.85092, 56.391323, 71.9072]
(16, 128)


In [10]:
P = hcat([r.centers[:] for r in R_per_feature]...);
println(P[:,1])
println(size(P))

Float32[99.267654, 5.2500305, 47.840622, 88.36083, 28.804195, 38.28978, 123.6862, 0.49666122, 12.244287, 111.930756, 20.320831, 64.371445, 79.39354, 136.85092, 56.391323, 71.9072]
(16, 128)


In [11]:
@benchmark P = hcat([r.centers[:] for r in R_per_feature]...)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m11.625 Œºs[22m[39m ‚Ä¶ [35m43.334 Œºs[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m12.916 Œºs              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m13.280 Œºs[22m[39m ¬± [32m 1.536 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m‚ñá[39m‚ñà[39m‚ñÅ[39m [39m‚ñÇ[39m‚ñà[34m‚ñÜ[39m[39m‚ñÅ[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  

Another way to construct the matrix would be

In [12]:
function build_prototype_matrix(R, n_clusters, n_features)
    
    prototypes = Array{Float32}(undef, n_clusters, n_features);
    for j in 1:n_features
        prototypes[:,j] .= vec(R_per_feature[j].centers)
    end
    return prototypes
end

build_prototype_matrix (generic function with 1 method)

In [13]:
@benchmark P = build_prototype_matrix(R_per_feature, n_clusters, n_features)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m57.333 Œºs[22m[39m ‚Ä¶ [35m162.583 Œºs[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m60.833 Œºs               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m61.130 Œºs[22m[39m ¬± [32m  2.240 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÅ[39m‚ñÉ[39m‚ñÖ[39m‚ñÜ[39m‚ñÜ[39m‚ñà[39m‚ñÜ[39m‚ñÜ[39m‚ñÜ[39m‚ñÜ[34m‚ñÖ[39m[39m‚ñÖ[32m‚ñà[39m[39m‚ñÑ[39m‚ñÑ[39m‚ñÉ[39m‚ñÉ[39m‚ñÉ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [

In [14]:
P2 = build_prototype_matrix(R_per_feature, n_clusters, n_features);

In [531]:
isapprox(P, P2)

true

We can generate the PQcodes for the data as follows:

In [525]:
PQcodes = hcat([Int32.(r.assignments[:]) for r in R_per_feature]...)';

In [527]:
size(PQcodes)

(128, 1000000)

Here each vector prototype is in fact a single scalar (vecause subvectors have a single coordinate)

In [18]:
size(P)

(16, 128)

We need a method to, given a vector and a collection of prototypes per feature, find which are the closest

In [19]:
query = X_te_vecs[:,1];

In [20]:
closest_prototypes = Array{UInt8}(undef, n_features, 1);

In [21]:
typeof(typemax(Float32))

Float32

In [22]:
@inline function seuclidean(x::T,y::T) where {T}
    aux::T = (x -y)
    return aux * aux
end

function find_closest_coordinate(dist, 
                                 value::T,
                                 vector) where T
    best_coordinate = 1
    min_distance::T = typemax(T)
    for (j,x) in enumerate(vector)
        current_dist = dist(x, value)
        if current_dist < min_distance
            best_coordinate = j
            min_distance = current_dist
        end
    end
    return best_coordinate
end

find_closest_coordinate (generic function with 1 method)

In [23]:
find_closest_coordinate(seuclidean, Float32(48.), query)
print(query)

Float32[1.0, 3.0, 11.0, 110.0, 62.0, 22.0, 4.0, 0.0, 43.0, 21.0, 22.0, 18.0, 6.0, 28.0, 64.0, 9.0, 11.0, 1.0, 0.0, 0.0, 1.0, 40.0, 101.0, 21.0, 20.0, 2.0, 4.0, 2.0, 2.0, 9.0, 18.0, 35.0, 1.0, 1.0, 7.0, 25.0, 108.0, 116.0, 63.0, 2.0, 0.0, 0.0, 11.0, 74.0, 40.0, 101.0, 116.0, 3.0, 33.0, 1.0, 1.0, 11.0, 14.0, 18.0, 116.0, 116.0, 68.0, 12.0, 5.0, 4.0, 2.0, 2.0, 9.0, 102.0, 17.0, 3.0, 10.0, 18.0, 8.0, 15.0, 67.0, 63.0, 15.0, 0.0, 14.0, 116.0, 80.0, 0.0, 2.0, 22.0, 96.0, 37.0, 28.0, 88.0, 43.0, 1.0, 4.0, 18.0, 116.0, 51.0, 5.0, 11.0, 32.0, 14.0, 8.0, 23.0, 44.0, 17.0, 12.0, 9.0, 0.0, 0.0, 19.0, 37.0, 85.0, 18.0, 16.0, 104.0, 22.0, 6.0, 2.0, 26.0, 12.0, 58.0, 67.0, 82.0, 25.0, 12.0, 2.0, 2.0, 25.0, 18.0, 8.0, 2.0, 19.0, 42.0, 48.0, 11.0]

In [24]:
@benchmark find_closest_coordinate(seuclidean, Float32(48.), query)


BenchmarkTools.Trial: 10000 samples with 776 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m162.532 ns[22m[39m ‚Ä¶ [35m325.709 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m162.747 ns               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m164.051 ns[22m[39m ¬± [32m  3.392 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñà[34m‚ñÜ[39m[39m‚ñÉ[39m‚ñÉ[39m‚ñÅ[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÉ[39m‚ñÅ[39m [39m [39m [39m [39m‚ñÇ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m

In [25]:
@assert 3 == find_closest_coordinate(seuclidean, 2., [3.,5,2,1])

@assert 4 == find_closest_coordinate(seuclidean, 1., [3.,5,2,1])

In [26]:
function encode(dist, vector, prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    @inbounds for (j,x) in enumerate(vector)
            closest_prototypes[j] = find_closest_coordinate(dist, x, prototypes[:,j])
    end
    return closest_prototypes
end

encode (generic function with 1 method)

In [27]:
find_closest_coordinate(sqeuclidean, query[1], P[:,1])

8

In [28]:
encode(sqeuclidean, query, P)

128√ó1 Matrix{Int8}:
  8
 16
  5
  3
 13
 12
  8
  7
  7
 11
 10
  4
 14
  ‚ãÆ
 11
 10
  2
  4
  8
  1
 11
  8
  9
  6
  3
 13

In [29]:
@benchmark encode(sqeuclidean, query, P)

BenchmarkTools.Trial: 10000 samples with 7 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m4.030 Œºs[22m[39m ‚Ä¶ [35m 12.721 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 99.93%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m5.137 Œºs               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m6.503 Œºs[22m[39m ¬± [32m127.162 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m19.55% ¬±  1.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÇ[39m‚ñÜ[39m‚ñÜ[39m‚ñÖ[39m‚ñÖ[39m‚ñÖ[39m‚ñà[34m‚ñÜ[39m[39m‚ñÑ[39m‚ñÇ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39

In [30]:
function encode_fast(dist, vector::Array{T}, prototypes::Array{T}) where T
    n_clusters, n_features = size(prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k,j],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

encode_fast (generic function with 1 method)

In [31]:
(query[1] .- P[:,1]) .* (query[1] .- P[:,1])

16-element Vector{Float32}:
  9656.532
    18.06276
  2194.044
  7631.915
   773.0733
  1390.5277
 15051.904
     0.25334996
   126.43398
 12305.633
   373.29453
  4015.94
  6145.547
 18455.473
  3068.1987
  5027.8315

In [32]:
println(find_closest_coordinate(sqeuclidean, query[1], P[:,1]))
println(find_closest_coordinate(sqeuclidean, query[2], P[:,2]))

8
16


In [33]:
encode_fast(sqeuclidean, query, P)[1:4]

4-element Vector{Int8}:
  8
 16
  5
  3

In [34]:
@assert isapprox(encode(sqeuclidean, query, P),
                 encode_fast(sqeuclidean, query, P))

In [35]:
@benchmark encode_fast(sqeuclidean, query, P)

BenchmarkTools.Trial: 10000 samples with 19 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m953.947 ns[22m[39m ‚Ä¶ [35m 5.230 Œºs[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m984.684 ns              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m  1.005 Œºs[22m[39m ¬± [32m94.458 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m [39m‚ñà[34m [39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39

In [36]:
function compute_ADC(query, prototypes, dist)
    n_clusters, n_features = size(P)
    
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            ADC_table[p,j] = dist(query[j], @view prototypes[p,:] )
        end
    end
    return  ADC_table
end

compute_ADC (generic function with 1 method)

In [37]:
size(P)

(16, 128)

In [38]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

SEuclidean0 (generic function with 1 method)

In [39]:
@benchmark compute_ADC(query, P, SEuclidean0)

BenchmarkTools.Trial: 7920 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m586.000 Œºs[22m[39m ‚Ä¶ [35m95.076 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 99.30%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m613.041 Œºs              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m630.068 Œºs[22m[39m ¬± [32m 1.062 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m1.89% ¬±  1.12%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÇ[39m‚ñÉ[39m‚ñá[39m‚ñà[39m‚ñá[34m‚ñÖ[39m[39m‚ñÉ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m

In [40]:
adc_table = compute_ADC(query, P, SEuclidean0);

In [41]:
function compute_ADC_fast(query, prototypes, dist)
    n_clusters, n_features = size(P)
    
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            
            ADC_table[p,j] = dist(query[j],  prototypes[p,:] )
        end
    end
    return  ADC_table
end

compute_ADC_fast (generic function with 1 method)

In [42]:
@benchmark compute_ADC_fast(query, P, SEuclidean0)

BenchmarkTools.Trial: 4789 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m828.000 Œºs[22m[39m ‚Ä¶ [35m105.093 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 99.04%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m913.167 Œºs               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m  1.043 ms[22m[39m ¬± [32m  3.478 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m11.80% ¬±  3.50%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÇ[39m‚ñÜ[39m‚ñà[39m‚ñà[39m‚ñá[39m‚ñÖ[34m‚ñÖ[39m[39m‚ñÉ[39m‚ñÉ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

In [43]:
function adc_dist(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @fastmath for j in eachindex(x_code)
        res+= adc_table[x_code[j], j]
    end
    return res
end

adc_dist (generic function with 1 method)

In [44]:
query_code = encode_fast(sqeuclidean, query, P)
x_code = PQcodes[:,1];
adc_table = compute_ADC(query, P, SEuclidean0);
adc_dist(query_code, x_code,  adc_table)

549694.4f0

In [45]:
@benchmark adc_dist(query_code, x_code,  adc_table)

BenchmarkTools.Trial: 10000 samples with 970 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m77.835 ns[22m[39m ‚Ä¶ [35m146.865 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m80.068 ns               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m80.770 ns[22m[39m ¬± [32m  2.474 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÅ[39m‚ñÉ[39m‚ñÉ[39m [39m‚ñÉ[39m‚ñÉ[39m‚ñÅ[39m‚ñÑ[39m‚ñà[39m‚ñà[34m‚ñá[39m[39m‚ñÑ[39m‚ñÉ[32m‚ñÇ[39m[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÇ[39m‚ñÑ[39m‚ñÑ[39m‚ñÉ[39m‚ñÉ[39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m [39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39

In [46]:
x = X_tr_vecs[:,1];
@benchmark SEuclidean0(x, query)

BenchmarkTools.Trial: 10000 samples with 923 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m112.495 ns[22m[39m ‚Ä¶ [35m235.193 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m114.166 ns               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m115.594 ns[22m[39m ¬± [32m  4.273 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÇ[39m‚ñÜ[39m‚ñÅ[39m‚ñÉ[39m‚ñá[39m‚ñà[34m‚ñá[39m[39m‚ñÜ[39m‚ñÑ[39m‚ñÇ[39m‚ñÇ[39m‚ñÇ[32m‚ñÇ[39m[39m‚ñÑ[39m‚ñÑ[39m‚ñÉ[39m‚ñÉ[39m‚ñÑ[39m‚ñÉ[39m‚ñÉ[39m‚ñÇ[39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÉ[39m‚ñÇ[39m‚ñÅ[39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [3

##  Share prototypes across features

https://groups.google.com/g/julia-users/c/xBcQRebyi_o



In [543]:
n_clusters = 32
#R_shared = kmeans(Matrix(vec(X_tr_vecs)'), n_clusters; maxiter=200)
R_shared = kmeans(X_tr_vecs[[1],:], n_clusters; maxiter=200)

KmeansResult{Matrix{Float32}, Float32, Int64}(Float32[8.419607 137.42932 ‚Ä¶ 29.992058 40.49322], [16, 30, 16, 15, 16, 4, 16, 30, 1, 27  ‚Ä¶  8, 1, 1, 26, 5, 29, 19, 31, 16, 7], Float32[0.05256829, 0.9406433, 0.05256829, 0.0007019043, 0.5940128, 0.0034179688, 0.05256829, 1.0611572, 0.17607117, 2.107788  ‚Ä¶  0.2590332, 2.497635, 0.33685303, 0.0, 0.9626465, 2.2998047, 0.2913227, 0.00012207031, 0.05256829, 8.056641], [77320, 7852, 15690, 25974, 20854, 1463, 12399, 12794, 36379, 74528  ‚Ä¶  12937, 17030, 11698, 22548, 41496, 11602, 16793, 41154, 24681, 12463], [77320, 7852, 15690, 25974, 20854, 1463, 12399, 12794, 36379, 74528  ‚Ä¶  12937, 17030, 11698, 22548, 41496, 11602, 16793, 41154, 24681, 12463], 1.2748595f6, 6, true)

In [None]:
function encode_shared(dist, vector::Array{T}, shared_prototypes::Array{T}) where T
    n_clusters = legnth(shared_prototypes)
    
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k,j],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

In [542]:
maximum(R_shared.assignments)

32

In [536]:
PQcodesShared = hcat([Int32.(r.assignments[:]) for r in R_shared]...)';

LoadError: MethodError: no method matching length(::KmeansResult{Matrix{Float32}, Float32, Int64})
[0mClosest candidates are:
[0m  length([91m::Union{Base.KeySet, Base.ValueIterator}[39m) at abstractdict.jl:58
[0m  length([91m::Union{ZMQ._Message, Base.RefValue{ZMQ._Message}}[39m) at ~/.julia/packages/ZMQ/R3wSD/src/_message.jl:31
[0m  length([91m::Union{HDF5.Attribute, HDF5.Dataset}[39m) at ~/.julia/packages/HDF5/T4H0V/src/dataspaces.jl:196
[0m  ...

In [48]:
length(unique(R_shared.centers))

32

In [49]:
println(maximum(R_shared.centers))
println(maximum(X_tr_vecs[[1],:]))

153.08621
172.0


In [50]:
P_shared = R_shared.centers

1√ó32 Matrix{Float32}:
 44.9633  100.415  2.90723  128.883  ‚Ä¶  114.634  15.9686  124.474  26.4908

Now we need a new encoding function that uses a single vector of prototypes

In [51]:
function encode_fast_shared(dist, vector::Array{T}, prototypes::Array{T}) where T
    
    n_features = length(vector)
    n_clusters = length(prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds @fastmath for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

encode_fast_shared (generic function with 1 method)

In [52]:
@benchmark encode_fast_shared(sqeuclidean, query, P_shared)

BenchmarkTools.Trial: 10000 samples with 10 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m1.729 Œºs[22m[39m ‚Ä¶ [35m 3.554 Œºs[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.771 Œºs              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m1.783 Œºs[22m[39m ¬± [32m86.371 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÖ[39m‚ñà[39m‚ñÑ[39m‚ñÉ[39m‚ñÇ[39m‚ñà[34m‚ñÜ[39m[39m‚ñÑ[32m‚ñÖ[39m[39m‚ñÇ[39m‚ñÇ[39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÇ[39m‚ñÅ[39m‚ñÇ[39m‚ñÇ[39m‚ñÅ[39m‚ñÇ[39m‚ñÇ[39m [39m‚ñÇ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

In [53]:
function compute_ADC_shared(query, prototypes, dist)
    n_clusters = length(prototypes)
    ADC_table = Array{Float32}(undef, n_clusters)
    
    @inbounds  for p in 1:n_clusters            
        ADC_table[p] = dist(query[p],  prototypes[p] )
    end
    return  ADC_table
end

compute_ADC_shared (generic function with 1 method)

In [54]:
query_code = encode_fast_shared(sqeuclidean, query, P_shared)
x_code = PQcodes[:,1];
adc_table_shared = Float32.(compute_ADC_shared(query, P_shared, SEuclidean0));
#adc_dist(query_code, x_code,  adc_table)

In [55]:
@benchmark adc_table = compute_ADC_shared(query, P_shared, SEuclidean0)

BenchmarkTools.Trial: 10000 samples with 981 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m63.710 ns[22m[39m ‚Ä¶ [35m 97.715 Œºs[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 99.89%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m73.989 ns               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m84.853 ns[22m[39m ¬± [32m976.411 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m11.50% ¬±  1.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÅ[39m‚ñá[39m‚ñà[39m‚ñÉ[39m‚ñÅ[39m‚ñÉ[34m‚ñÉ[39m[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

## Improving distance between vectors with ADC_distance

In [207]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

SEuclidean0 (generic function with 1 method)

In [208]:
@benchmark SEuclidean0($x, $query)

BenchmarkTools.Trial: 10000 samples with 969 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m77.442 ns[22m[39m ‚Ä¶ [35m178.277 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m77.615 ns               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m79.027 ns[22m[39m ¬± [32m  3.064 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñà[34m‚ñÑ[39m[39m‚ñÅ[39m [39m [39m [32m [39m[39m [39m [39m‚ñÑ[39m‚ñÜ[39m‚ñÇ[39m‚ñÇ[39m [39m [39m [39m [39m [39m [39m [39m‚ñÅ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39

In [339]:
function adc_dist_shared(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(x_code)
        res+= adc_table[x_code[j]]
    end
    return res
end

adc_dist_shared (generic function with 1 method)

In [340]:
@benchmark adc_dist_shared($query_code, $x_code, $adc_table_shared)

BenchmarkTools.Trial: 10000 samples with 992 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m38.978 ns[22m[39m ‚Ä¶ [35m83.459 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m39.231 ns              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m39.488 ns[22m[39m ¬± [32m 1.096 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m‚ñÅ[39m‚ñÜ[39m‚ñá[34m‚ñà[39m[39m‚ñà[39m‚ñÑ[39m‚ñÉ[39m‚ñÑ[32m‚ñÅ[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÅ[39m‚ñÇ[39m [39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

In [329]:
function adc_dist_shared_unrolled(query_code, x_code,  adc_table)
    res1 = zero(eltype(adc_table))
    res2 = zero(eltype(adc_table))
    res3 = zero(eltype(adc_table))
    res4 = zero(eltype(adc_table))
    
    @inbounds @fastmath for j in 1:4:length(x_code)
        res1 += adc_table[x_code[j]]
        res2 += adc_table[x_code[j+1]]
        res3 += adc_table[x_code[j+2]]
        res4 += adc_table[x_code[j+3]]
    end
    
    return res1 + res2 + res3 + res4
end

adc_dist_shared_unrolled (generic function with 1 method)

In [330]:
@benchmark adc_dist_shared_unrolled($query_code, $x_code,  $adc_table_shared)

BenchmarkTools.Trial: 10000 samples with 995 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m30.694 ns[22m[39m ‚Ä¶ [35m86.307 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m30.905 ns              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m31.661 ns[22m[39m ¬± [32m 2.603 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñà[34m‚ñÜ[39m[39m‚ñÅ[39m‚ñÅ[32m‚ñÖ[39m[39m‚ñÑ[39m‚ñÇ[39m‚ñÅ[39m‚ñÉ[39m‚ñÉ[39m‚ñÇ[39m [39m‚ñÅ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m

In [288]:
adc_dist_shared(query_code, x_code,  adc_table_shared)

430293.53f0

In [318]:
adc_dist_shared_unrolled(query_code, x_code,  adc_table_shared)

430293.47f0

Note that euclidean squared distance with `@fastmath` is still faster

In [212]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

SEuclidean0 (generic function with 1 method)

In [213]:
@benchmark SEuclidean0($x, $query)

BenchmarkTools.Trial: 10000 samples with 999 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m9.510 ns[22m[39m ‚Ä¶ [35m22.480 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m9.635 ns              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m9.771 ns[22m[39m ¬± [32m 0.512 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m‚ñÅ[39m‚ñÖ[39m‚ñà[34m [39m[39m‚ñÑ[39m‚ñÇ[39m‚ñÑ[32m [39m[39m‚ñÇ[39m [39m‚ñÇ[39m‚ñÜ[39m [39m‚ñÇ[39m‚ñÅ[39m‚ñÇ[39m [39m‚ñÅ[39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚

What if we use an adctable cotaining only integers?

In [415]:
function adc_dist_shared(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(x_code)
        res+= adc_table[x_code[j]]
    end
    return res
end

adc_dist_shared (generic function with 1 method)

In [439]:
adc_table_shared_int = Int32.(round.(adc_table_shared));

In [431]:
@benchmark adc_dist_shared($query_code, $x_code,  $adc_table_shared_int)

BenchmarkTools.Trial: 10000 samples with 992 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m37.341 ns[22m[39m ‚Ä¶ [35m111.350 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m37.676 ns               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m38.597 ns[22m[39m ¬± [32m  3.020 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÜ[34m‚ñà[39m[39m‚ñÉ[39m‚ñÇ[39m‚ñÇ[32m‚ñÑ[39m[39m‚ñÖ[39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÇ[39m‚ñÇ[39m‚ñÇ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39

## BWcode

Since different the same value on a pqcode found in different coordinates, refers to the same real number  when using a SharedQuantizer, we can iterate over the 128 vector coordinates of a pqcode and add up the value sin the `adc_table_shared`  or.... we can iterate over a `K`-dimensional vector that we call the BW code and add the values in `adc_table_shared` weighted by the amount of each prototype has been seen in a pqcode, which is precisely what the BW code stores.

In [611]:
n_examples = size(X_tr_vecs,2)
BWcodes = Array{Int16}(undef, n_clusters, n_examples);
bw_code = pq_code_to_bw_code(x_code, n_clusters);

In [612]:
function pq_code_to_bw_code(pqcode, n_clusters)
    bw_code = zeros(eltype(pqcode), n_clusters)
    for c in pqcode
        bw_code[c] += 1
    end
    return bw_code
end

pq_code_to_bw_code (generic function with 1 method)

Now let¬¥s verify that the distance between a query

In [609]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

bw_adc_dist_shared (generic function with 1 method)

In [610]:
@benchmark bw_adc_dist_shared($query_code, $bw_code, $adc_table_shared)

BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m4.875 ns[22m[39m ‚Ä¶ [35m9.834 ns[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m5.000 ns             [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m4.995 ns[22m[39m ¬± [32m0.085 ns[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÇ[39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[34m‚ñà[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m

In [613]:
bw_adc_dist_shared(query_code, bw_code, adc_table_shared)

430293.53f0

In [614]:
adc_dist_shared(query_code, x_code, adc_table_shared)

430293.53f0

## Linear scann study

### Exact version

In [303]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath  for j in 1:n_examples
        distances[j] = SEuclidean0(query, X[:,j])    
    end
    return distances
end

@benchmark linear_scann_exact(query, X_tr_vecs)

BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m140.659 ms[22m[39m ‚Ä¶ [35m242.125 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 41.35%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m149.052 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m189.397 ms[22m[39m ¬± [32m 47.933 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m24.96% ¬± 21.00%

  [39m [39m‚ñà[39m‚ñÇ[34m‚ñÇ[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÇ[39m [39m‚ñÖ[39m [39

In [297]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = SEuclidean0(query, X[:,j])    
    end
    return distances
end

@benchmark linear_scann_exact(query, X_tr_vecs)

BenchmarkTools.Trial: 58 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m45.220 ms[22m[39m ‚Ä¶ [35m   1.013 s[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 95.17%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m49.153 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m98.816 ms[22m[39m ¬± [32m212.307 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m49.78% ¬± 21.25%

  [34m‚ñà[39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [34m‚

In [471]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = SEuclidean0(query, view(X,:,j))    
    end
    return distances
end

@benchmark linear_scann_exact(query, X_tr_vecs)

BenchmarkTools.Trial: 488 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m 9.681 ms[22m[39m ‚Ä¶ [35m36.988 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 72.23%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m10.143 ms              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m10.247 ms[22m[39m ¬± [32m 1.226 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.53% ¬±  3.27%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÉ[39m‚ñà[39m‚ñÖ[39m‚ñÑ[39m‚ñÇ[39m [34m [39m[39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m‚ñÇ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m‚ñ

### ADC version

In [470]:
function linear_scann(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath for j in 1:n_examples
        distances[j] = adc_dist_shared(query_code,  view(PQcodes,:,j) ,  adc_table_shared)    
    end
    return distances
end

@benchmark distances = linear_scann($query, $PQcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 63 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m79.867 ms[22m[39m ‚Ä¶ [35m 83.343 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m80.252 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m80.596 ms[22m[39m ¬± [32m823.068 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÅ[39m‚ñÅ[39m‚ñÜ[39m‚ñÅ[39m [39m‚ñÅ[39m‚ñà[39m‚ñÜ[34m‚ñÜ[39m[39m‚ñÅ[39m [39m [39m [39m [39m [39m‚ñÅ[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

In [291]:
function linear_scann(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = adc_dist_shared(query_code,  PQcodes[:,j] ,  adc_table_shared)    
    end
    return distances
end

BenchmarkTools.Trial: 126 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m23.466 ms[22m[39m ‚Ä¶ [35m881.834 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 96.80%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m25.155 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m39.815 ms[22m[39m ¬± [32m107.178 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m33.96% ¬± 12.15%

  [34m‚ñà[39m[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [34m

In [294]:
function linear_scann_unrolled(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = adc_dist_shared_unrolled(query_code,  PQcodes[:,j] ,  adc_table_shared)    
    end
    return distances
end

@benchmark distances = linear_scann_unrolled($query, $PQcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 138 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m22.545 ms[22m[39m ‚Ä¶ [35m847.083 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 97.26%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m23.738 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m36.216 ms[22m[39m ¬± [32m 98.161 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m32.78% ¬± 11.67%

  [34m‚ñà[39m[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [34m

### ADC version with BW codes

In [627]:
n_examples = size(X_tr_vecs,2)
BWcodes = Array{Int8}(undef, n_clusters, n_examples);
bw_code = pq_code_to_bw_code(x_code, n_clusters);

In [628]:

for j in 1:n_examples
   BWcodes[:,j] .= pq_code_to_bw_code(PQcodes[:,j], n_clusters)
end

In [691]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

function linear_scann_bw(query, BWcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(BWcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    n_clusters = size(BWcodes,1)
    #@show n_clusters
    @inbounds for i in 1:n_examples
        
        # Following code is equivalent to bw_adc_dist_shared(query_code, BWcodes[:,i],  adc_table_shared)
        res = zero(eltype(adc_table_shared))
        @simd for j in 1:n_clusters
            res+= adc_table_shared[j] * BWcodes[j,i]
        end        
        distances[i] = res  
    
    end
    return distances
end

#@benchmark distances = linear_scann($query, $PQcodes, $adc_table_shared, $P_shared)

linear_scann_bw (generic function with 1 method)

In [694]:
@benchmark linear_scann_bw($query, $BWcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 1078 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m3.622 ms[22m[39m ‚Ä¶ [35m  6.427 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 15.38%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m4.494 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m4.629 ms[22m[39m ¬± [32m432.548 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m1.60% ¬±  4.52%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÖ[39m‚ñà[39m‚ñá[39m‚ñá[39m‚ñÇ[39m‚ñÉ[39m [39m‚ñÇ[34m [39m[39m‚ñÇ[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  

#### TODO
How can we leverage a fast multithreaded Linearscann?


In [699]:
BWCodes

LoadError: UndefVarError: BWCode not defined

In [708]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

function linear_scann_bw_multithreaded(query, BWcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(BWcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    n_clusters = size(BWcodes,1)
    #@show n_clusters
    @inbounds @fastmath Threads.@threads  for i in 1:n_examples
       distances[i] = bw_adc_dist_shared(query_code, view(BWcodes,:,i),  adc_table_shared)
    end    
    
    return distances
end

@benchmark distances = linear_scann_bw_multithreaded($query, $BWcodes, $adc_table_shared, $P_shared)

BenchmarkTools.Trial: 4774 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m674.583 Œºs[22m[39m ‚Ä¶ [35m87.179 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m 0.00% ‚Ä¶ 98.88%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m858.750 Œºs              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m  1.037 ms[22m[39m ¬± [32m 1.377 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m12.79% ¬± 16.03%

  [39m [39m‚ñÑ[39m‚ñÜ[39m‚ñà[39m‚ñà[34m‚ñá[39m[39m‚ñÜ[39m‚ñÖ[39m‚ñÉ[39m‚ñÇ[32m‚ñÇ[39m[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [3

In [709]:
distances = linear_scann_bw_multithreaded(query, BWcodes, adc_table_shared, P_shared)
distances

1000000-element Vector{Float32}:
 430293.53
 409572.12
 523334.03
 411390.56
 483739.44
 465726.72
 469184.3
 465421.94
 471216.94
 472431.4
 505966.56
 441302.97
 504491.28
      ‚ãÆ
 353777.56
 407598.38
 390484.38
 529897.25
 490457.53
 362624.6
 516215.8
 429939.62
 412462.94
 407520.47
 499928.0
 393646.03

# Quality verification

In [689]:
length(linear_scann_bw(query, BWcodes, adc_table_shared, P_shared))

1000000

## Euclidean  Squared Distance

In [440]:
function SEuclidean(X, query)
    d = (query .- X) .* (query .- X)
    return sum(d, dims=1)
end

SEuclidean (generic function with 1 method)

In [441]:
function SEuclidean_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res
    end
    return result
end

SEuclidean_2 (generic function with 1 method)

In [442]:
query = X_te_vecs[:,1];

In [443]:
@benchmark SEuclidean(X_te_vecs, query)

BenchmarkTools.Trial: 3753 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m928.708 Œºs[22m[39m ‚Ä¶ [35m32.129 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 96.23%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m  1.211 ms              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m  1.328 ms[22m[39m ¬± [32m 1.582 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m7.66% ¬±  6.17%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÖ[39m‚ñá[39m‚ñà[39m‚ñà[39m‚ñá[39m‚ñÜ[39m‚ñÜ[39m‚ñÖ[39m‚ñÖ[34m‚ñÖ[39m[39m‚ñÑ[39m‚ñÇ[39m [39m [39m [39m [39m [39m‚ñÇ[39m‚ñÉ[39m‚ñÉ[39m‚ñÇ[39m‚ñÇ[32m‚ñÅ[39m[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [3

In [444]:
@benchmark SEuclidean_2(X_te_vecs, query)

BenchmarkTools.Trial: 6369 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m769.583 Œºs[22m[39m ‚Ä¶ [35m 2.304 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m780.625 Œºs              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m783.792 Œºs[22m[39m ¬± [32m22.694 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m‚ñÇ[39m‚ñÇ[39m‚ñÑ[39m‚ñÖ[39m‚ñÜ[39m‚ñà[39m‚ñÜ[34m‚ñá[39m[39m‚ñÜ[39m‚ñÉ[32m‚ñÅ[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

In [445]:
@benchmark SEuclidean(X_tr_vecs, query)

BenchmarkTools.Trial: 39 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m115.288 ms[22m[39m ‚Ä¶ [35m213.858 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m121.950 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m131.411 ms[22m[39m ¬± [32m 21.901 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m5.18% ¬± 6.91%

  [39m‚ñÑ[39m‚ñà[39m [34m [39m[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 


In [446]:
@benchmark SEuclidean_2(X_tr_vecs, query)

BenchmarkTools.Trial: 65 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m77.253 ms[22m[39m ‚Ä¶ [35m 79.056 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m77.805 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m77.872 ms[22m[39m ¬± [32m390.832 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m‚ñÉ[39m‚ñÜ[39m‚ñà[39m [39m‚ñÜ[39m [39m [39m [39m [34m [39m[39m [39m [39m [32m [39m[39m‚ñà[39m [39m [39m [39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  

## Finding top k distances (and their ids)


The first naive thing we can do consist on computing all distances and then sorting them to get the top k closest vectors to the query vector

In [447]:
function top_k_ids(X, query)
    distances = SEuclidean_2(X, query)
    top_k_indices = sortperm(distances)
    return top_k_indices
end

top_k_ids (generic function with 1 method)

In [448]:
@benchmark top_k_ids(X_te_vecs, query)[1:10]

BenchmarkTools.Trial: 3988 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m1.228 ms[22m[39m ‚Ä¶ [35m 1.796 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.247 ms              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m1.252 ms[22m[39m ¬± [32m20.688 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m [39m‚ñÅ[39m‚ñÖ[39m‚ñÖ[39m‚ñá[39m‚ñà[39m‚ñÖ[39m‚ñÜ[34m‚ñÉ[39m[39m‚ñÇ[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m‚ñÇ[

A slightly better approach consist on using `partialsortperm` to simply sort a subset of the distances vector.

In [449]:
function top_k_ids_2(X, query, k)
    distances = SEuclidean_2(X, query)
    top_k_indices = partialsortperm(distances, 1:k)
    return top_k_indices
end

top_k_ids_2 (generic function with 1 method)

In [450]:
@benchmark top_k_ids_2(X_te_vecs, query, 10)

BenchmarkTools.Trial: 6056 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m785.709 Œºs[22m[39m ‚Ä¶ [35m 1.346 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m808.042 Œºs              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m822.902 Œºs[22m[39m ¬± [32m37.672 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m‚ñÅ[39m‚ñá[39m‚ñà[39m‚ñÑ[39m‚ñÅ[34m [39m[39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 

### Storing top k distances in a priority queue

A better alternative consists on using a priority queue. This. queue will keep only k distances in memory (no need to store all distances between the query point and all possible candidates).

In [451]:
sort!([1,54,3,24,10])

5-element Vector{Int64}:
  1
  3
 10
 24
 54

In [452]:
a = [1,2,3,4,5]

5-element Vector{Int64}:
 1
 2
 3
 4
 5

In [453]:
function SEuclidean_3(X, query, top_k)
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        dist = res/n_features
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end
            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end

    end
    return result
end

SEuclidean_3 (generic function with 1 method)

In [454]:
@benchmark SEuclidean_3(X_tr_vecs, query, 10) 

BenchmarkTools.Trial: 62 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m79.984 ms[22m[39m ‚Ä¶ [35m90.506 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m80.591 ms              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m81.169 ms[22m[39m ¬± [32m 1.771 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m [39m [39m [39m‚ñà[34m‚ñÇ[39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m‚ñÜ[39m‚ñá[

In [455]:
@benchmark top_k_ids_2(X_tr_vecs, query, 10)

BenchmarkTools.Trial: 54 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m86.799 ms[22m[39m ‚Ä¶ [35m150.780 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m89.566 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m93.604 ms[22m[39m ¬± [32m 11.673 ms[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m‚ñÉ[39m [39m‚ñÉ[34m‚ñà[39m[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m‚ñ

In [456]:
function SEuclidean_4(X, query, top_k)
    
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    @inbounds @fastmath for m in top_k:n_examples
        res = zero(eltype(X))
        @simd for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        dist = res
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end


SEuclidean_4 (generic function with 1 method)

In [462]:
@benchmark SEuclidean_4(X_tr_vecs, query, 10) 

BenchmarkTools.Trial: 416 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m11.872 ms[22m[39m ‚Ä¶ [35m 13.198 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m11.996 ms               [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m12.029 ms[22m[39m ¬± [32m161.525 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÑ[39m‚ñà[39m [39m [39m [39m [39m [39m [39m [34m [39m[39m [39m [32m [39m[39m [39m [39m [39m‚ñÅ[39m [39m [39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [3

In [458]:
X_tr_200k = X_tr_vecs[:,1:200_000]
@benchmark SEuclidean_4(X_tr_200k, query, 10) 

BenchmarkTools.Trial: 2022 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m2.424 ms[22m[39m ‚Ä¶ [35m 3.359 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.448 ms              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m2.463 ms[22m[39m ¬± [32m52.802 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m‚ñÉ[39m‚ñà[39m‚ñÖ[39m‚ñÑ[39m‚ñÅ[39m [34m [39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m‚ñà[39m‚ñà[

In [None]:
SEuclidean_4(X_tr_vecs, query, 10)

In [459]:
function SEuclidean_5(X, query, top_k)
    
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        res = zero(eltype(X))
        @turbo  for j in 1:n_features
            aux = (query[j] - X[j,m])
            res += aux * aux
        end
        dist = res
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end

SEuclidean_5 (generic function with 1 method)

In [460]:
@benchmark SEuclidean_5(X_tr_200k, query, 10) 

BenchmarkTools.Trial: 2251 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m ‚Ä¶ [35mmax[39m[90m):  [39m[36m[1m2.148 ms[22m[39m ‚Ä¶ [35m 2.755 ms[39m  [90m‚îä[39m GC [90m([39mmin ‚Ä¶ max[90m): [39m0.00% ‚Ä¶ 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.197 ms              [22m[39m[90m‚îä[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ¬± [32mœÉ[39m[90m):   [39m[32m[1m2.211 ms[22m[39m ¬± [32m58.715 Œºs[39m  [90m‚îä[39m GC [90m([39mmean ¬± œÉ[90m):  [39m0.00% ¬± 0.00%

  [39m [39m [39m‚ñÑ[39m‚ñá[39m‚ñà[39m‚ñà[39m‚ñá[39m‚ñÜ[34m‚ñÜ[39m[39m‚ñá[32m‚ñá[39m[39m‚ñÜ[39m‚ñÖ[39m‚ñÑ[39m‚ñÉ[39m‚ñÇ[39m‚ñÇ[39m‚ñÅ[39m‚ñÅ[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [3

# Comparison with Distances.jl


Distances.jl provides the method 'pairwise' to compute pairwise distances between two matrices.

In [None]:
@show typeof(query)
@show size(query)
@show typeof(X_tr_vecs)
@show size(X_tr_vecs)

In [None]:
v1 = [1,2,3]
v2 = [0,0,1]

y = [0,0,2]
X = [v1 v2]

In [None]:
function SEuclidean_5(X, query, top_k)
    
    result = sort(SEuclidean_5(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        dist = zero(eltype(X))
        @turbo  for j in 1:n_features
            dist = (query[j] - X[j,m])
            dist += aux * aux
        end
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end

In [None]:
function SEuclidean_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(eltype(X), n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res
    end
    return result
end

In [None]:
y_colvec = [y y][:,[1]]
query_mat = [query query][:,[1]];

In [None]:
SEuclidean_2(X, y)

In [None]:
pairwise(SqEuclidean(), y_colvec, X)

In [None]:
@benchmark pairwise(SqEuclidean(), query_mat, X_tr_vecs)

In [None]:
@benchmark SEuclidean_2(X_tr_vecs, query)

In [None]:
@benchmark SEuclidean_4(X_tr_vecs, query, 10)

We can verify that the sorted distances by `SEuclidean_4` are the same as the results provided with `pairwise` from `Distances.jl`

In [None]:
res2 = SEuclidean_4(X_tr_vecs, query, 10);
res = pairwise(SqEuclidean(), query_mat, X_tr_vecs)
res = sort(res, dims=2)[1:10]
res == res2

## Parallel implementation

# SIMD tests

In [None]:

function find_val_in_array_simd(x::Array{T}, val::T) where {T}
    n_simd = 64
    last_pos_simd_chunk = length(x)-n_simd
    @inbounds for i in 1:n_simd:last_pos_simd_chunk
        vec_i = vload(Vec{n_simd, T}, x, i)
        sum_equality = sum(vec_i == val)
        if sum_equality >0
            return true
        end
    end

    @inbounds for i in last_pos_simd_chunk:length(x)
        if x[i] == val
            return true
        end
    end

    return false
end

In [None]:
function jdotavx(a, b)
    s = zero(eltype(a))
    @turbo for i ‚àà eachindex(a, b)
        s += a[i] * b[i]
    end
    s
end