# Read SIFT 1 million 

In [710]:
using HDF5
using BenchmarkTools
using Distances
using LoopVectorization
using SIMD
using Clustering
using ProgressMeter
using StaticArrays
using DataFrames
using Plots

In [711]:
profile_flag = false
Sys.cpu_info()[1].model

"Apple M1 Pro"

In [712]:
path = joinpath(homedir(), "Datasets", "SIFT1M",  "sift-128-euclidean.hdf5")

"/Users/dbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5"

In [713]:
f = h5open(path, "r")

X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
true_neighbors = read(f["neighbors"])
true_distances = read(f["distances"])

true_neighbors .= true_neighbors .+ 1;

@show size(X_tr_vecs)
@show size(X_te_vecs)
@show size(true_neighbors)
@show size(true_distances)

size(X_tr_vecs) = (128, 1000000)
size(X_te_vecs) = (128, 10000)
size(true_neighbors) = (100, 10000)
size(true_distances) = (100, 10000)


(100, 10000)

## Clustering.jl

Create the pqcodes for the training points

In [714]:
n_features = size(X_tr_vecs,1)
n_clusters = 32
R_per_feature = []

@showprogress for j in  1:n_features
    R = kmeans(X_tr_vecs[[j],:], n_clusters; maxiter=200)
    push!(R_per_feature, R)
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:40[39m


In [1104]:
P = hcat([r.centers[:] for r in R_per_feature]...);
println(P[:,1])
println(size(P))

Float32[0.22927776, 141.73946, 42.455154, 83.46414, 21.489227, 121.49573, 28.951014, 111.161705, 10.426989, 58.44279, 33.95053, 97.429085, 52.43881, 103.97232, 73.453156, 132.03961, 3.9273367, 2.0, 38.468975, 78.937996, 6.9644637, 13.964964, 91.959656, 117.058464, 24.462994, 125.92032, 46.94962, 19.490341, 16.974878, 68.46652, 63.950676, 87.47854]
(32, 128)


In [1105]:
P = hcat([r.centers[:] for r in R_per_feature]...);
println(P[:,1])
println(size(P))

Float32[0.22927776, 141.73946, 42.455154, 83.46414, 21.489227, 121.49573, 28.951014, 111.161705, 10.426989, 58.44279, 33.95053, 97.429085, 52.43881, 103.97232, 73.453156, 132.03961, 3.9273367, 2.0, 38.468975, 78.937996, 6.9644637, 13.964964, 91.959656, 117.058464, 24.462994, 125.92032, 46.94962, 19.490341, 16.974878, 68.46652, 63.950676, 87.47854]
(32, 128)


In [1106]:
if profile_flag
    @benchmark P = hcat([r.centers[:] for r in R_per_feature]...)
end

Another way to construct the matrix would be

In [1107]:
function build_prototype_matrix(R, n_clusters, n_features)
    
    prototypes = Array{Float32}(undef, n_clusters, n_features);
    for j in 1:n_features
        prototypes[:,j] .= vec(R_per_feature[j].centers)
    end
    return prototypes
end

if profile_flag
    @benchmark P = build_prototype_matrix(R_per_feature, n_clusters, n_features)
end

In [1108]:
P2 = build_prototype_matrix(R_per_feature, n_clusters, n_features);
isapprox(P, P2)

true

We can generate the PQcodes for the data as follows:

In [1109]:
PQcodes = hcat([Int32.(r.assignments[:]) for r in R_per_feature]...)';
size(PQcodes)

(128, 1000000)

Here each vector prototype is in fact a single scalar (because subvectors have a single coordinate)

In [1110]:
size(P)

(32, 128)

We need a method to, given a vector and a collection of prototypes per feature, find which are the closest

In [1111]:

@inline function seuclidean(x::T,y::T) where {T}
    aux::T = (x - y)
    return aux * aux
end

@inline function euclidean(x::T,y::T) where {T}
    aux::T = (x - y)
    return sqrt(aux * aux)
end

function find_closest_coordinate(dist, 
                                 value::T,
                                 vector) where T
    best_coordinate = 1
    min_distance::T = typemax(T)
    for (j,x) in enumerate(vector)
        current_dist = dist(x, value)
        if current_dist < min_distance
            best_coordinate = j
            min_distance = current_dist
        end
    end
    return best_coordinate
end

query = X_te_vecs[:,1];
closest_prototypes = Array{UInt8}(undef, n_features, 1);

if profile_flag
    @benchmark find_closest_coordinate(euclidean, Float32(48.), query)
end

In [1112]:
@assert 3 == find_closest_coordinate(euclidean, 2., [3.,5,2,1])

@assert 4 == find_closest_coordinate(euclidean, 1., [3.,5,2,1])

In [1113]:
function encode(dist, vector, prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    @inbounds for (j,x) in enumerate(vector)
        closest_prototypes[j] = find_closest_coordinate(dist, x, prototypes[:,j])
    end
    return closest_prototypes
end

if profile_flag
    @benchmark encode(euclidean, query, P)
end

In [1114]:
n_features, n_examples_test = size(X_te_vecs)

PQcodes_test = Array{Int16}(undef, n_features, n_examples_test);

for j in 1:n_examples_test
    PQcodes_test[:,j] = encode(euclidean, X_te_vecs[:,j] , P)  
end

In [1115]:
function encode_fast(dist, vector::Array{T}, prototypes::Array{T}) where T
    n_clusters, n_features = size(prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k,j],x )
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

@assert isapprox(encode(euclidean, query, P),
                 encode_fast(euclidean, query, P))

if profile_flag
    @benchmark encode_fast(euclidean, query, P)
end

In [1116]:
println(find_closest_coordinate(euclidean, query[1], P[:,1]))
println(find_closest_coordinate(euclidean, query[2], P[:,2]))
println(find_closest_coordinate(euclidean, query[3], P[:,3]))
println(encodquery = X_te_vecs[:,1];

query = X_te_vecs[:,1];

query = X_te_vecs[:,1];

e_fast(sqeuclidean, query, P)[1:3])

1
32
1


LoadError: syntax: more than one semicolon in argument list around In[1116]:4

In [1117]:
function SEuclidean0(x, query)
    @assert length(x) == length(query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function Euclidean0(x, query)
    @assert length(x) == length(query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

function compute_ADC(query, prototypes, dist)
    n_clusters, n_features = size(prototypes)
    
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            ADC_table[p,j] = dist(query[j], prototypes[p,j] )
        end
    end
    return  ADC_table
end

adc_table = compute_ADC(query, P, euclidean)

if profile_flag
    @benchmark compute_ADC(query, P, euclidean)
end

In [1118]:
function adc_dist(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @fastmath for j in eachindex(x_code)
        res+= adc_table[x_code[j], j]
    end
    return res
end

query_code = encode_fast(euclidean, query, P)
x_code = PQcodes[:,1];
adc_table = compute_ADC(query, P, euclidean);
print(adc_dist(query_code, x_code,  adc_table))

if profile_flag
    @benchmark adc_dist(query_code, x_code,  adc_table)
end

5199.315

In [1119]:
function linear_scann(query, PQcodes, adc_table, P)
    
    query_code = encode_fast(sqeuclidean, query, P)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath for j in 1:n_examples
        distances[j] = adc_dist(query_code,  view(PQcodes,:,j) ,  adc_table)    
    end
    return distances
end

if profile_flag
    @benchmark distances = linear_scann($query, $PQcodes, $adc_table, $P)
end

In [1120]:
function recall(predicted, relevant, eval_at)
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0
        return 0.0
    end
    
    predicted_at_k = predicted[1:eval_at]
    n_predicted_and_relevant =  length(intersect( Set(predicted_at_k), Set(relevant))) 
    return n_predicted_and_relevant/ length(relevant)
end

query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

pq_distances = linear_scann(query, PQcodes, adc_table, P);
top_k_pq = sortperm(pq_distances)[1:top_k];

@show recall(top_k_pq, query_true_neighbors, 100);

recall(top_k_pq, query_true_neighbors, 100) = 0.68


#### Decoding vectors from pqcode

In [1121]:
function get_top_k_from_distances(X, distances, top_k)
    top_k_pq = sortperm(distances)[1:top_k];
    X_top = X[:,top_k_pq]
    return X_top
end

function decode(x_pqcode, P)
    n_features = length(x_pqcode)
    x_decoded = zeros(eltype(P), n_features)
    
    for j in eachindex(x_pqcode)
        x_decoded[j] = P[x_pqcode[j], j]
    end
    return x_decoded
end

x = X_te_vecs[:,1]
x_pqcode = vec(encode(Euclidean0, x, P));
x_decoded = decode(x_pqcode, P);

In [1122]:
query = X_te_vecs[:,1]
adc_table = compute_ADC(query, P, Euclidean0)
pq_distances = linear_scann(query, PQcodes_test, adc_table, P);
x_closest = vec(get_top_k_from_distances(X_te_vecs, pq_distances, 1));

df_x = DataFrame(Dict(:x => x, 
                      :x_decoded => x_decoded,
                      :x_pqcode => x_pqcode,
                      :x_closest => x_closest
                      ))
first(df_x, 5)

Unnamed: 0_level_0,x,x_closest,x_decoded,x_pqcode
Unnamed: 0_level_1,Float32,Float32,Float32,Int8
1,1.0,1.0,0.229278,1
2,3.0,3.0,3.0,32
3,11.0,11.0,11.949,1
4,110.0,110.0,110.705,8
5,62.0,62.0,61.9183,4


##  PQLinearscann Sharing prototypes across features

https://groups.google.com/g/julia-users/c/xBcQRebyi_o



In [1123]:
n_clusters = 32
#R_shared = kmeans(Matrix(vec(X_tr_vecs)'), n_clusters; maxiter=200)
#R_shared = kmeans(X_tr_vecs[[1],:], n_clusters; maxiter=200)
R_shared = R_per_feature[1]

KmeansResult{Matrix{Float32}, Float32, Int64}(Float32[0.22927776 141.73946 … 63.950676 87.47854], [1, 22, 1, 9, 1, 27, 1, 29, 21, 5  …  19, 9, 9, 11, 11, 10, 17, 7, 1, 8], Float32[0.05256829, 0.0012207031, 0.05256829, 2.4743652, 0.5940128, 1.1030273, 0.05256829, 0.9503784, 1.0723343, 0.23931885  …  0.28198242, 0.18232727, 2.0363007, 0.90356445, 1.1015625, 0.3100586, 0.8599529, 1.1003418, 0.05256829, 8.056641], [202536, 5903, 23537, 9704, 21536, 9600, 42257, 12399, 68044, 24612  …  10361, 11409, 38763, 10843, 26737, 22674, 37735, 13099, 18206, 8992], [202536, 5903, 23537, 9704, 21536, 9600, 42257, 12399, 68044, 24612  …  10361, 11409, 38763, 10843, 26737, 22674, 37735, 13099, 18206, 8992], 1.3330955f6, 5, true)

In [1253]:
function encode_shared(dist, vector::Array{T}, shared_prototypes::Array{T}) where T
    n_clusters = length(shared_prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(shared_prototypes[k], x)
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

"""
function encode_fast_shared(dist, vector::Array{T}, prototypes::Array{T}) where T
    
    n_features = length(vector)
    n_clusters = length(prototypes)
    closest_prototypes = Array{Int32}(undef, n_features, 1);
    
    @inbounds @fastmath for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k], x)
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

if profile_flag
    @benchmark encode_fast_shared(euclidean, query, P_shared)
end
"""

P_shared = vec(R_shared.centers)

j = 3
x = X_tr_vecs[:,j]
x_pqcode_shared = encode_shared(sqeuclidean, x, P_shared);


P_shared_copied = hcat( [P_shared for i in 1:128]...);
@assert P_shared_copied[:,1] == P_shared
x_pqcode = encode(sqeuclidean, x, P_shared_copied);
x_decoded = decode(x_pqcode, P_shared_copied);

@assert x_pqcode_shared == x_pqcode

df_x = DataFrame(Dict(:x => x, 
                      :x_pqcode => vec(x_pqcode),
                      :x_pqcode_shared => vec(x_pqcode_shared),
                      :x_decoded => x_decoded,
                      #:centroid => vec(P_shared[x_pqcode])
                      ))

first(df_x, 5)

Unnamed: 0_level_0,x,x_decoded,x_pqcode,x_pqcode_shared
Unnamed: 0_level_1,Float32,Float32,Int8,Int8
1,0.0,0.229278,1,1
2,1.0,0.229278,1,1
3,5.0,3.92734,17,17
4,3.0,3.92734,17,17
5,44.0,42.4552,3,3


In [1234]:
n_features, n_examples = size(X_tr_vecs)

PQcodes_shared = Array{Int32}(undef, n_features, n_examples);

for j in 1:n_examples
    PQcodes_shared[:,j] = encode_shared(euclidean, X_tr_vecs[:,j], P_shared)  
    #PQcodes_shared[:,j] = encode(euclidean, X_tr_vecs[:,j] ,P_shared_copied)  
end

@show maximum(PQcodes), maximum(PQcodes_shared)

(maximum(PQcodes), maximum(PQcodes_shared)) = (32, 32)


(32, 32)

Because we have created the shared quantizer from the quantizer of the PQquantizer of the first feature the following assertion must hold

In [1259]:
@assert PQcodes_shared[1,:] == PQcodes[1,:]

Now we need a new encoding function that uses a single vector of prototypes

In [1235]:
function encode_fast_shared(dist, vector::Array{T}, prototypes::Array{T}) where T
    
    n_features = length(vector)
    n_clusters = length(prototypes)
    closest_prototypes = Array{Int32}(undef, n_features, 1);
    
    @inbounds @fastmath for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(prototypes[k], x)
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

if profile_flag
    @benchmark encode_fast_shared(euclidean, query, P_shared)
end

@assert encode_fast_shared(euclidean, x ,vec(P_shared)) ==  encode(euclidean, x ,P_shared_copied);
@assert encode_fast_shared(euclidean, x ,vec(P_shared)) ==  encode(sqeuclidean, x ,P_shared_copied);

Now we will ensure that `compute_ADC_shared` provides the same ADC table computation as `compute_ADC`
in the case where `compute_ADC` is called with all prototypes being copies of column vector used in `compute_ADC_shared` 

In [1249]:

function compute_ADC_shared(query, prototypes, dist)
    @assert ndims(prototypes) ==1
    n_clusters = length(prototypes)
    ADC_table = Array{Float32}(undef, n_clusters)
    
    @inbounds  for p in 1:n_clusters      
        ADC_table[p] = dist(query[p],  prototypes[p])
    end
    return  ADC_table
end


function compute_ADC(query, prototypes, dist)

    @assert ndims(prototypes) ==2

    n_clusters, n_features = size(prototypes)
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            ADC_table[p,j] = dist(query[j], prototypes[p,j] )
        end
    end
    return  ADC_table
end

query = X_tr_vecs[:,1]
adc_table_shared = compute_ADC_shared(query, P[:,1], euclidean);
adc_table = compute_ADC(query, P_shared_copied, euclidean);
@assert adc_table_shared == adc_table[:,1]





LoadError: AssertionError: adc_table_shared == adc_table[:, 1]

In [1244]:
adc_table_shared

32-element Vector{Float32}:
   0.22927776
 125.739456
   7.4551544
  78.46414
  10.510773
  90.49573
  14.951014
 101.161705
   0.5730114
  19.557209
  21.049469
  87.429085
   7.4388084
   ⋮
  13.035536
   5.964964
  88.959656
 112.058464
  42.537006
 108.92032
  27.94962
   6.509659
  11.974878
  68.46652
  62.950676
  65.47854

In [1245]:

df_query = DataFrame(Dict(:adc_table_shared => adc_table_shared,
                           :adc_table_=> adc_table[:,1] ))
df_query

Unnamed: 0_level_0,adc_table_,adc_table_shared
Unnamed: 0_level_1,Float32,Float32
1,0.229278,0.229278
2,141.739,125.739
3,42.4552,7.45515
4,83.4641,78.4641
5,21.4892,10.5108
6,121.496,90.4957
7,28.951,14.951
8,111.162,101.162
9,10.427,0.573011
10,58.4428,19.5572


In [1213]:
#compute_ADC(query, P_shared_copied, euclidean)

In [1219]:


function linear_scann_shared(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath for j in 1:n_examples
        distances[j] = adc_dist_shared(query_code,  view(PQcodes,:,j) ,  adc_table_shared)    
    end
    return distances
end

if profile_flag
    @benchmark distances = linear_scann($query, $PQcodes, $adc_table_shared, $P_shared)
end

query = X_tr_vecs[:,1]
query_code = encode_fast_shared(euclidean, query, P_shared)
adc_table_shared = compute_ADC_shared(query, vec(P_shared), euclidean)
#adc_table = compute_ADC(query, P_shared_copied, euclidean);

if profile_flag
    @benchmark adc_table = compute_ADC_shared(query, P_shared, euclidean)
end

In [1216]:

df_x = DataFrame(Dict(:query => query, 
                      :query_code => vec(query_code),
                
                      ))

first(df_x, 5)

Unnamed: 0_level_0,query,query_code
Unnamed: 0_level_1,Float32,Int32
1,0.0,1
2,16.0,29
3,35.0,11
4,5.0,17
5,32.0,11


In [1217]:
#pq_distances = linear_scann(query, PQcodes_shared, adc_table_shared, P);
pq_distances = linear_scann_shared(query, PQcodes_shared, adc_table_shared, P_shared);

x_closest = vec(get_top_k_from_distances(X_tr_vecs, pq_distances, 1));


df_x = DataFrame(Dict(:x => x, 
                      :x_pqcode => vec(x_pqcode),
                      :x_closest => vec(x_closest)
                      ))

first(df_x, 5)

Unnamed: 0_level_0,x,x_closest,x_pqcode
Unnamed: 0_level_1,Float32,Float32,Int8
1,0.0,57.0,1
2,1.0,3.0,1
3,5.0,0.0,17
4,3.0,0.0,17
5,44.0,0.0,3


In [1218]:
function recall(predicted, relevant, eval_at)
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0
        return 0.0
    end
    
    predicted_at_k = predicted[1:eval_at]
    n_predicted_and_relevant =  length(intersect( Set(predicted_at_k), Set(relevant))) 
    return n_predicted_and_relevant/ length(relevant)
end

query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

pq_distances = linear_scann(query, PQcodes_shared, adc_table, P);
top_k_pq = sortperm(pq_distances)[1:top_k];

@show recall(top_k_pq, query_true_neighbors, 100);

recall(top_k_pq, query_true_neighbors, 100) = 0.0


## Improving distance between vectors with ADC_distance

In [797]:
function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

if profile_flag
    x = X_tr_vecs[:,1]
    @benchmark Euclidean0($x, $query)
end

In [330]:
function adc_dist_shared(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(x_code)
        res+= adc_table[x_code[j]]
    end
    return res
end

if profile_flag
    @benchmark adc_dist_shared($query_code, $x_code, $adc_table_shared)
end

In [329]:
function adc_dist_shared_unrolled(query_code, x_code,  adc_table)
    res1 = zero(eltype(adc_table))
    res2 = zero(eltype(adc_table))
    res3 = zero(eltype(adc_table))
    res4 = zero(eltype(adc_table))
    
    @inbounds @fastmath for j in 1:4:length(x_code)
        res1 += adc_table[x_code[j]]
        res2 += adc_table[x_code[j+1]]
        res3 += adc_table[x_code[j+2]]
        res4 += adc_table[x_code[j+3]]
    end
    
    return res1 + res2 + res3 + res4
end

if profile_flag
    @benchmark adc_dist_shared_unrolled($query_code, $x_code,  $adc_table_shared)
end

In [327]:

@assert adc_dist_shared(query_code, x_code,  adc_table_shared) == adc_dist_shared_unrolled(query_code, x_code,  adc_table_shared)

LoadError: AssertionError: adc_dist_shared(query_code, x_code, adc_table_shared) == adc_dist_shared_unrolled(query_code, x_code, adc_table_shared)

Note that euclidean squared distance with `@fastmath` is still faster

In [333]:
function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

if profile_flag
    @benchmark Euclidean0($x, $query)
end

## BWcode

When using a SharedQuantizer a value `c` in a pqcode found in different coordinates refers to the same real number. Note that the computation of an ADC distance  is a sum over  over ADC_table entries provided by the pqcode. This sum is done over the entire length of the pqcode. 

We propose to create a  `K`-dimensional vector, that we call BWcode, that compresses a pqcode into a vector containing the counts of the different assigments to each centroid prototype over the pqcode. For example if we had a pqcode `[1,2,4,1,1,3,3,4]` represented with 4 centroids, we would create a bw code `[3,1,2,2]`. Then the adcomputation over the pqcode is the same as the adc computation over a bwcode provided that we multiply each adc_table entry by the value in the bwcode.

To sum up, we can iterate over a `K`-dimensional vector that we call the BWcode and add the values in `adc_table_shared` weighted by the amount of times each prototype has been seen in the pqcode, making the computation faster as long as `K` is lower than the length of the pqcode.

In [569]:
function pq_code_to_bw_code(pqcode, n_clusters)
    bw_code = zeros(eltype(pqcode), n_clusters)
    for c in pqcode
        bw_code[c] += 1
    end
    return bw_code
end

pq_code_to_bw_code (generic function with 1 method)

In [570]:
n_examples = size(X_tr_vecs,2)
bw_code = pq_code_to_bw_code(x_code, n_clusters);

Now let´s verify that the distance between a query

In [571]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

if profile_flag
    @benchmark bw_adc_dist_shared($query_code, $bw_code, $adc_table_shared)
end

In [575]:
@assert isapprox(bw_adc_dist_shared(query_code, bw_code, adc_table_shared),
                 adc_dist_shared(query_code, x_code, adc_table_shared))

## Linear scann study

### Exact version

In [557]:
function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath  for j in 1:n_examples
        distances[j] = dist(query, X[:,j])    
    end
    return distances
end

if profile_flag
    @benchmark linear_scann_exact(Euclidean0, query, X_tr_vecs)
end

In [558]:
function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = dist(query, X[:,j])    
    end
    return distances
end

if profile_flag
    @benchmark linear_scann_exact(Euclidean0, query, X_tr_vecs)
end

In [559]:
function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, view(X,:,j))    
    end
    return distances
end

if profile_flag
    @benchmark linear_scann_exact(Euclidean0, query, X_tr_vecs)
end

### ADC version

In [560]:
function linear_scann_shared(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath for j in 1:n_examples
        distances[j] = adc_dist_shared(query_code,  view(PQcodes,:,j) ,  adc_table_shared)    
    end
    return distances
end

if profile_flag
    @benchmark distances = linear_scann($query, $PQcodes, $adc_table_shared, $P_shared)
end

In [561]:
function linear_scann_shared_unrolled(query, PQcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath Threads.@threads for j in 1:n_examples
        distances[j] = adc_dist_shared_unrolled(query_code,  PQcodes[:,j] ,  adc_table_shared)    
    end
    return distances
end

if profile_flag
    @benchmark distances = linear_scann_unrolled($query, $PQcodes, $adc_table_shared, $P_shared)
end

### ADC version with BW codes

Now we will build a Matrix containing BW codes and use it to compute the ADC distance

In [562]:
n_examples = size(X_tr_vecs,2)
BWcodes = Array{Int8}(undef, n_clusters, n_examples);
bw_code = pq_code_to_bw_code(x_code, n_clusters);

for j in 1:n_examples
   BWcodes[:,j] .= pq_code_to_bw_code(PQcodes_shared[:,j], n_clusters)
end

In [563]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

function linear_scann_bw(query, BWcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(BWcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    n_clusters = size(BWcodes,1)
    #@show n_clusters
    @inbounds for i in 1:n_examples
        
        # Following code is equivalent to bw_adc_dist_shared(query_code, BWcodes[:,i],  adc_table_shared)
        res = zero(eltype(adc_table_shared))
        @simd for j in 1:n_clusters
            res+= adc_table_shared[j] * BWcodes[j,i]
        end        
        distances[i] = res  
    
    end
    return distances
end

if profile_flag
    @benchmark linear_scann_bw($query, $BWcodes, $adc_table_shared, $P_shared)
end

In [566]:
function bw_adc_dist_shared(query_code, bw_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @simd for j in eachindex(bw_code)
        res+= adc_table[j] * bw_code[j]
    end
    return res
end

function linear_scann_bw_multithreaded(query, BWcodes, adc_table_shared, P_shared)
    
    query_code = encode_fast_shared(sqeuclidean, query, P_shared)
    n_features, n_examples = size(BWcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    n_clusters = size(BWcodes,1)
    #@show n_clusters
    @inbounds @fastmath Threads.@threads  for i in 1:n_examples
       distances[i] = bw_adc_dist_shared(query_code, view(BWcodes,:,i),  adc_table_shared)
    end    
    
    return distances
end

if profile_flag
    @benchmark distances = linear_scann_bw_multithreaded($query, $BWcodes, $adc_table_shared, $P_shared)
end

In [568]:
query = X_te_vecs[:,1]
adc_table = compute_ADC_shared(query, P_shared, euclidean)
#pq_distances = linear_scann(query, PQcodes_test, adc_table, P);
#closest_id = sortperm(pq_distances)[1]
#hcat(query, X_te_vecs[:, closest_id])[1:4,:]

32-element Vector{Float32}:
   0.77072227
  46.943527
  82.36796
  88.51077
  65.90065
   5.0251217
  64.46652
  79.91107
   8.535732
  96.571106
  79.91213
  11.948233
  48.95486
   ⋮
 142.58289
   2.4551544
  41.042263
 101.97097
  17.09277
  43.979607
  10.488089
  36.468975
  21.490381
  77.46107
   9.049472
  29.478813

In [543]:
query = X_te_vecs[:,1]
adc_table = compute_ADC(query, P, Euclidean0)
pq_distances = linear_scann(query, PQcodes_test, adc_table, P);
x_closest = vec(get_top_k_from_distances(X_te_vecs, pq_distances, 1));

df_x = DataFrame(Dict(:x => x, 
                      :x_decoded => x_decoded,
                      :x_pqcode => x_pqcode,
                      :x_closest => x_closest
                      ))

first(df_x, 5)

Unnamed: 0_level_0,x,x_closest,x_decoded,x_pqcode
Unnamed: 0_level_1,Float32,Float32,Float32,Int8
1,1.0,1.0,0.229278,8
2,3.0,3.0,2.45516,6
3,11.0,11.0,9.93614,3
4,110.0,110.0,103.101,10
5,62.0,62.0,60.9214,15


# Quality verification

Let's inspect the recall of pqlinearscann with respect to exact search:

In [396]:
function recall(predicted, relevant, eval_at)
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0
        return 0.0
    end
    
    predicted_at_k = predicted[1:eval_at]
    n_predicted_and_relevant =  length(intersect( Set(predicted_at_k), Set(relevant))) 
    return n_predicted_and_relevant/ length(relevant)
end

query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

n_examples = size(X_tr_vecs,2)
BWcodes = Array{Int8}(undef, n_clusters, n_examples);
bw_code = pq_code_to_bw_code(x_code, n_clusters);

for j in 1:n_examples
   BWcodes[:,j] .= pq_code_to_bw_code(PQcodes_shared[:,j], n_clusters)
end

adc_table_shared  = Float32.(compute_ADC_shared(query, P_shared, euclidean));
bw_distances = linear_scann_bw(query, BWcodes, adc_table_shared, P_shared);
top_k_bw = sortperm(bw_distances)[1:top_k];


In [364]:

pq_distances = linear_scann(query, PQcodes, adc_table, P);
top_k_pq = sortperm(pq_distances)[1:top_k];



LoadError: InexactError: trunc(Int8, 246)

In [None]:

pq_distances = linear_scann(query, PQcodes, adc_table, P);
top_k_pq = sortperm(pq_distances)[1:top_k];

ex_distances = linear_scann_exact(query, X_tr_vecs)
top_k_ex = sortperm(ex_distances)[1:top_k];

@show recall(top_k_ex, query_true_neighbors, 100)
@show recall(top_k_pq, query_true_neighbors, 100)
@show recall(top_k_bw, query_true_neighbors, 100)

df = DataFrame(Dict(:top_k_ex => top_k_ex,
                    :top_k_pq => top_k_pq,
                    :top_k_bw => top_k_bw))

In [None]:
bw_distances = linear_scann_bw(query, BWcodes, adc_table_shared, P_shared);
bw_distances

In [None]:
query

In [None]:
PQcodes[:,1]

In [None]:
plot(x, y)

In [None]:
X_tr_vecs[:,1]

In [None]:
n = 3
sqeuclidean(query, X_tr_vecs[:,n]) , distances[n], sqeuclidean(sqeuclidean(query, X_tr_vecs[:,n]), distances[n])

In [None]:
n = 4
sqeuclidean(distances[n], sqeuclidean(sqeuclidean(query, X_tr_vecs[:,n]), distances[n])

In [None]:
n = 1
sqeuclidean(query, X_tr_vecs[:,n]) , distances[n], sqeuclidean(sqeuclidean(query, X_tr_vecs[:,n]), distances[n])

## Euclidean  Squared Distance

In [None]:
function SEuclidean(X, query)
    d = (query .- X) .* (query .- X)
    return sum(d, dims=1)
end

In [None]:
function SEuclidean_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res
    end
    return result
end

In [None]:
query = X_te_vecs[:,1];

In [None]:
@benchmark SEuclidean(X_te_vecs, query)

In [None]:
@benchmark SEuclidean_2(X_te_vecs, query)

In [None]:
@benchmark SEuclidean(X_tr_vecs, query)

In [None]:
@benchmark SEuclidean_2(X_tr_vecs, query)

## Finding top k distances (and their ids)


The first naive thing we can do consist on computing all distances and then sorting them to get the top k closest vectors to the query vector

In [None]:
function top_k_ids(X, query)
    distances = SEuclidean_2(X, query)
    top_k_indices = sortperm(distances)
    return top_k_indices
end

In [None]:
@benchmark top_k_ids(X_te_vecs, query)[1:10]

A slightly better approach consist on using `partialsortperm` to simply sort a subset of the distances vector.

In [None]:
function top_k_ids_2(X, query, k)
    distances = SEuclidean_2(X, query)
    top_k_indices = partialsortperm(distances, 1:k)
    return top_k_indices
end

In [None]:
@benchmark top_k_ids_2(X_te_vecs, query, 10)

### Storing top k distances in a priority queue

A better alternative consists on using a priority queue. This. queue will keep only k distances in memory (no need to store all distances between the query point and all possible candidates).

In [None]:
sort!([1,54,3,24,10])

In [None]:
a = [1,2,3,4,5]

In [None]:
function SEuclidean_3(X, query, top_k)
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        dist = res/n_features
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end
            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end

    end
    return result
end

In [None]:
@benchmark SEuclidean_3(X_tr_vecs, query, 10) 

In [None]:
@benchmark top_k_ids_2(X_tr_vecs, query, 10)

In [None]:
function SEuclidean_4(X, query, top_k)
    
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    @inbounds @fastmath for m in top_k:n_examples
        res = zero(eltype(X))
        @simd for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        dist = res
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end


In [None]:
@benchmark SEuclidean_4(X_tr_vecs, query, 10) 

In [None]:
X_tr_200k = X_tr_vecs[:,1:200_000]
@benchmark SEuclidean_4(X_tr_200k, query, 10) 

In [None]:
SEuclidean_4(X_tr_vecs, query, 10)

In [None]:
function SEuclidean_5(X, query, top_k)
    
    result = sort(SEuclidean_2(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        res = zero(eltype(X))
        @turbo  for j in 1:n_features
            aux = (query[j] - X[j,m])
            res += aux * aux
        end
        dist = res
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end

In [None]:
@benchmark SEuclidean_5(X_tr_200k, query, 10) 

# Comparison with Distances.jl


Distances.jl provides the method 'pairwise' to compute pairwise distances between two matrices.

In [None]:
@show typeof(query)
@show size(query)
@show typeof(X_tr_vecs)
@show size(X_tr_vecs)

In [None]:
v1 = [1,2,3]
v2 = [0,0,1]

y = [0,0,2]
X = [v1 v2]

In [None]:
function SEuclidean_5(X, query, top_k)
    
    result = sort(SEuclidean_5(X[:, 1:top_k], query))
    n_features, n_examples = size(X)
    
    for m in top_k:n_examples
        dist = zero(eltype(X))
        @turbo  for j in 1:n_features
            dist = (query[j] - X[j,m])
            dist += aux * aux
        end
        
        # see if current mse is in the top pile
        if dist < result[end]
            j = top_k 
            #revisamos la lista resultado de izq a derecha
            while dist < result[j-1]
                j = j-1
                if j == 1
                    break
                end
            end            
            result[j+1:end] .= result[j:end-1]
            result[j] = dist
         end
    end
    return result
end

In [None]:
function SEuclidean_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(eltype(X), n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res
    end
    return result
end

In [None]:
y_colvec = [y y][:,[1]]
query_mat = [query query][:,[1]];

In [None]:
SEuclidean_2(X, y)

In [None]:
pairwise(SqEuclidean(), y_colvec, X)

In [None]:
@benchmark pairwise(SqEuclidean(), query_mat, X_tr_vecs)

In [None]:
@benchmark SEuclidean_2(X_tr_vecs, query)

In [None]:
@benchmark SEuclidean_4(X_tr_vecs, query, 10)

We can verify that the sorted distances by `SEuclidean_4` are the same as the results provided with `pairwise` from `Distances.jl`

In [None]:
res2 = SEuclidean_4(X_tr_vecs, query, 10);
res = pairwise(SqEuclidean(), query_mat, X_tr_vecs)
res = sort(res, dims=2)[1:10]
res == res2

## Parallel implementation

# SIMD tests

In [None]:

function find_val_in_array_simd(x::Array{T}, val::T) where {T}
    n_simd = 64
    last_pos_simd_chunk = length(x)-n_simd
    @inbounds for i in 1:n_simd:last_pos_simd_chunk
        vec_i = vload(Vec{n_simd, T}, x, i)
        sum_equality = sum(vec_i == val)
        if sum_equality >0
            return true
        end
    end

    @inbounds for i in last_pos_simd_chunk:length(x)
        if x[i] == val
            return true
        end
    end

    return false
end

In [None]:
function jdotavx(a, b)
    s = zero(eltype(a))
    @turbo for i ∈ eachindex(a, b)
        s += a[i] * b[i]
    end
    s
end