# Replacing ADC table by L1 distance between PQ codes

In [1]:
using HDF5
using BenchmarkTools
using Distances
using LoopVectorization
using SIMD
using Clustering
using ProgressMeter
using StaticArrays
using DataFrames
using Plots
using NPZ

In [2]:
profile_flag = false
Sys.cpu_info()[1].model

"Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz"

In [3]:
path = joinpath(homedir(), "TFM", "ann-benchmarks",  "sift-128-euclidean.hdf5")

"/home/adegaray/TFM/ann-benchmarks/sift-128-euclidean.hdf5"

In [4]:
f = h5open(path, "r")

X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
true_neighbors = read(f["neighbors"])
true_distances = read(f["distances"])

true_neighbors .= true_neighbors .+ 1;

@show size(X_tr_vecs)
@show size(X_te_vecs)
@show size(true_neighbors)
@show size(true_distances)

size(X_tr_vecs) = (128, 1000000)
size(X_te_vecs) = (128, 10000)
size(true_neighbors) = (100, 10000)
size(true_distances) = (100, 10000)


(100, 10000)

### Compute Recall PQLinearScann 

In [5]:
function recall(predicted, relevant, eval_at)
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0
        return 0.0
    end
    
    predicted_at_k = predicted[1:eval_at]
    n_predicted_and_relevant =  length(intersect( Set(predicted_at_k), Set(relevant))) 
    return n_predicted_and_relevant/ length(relevant)
end

recall (generic function with 1 method)

##  PQLinearscann Sharing prototypes across features

https://groups.google.com/g/julia-users/c/xBcQRebyi_o



In [29]:
n_features, n_examples = size(X_tr_vecs)

function encode_shared(dist, vector::Array{T}, shared_prototypes::Array{T}) where T
    n_clusters = length(shared_prototypes)
    closest_prototypes = Array{Int32}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(shared_prototypes[k], x)
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

encode_shared (generic function with 1 method)

We load the K=32 centroids resulting of performing 1d-kmeans over the first feature of the train dataset. Notice that these centroids are sorted, resulting in a sorted codification. We will take advantage of shared quantization.

In [30]:
P_shared = vec(Float32.(npzread("1dkmeans_shared_prototypes.npy")))

32-element Vector{Float32}:
   0.22927776
   2.4602568
   4.935903
   7.958296
  10.961814
  13.964964
  16.974878
  19.983488
  23.455843
  27.45342
   ⋮
  94.42536
 100.457985
 106.51546
 112.64925
 118.534
 124.41393
 130.70255
 138.0879
 148.98564

In [7]:
PQcodes_shared = Array{Int8}(undef, n_features, n_examples);

for j in 1:n_examples
    PQcodes_shared[:,j] = encode_shared(euclidean, X_tr_vecs[:,j], P_shared)  
end


### Inspect idea of computing distances without adc table: directly from pqcodes


In [9]:
function abs_dist(y::Array{T}, X::Array{T}, j) where T
    res = zero(eltype(y))
    @inbounds @fastmath  for k in eachindex(y)
        res += abs(X[k, j] - y[k])
    end
    return res
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)    
    end
    return distances
end

linear_scann_exact (generic function with 1 method)

There is some problem with the casting of query_code, probably related with the fact that "res" adopts the same type than the query_code on abs_dist:

In [34]:
query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
query_code = UInt16.(vec(query_code))

PQcodes_uint8 = UInt16.(PQcodes_shared);

pq_distances = linear_scann_exact(abs_dist, query_code, PQcodes_uint8)
top_k_pq = sortperm(pq_distances)[1:top_k];

@show recall(top_k_pq, query_true_neighbors, top_k);

recall(top_k_pq, query_true_neighbors, top_k) = 0.0


In [36]:
pq_distances[top_k_pq]

100-element Vector{Float32}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

All the top k distances are 0. This would be the expected output if all top_k_pq codes were the same and also equal to the query code (NOT TRUE):

In [44]:
println(PQcodes_shared[:,top_k_pq[1]])
println(PQcodes_shared[:,top_k_pq[2]])
println(encode_shared(euclidean, query, P_shared))

Int8[9, 2, 7, 19, 9, 4, 1, 6, 8, 11, 25, 22, 5, 1, 1, 5, 7, 8, 27, 20, 5, 5, 11, 15, 6, 5, 4, 10, 8, 9, 10, 4, 16, 9, 5, 3, 4, 6, 5, 12, 24, 27, 16, 10, 4, 2, 2, 9, 10, 21, 27, 25, 27, 15, 1, 1, 9, 25, 19, 18, 24, 8, 1, 1, 10, 4, 2, 4, 3, 1, 5, 12, 27, 12, 1, 1, 2, 6, 11, 27, 12, 5, 1, 3, 27, 27, 11, 13, 27, 16, 6, 5, 14, 13, 2, 19, 2, 1, 1, 5, 4, 1, 1, 2, 7, 1, 1, 3, 4, 6, 8, 19, 2, 1, 4, 4, 8, 20, 9, 5, 18, 1, 2, 3, 3, 3, 1, 16]
Int8[18, 15, 10, 3, 3, 2, 1, 2, 1, 17, 29, 17, 4, 3, 1, 1, 2, 19, 29, 16, 1, 1, 1, 1, 3, 15, 20, 7, 2, 1, 1, 1, 29, 15, 4, 3, 3, 1, 1, 17, 22, 24, 20, 12, 4, 1, 3, 9, 2, 9, 22, 29, 14, 4, 1, 2, 5, 11, 12, 10, 8, 15, 7, 2, 29, 12, 5, 7, 3, 4, 3, 13, 27, 5, 1, 1, 9, 12, 19, 29, 5, 1, 1, 4, 23, 21, 6, 13, 10, 12, 4, 3, 19, 18, 6, 3, 14, 22, 6, 3, 9, 4, 1, 1, 17, 4, 1, 5, 18, 11, 7, 18, 9, 1, 1, 1, 11, 26, 14, 13, 3, 5, 2, 1, 6, 9, 17, 14]


Int32[1; 2; 5; 27; 18; 9; 3; 1; 14; 8; 9; 7; 3; 10; 18; 4; 5; 1; 1; 1; 1; 13; 25; 8; 8; 2; 3; 2; 2; 4; 7; 12; 1; 1; 4; 9; 26; 28; 18; 2; 1; 1; 5; 20; 13; 25; 28; 2; 11; 1; 1; 5; 6; 7; 28; 28; 19; 5; 3; 3; 2; 2; 4; 25; 7; 2; 5; 7; 4; 6; 19; 18; 6; 1; 6; 28; 22; 1; 2; 9; 24; 12; 10; 23; 14; 1; 3; 7; 28; 16; 3; 5; 11; 6; 4; 9; 14; 7; 5; 4; 1; 1; 8; 12; 22; 7; 7; 26; 9; 3; 2; 10; 5; 17; 19; 22; 9; 5; 2; 2; 9; 7; 4; 2; 8; 14; 15; 5;;]


This does work:

In [26]:
query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
#query_code = UInt32.(vec(query_code))

PQcodes_uint8 = Int32.(PQcodes_shared);

pq_distances = linear_scann_exact(abs_dist, query_code, PQcodes_uint8)
top_k_pq = sortperm(pq_distances)[1:top_k];

@show recall(top_k_pq, query_true_neighbors, top_k);

recall(top_k_pq, query_true_neighbors, top_k) = 0.58


## Benchmark times exact vs linearscann

#### Type UInt8

In [45]:
query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
query_code = UInt8.(vec(query_code))

PQcodes = UInt8.(PQcodes_shared);

In [46]:
@inline function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, view(X,:,j))    
    end
    return distances
end

@benchmark linear_scann_exact($Euclidean0, $query, $X_tr_vecs)


BenchmarkTools.Trial: 113 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m37.115 ms[22m[39m … [35m103.894 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m40.904 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m44.634 ms[22m[39m ± [32m  9.178 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m▄[39m▄[39m▆[39m█[34m█[39m[39m [39m▂[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▇[39m█[39m█[39m█[39m█[

In [47]:
function abs_dist(y::Array{T}, X::Array{T}, j) where T
    res = zero(eltype(y))
    @inbounds @fastmath  for k in eachindex(y)
        res += abs(X[k, j] - y[k])
    end
    return res
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)    
    end
    return distances
end

@benchmark linear_scann_exact($abs_dist, $query_code, $PQcodes)

BenchmarkTools.Trial: 387 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 9.828 ms[22m[39m … [35m41.081 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m11.867 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m12.897 ms[22m[39m ± [32m 3.659 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.20% ± 1.73%

  [39m [39m▄[39m▅[39m█[39m▇[39m▆[34m▂[39m[39m▇[39m▄[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▇[39m█[39m█[39m█[39m█[39m█[34m