# PQ Linearscann on GloVe dataset

In [13]:
using HDF5
using BenchmarkTools
using Distances
using SIMD
using Clustering
using ProgressMeter
using StaticArrays
using DataFrames
using Plots
using NPZ
using LSHFunctions
using LinearAlgebra

In [2]:
profile_flag = false
Sys.cpu_info()[1].model

"Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz"

We load the GloVe dataset:

In [14]:
path = joinpath(homedir(), "TFM", "ann-benchmarks", "glove-100-angular.hdf5")

f = h5open(path, "r")

X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
true_neighbors = read(f["neighbors"])
true_distances = read(f["distances"])

true_neighbors .= true_neighbors .+ 1;

@show size(X_tr_vecs)
@show size(X_te_vecs)
@show size(true_neighbors)
@show size(true_distances)

size(X_tr_vecs) = (100, 1183514)
size(X_te_vecs) = (100, 10000)
size(true_neighbors) = (100, 10000)
size(true_distances) = (100, 10000)


(100, 10000)

In [15]:
n_features, n_examples = size(X_tr_vecs)

(100, 1183514)

## Replacing ADC table by pairwise distance between PQ codes +  Shared Quantizer

We will be using a fastest approach instead of ADC table to compute distances, L1 distances, which we are allowed to use thanks to the application of a shared quantizer.In a shared quantizer we consider a single vector of prototypes or centroids (instead of one per each feature/group of features). This way, we can calculate L1 distance between each encoded query and each encoded vector instead of computing a distance table.

We define some functions relevant to the process:

In [16]:
function recall(predicted, relevant, eval_at)
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0
        return 0.0
    end
    
    predicted_at_k = predicted[1:eval_at]
    n_predicted_and_relevant =  length(intersect( Set(predicted_at_k), Set(relevant))) 
    return n_predicted_and_relevant/ length(relevant)
end

function encode_shared(dist, vector::Array{T}, shared_prototypes::Array{T}) where T
    n_clusters = length(shared_prototypes)
    n_features = size(vector)[1]
    closest_prototypes = Array{Int32}(undef, n_features, 1);
    
    @inbounds for (j,x) in enumerate(vector)
        best_coordinate = 1
        min_distance::T = typemax(T)
        for k in 1:n_clusters
           current_dist = dist(shared_prototypes[k], x)
           if current_dist < min_distance
               best_coordinate = k
               min_distance = current_dist
           end
           #println(k, ' ', j, ' ', best_coordinate, ' ',min_distance )
        end            
        closest_prototypes[j] = best_coordinate
    end
    return closest_prototypes
end

function abs_dist(y::Array{T}, X::Array{T}, j) where T
    # Here I use a bigger Int type than 8 due to avoid
    # res beeing overflowed
    res = Int16(0)
    @inbounds @fastmath  for k in eachindex(y)
        res += abs(X[k, j] - y[k])
    end
    return res
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)    
    end
    return distances
end

function linear_scann_exact_pq(dist, query, X, top_k, extra_factor)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)    
    end

    top_result_pos = sortperm(distances)[1:top_k*extra_factor];
    
    return top_result_pos
end

function euclidean_mat2(y, X, j) where T
    # Here I use a bigger Int type than 8 due to avoid
    # res beeing overflowed
    res = zero(eltype(y))
    @inbounds @fastmath  for k in eachindex(y)
        partial = X[k, j] - y[k]
        res += partial * partial
    end
    return res
end

euclidean_mat2 (generic function with 1 method)

We load the centroids resulting of performing 1d-kmeans over the first feature of the train dataset. Notice that these centroids are sorted, resulting in a sorted codification. We will take advantage of shared quantization. We do tests with different sets of prototypes to check which one produces a better recall after the shared quantization: 

In [17]:
#P_shared = vec(Float32.(npzread("1dkmeans_prototypes/1dkmeans_shared_prototypes_GloVe.npy")))
#P_shared = vec(Float32.(npzread("1dkmeans_prototypes/1dkmeans_shared_prototypes_GloVe_5feat.npy")))
P_shared = vec(Float32.(npzread("1dkmeans_prototypes/1dkmeans_shared_prototypes_GloVe_128c.npy")))
#P_shared = vec(Float32.(npzread("1dkmeans_prototypes/1dkmeans_shared_prototypes_GloVe_64c_3feat.npy")))
#P_shared = vec(Float32.(npzread("1dkmeans_prototypes/1dkmeans_shared_prototypes_GloVe_256c.npy")))
#P_shared = vec(Float32.(npzread("1dkmeans_prototypes/1dkmeans_shared_prototypes_GloVe_normalized_128.npy")))

128-element Vector{Float32}:
 -3.3792334
 -2.893548
 -2.5925786
 -2.3798368
 -2.2132785
 -2.0744526
 -1.9516637
 -1.8409548
 -1.7442269
 -1.6597183
  ⋮
  1.7391312
  1.8250538
  1.9321226
  2.0490065
  2.18139
  2.3596976
  2.590493
  2.9266334
  3.47945

We can use normalized vectors, although we have checked that the performance does not vary.

In [10]:
X_tr_normed = Array{Float32}(undef, n_features, n_examples);
for j in 1:n_examples
    X_tr_normed[:,j] = X_tr_vecs[:,j]/norm(X_tr_vecs[:,j])
end

In [18]:
PQcodes_shared = Array{Int16}(undef, n_features, n_examples);

for j in 1:n_examples
    PQcodes_shared[:,j] = encode_shared(euclidean, X_tr_vecs[:,j], P_shared)
    #PQcodes_shared[:,j] = encode_shared(euclidean, X_tr_normed[:,j], P_shared)
end

### Cosine distance

GloVe dataset NNs are measured with an angular distance, so we define both cosine similarity and dot product functions in order to measure distances between the query code and the vector code

In [19]:
function cos_sim(y::Array{T}, X::Array{T}, j) where T
    sim = Int16(0)
    pq_norm_squared = Int16(0)
    q_norm = norm(y)
    @inbounds @fastmath  for k in eachindex(y)
        sim += X[k, j]*y[k]
        pq_norm_squared += X[k, j]*X[k, j]
    end
    return sim/(q_norm*sqrt(pq_norm_squared))
end

function dot_product(y::Array{T}, X::Array{T}, j) where T
    sim = Int16(0)
    @inbounds @fastmath  for k in eachindex(y)
        sim += X[k, j]*y[k]
    end
    return sim
end

dot_product (generic function with 1 method)

Again, we encounter problems for Int8 and Int16 query types

In [28]:
query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
query_code = Int32.(vec(query_code))

PQcodes_int8 = Int32.(PQcodes_shared);

pq_distances_cos = linear_scann_exact(cos_sim, query_code, PQcodes_int8)
pq_distances_dot = linear_scann_exact(dot_product, query_code, PQcodes_int8)

top_k_pq_cos = sortperm(pq_distances_cos)[length(pq_distances_cos)-top_k+1:length(pq_distances_cos)];
top_k_pq_dot = sortperm(pq_distances_dot)[length(pq_distances_dot)-top_k+1:length(pq_distances_dot)];

@show recall(top_k_pq_cos, query_true_neighbors, top_k);
@show recall(top_k_pq_dot, query_true_neighbors, top_k);

recall(top_k_pq_cos, query_true_neighbors, top_k) = 0.33
recall(top_k_pq_dot, query_true_neighbors, top_k) = 0.0


We obtain relatively low recall for the cosine distance and 0 recall for the dot product distance (which is logical due to the fact that pqcodes are ints - non normalized)

#### Top_k_expansion + finetunning

In [21]:
function linear_scann_exact_pq_cos(dist, query, X, top_k, extra_factor)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)    
    end

    top_result_pos = sortperm(distances)[length(distances)-top_k+1:length(distances)];
    
    return top_result_pos
end

linear_scann_exact_pq_cos (generic function with 1 method)

In [22]:
query_id = 5
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
query_code = Int32.(vec(query_code))

PQcodes_int8 = Int32.(PQcodes_shared);
best_ids = linear_scann_exact_pq_cos(cos_sim, query_code, PQcodes_int8, top_k, 10);

#linear_scann_exact(euclidean_mat, query, view(X_tr_vecs,:,best_ids))
distances_candidates_expanded = linear_scann_exact(cos_sim, query, X_tr_vecs[:,best_ids]);
permutation_expanded = sortperm(distances_candidates_expanded)[length(distances_candidates_expanded)-top_k+1:length(distances_candidates_expanded)];
@show recall(best_ids[permutation_expanded], query_true_neighbors, top_k);

recall(best_ids[permutation_expanded], query_true_neighbors, top_k) = 0.35


(Augments from 0.33 to 0.37 when building the shared prototypes out of 5 feats.)

In [23]:
function compute_mean_recall_extra(X, PQcodes_shared, true_neighbors, P_shared, extra_factor, top_k::Int, dist::Function)

    n_features, n_examples = size(X)
    PQcodes_int8 = Int32.(PQcodes_shared);
    average = 0.
    for query_id in 1:n_examples
        query = X[:, query_id]
        query_code = encode_shared(euclidean, query, P_shared)
        query_code = Int32.(vec(query_code))
        best_ids = linear_scann_exact_pq_cos(dist, query_code, PQcodes_int8, top_k, extra_factor);
        distances_candidates_expanded = linear_scann_exact(dist, query, X_tr_vecs[:,best_ids]);
        permutation_expanded = sortperm(distances_candidates_expanded)[length(distances_candidates_expanded)-top_k+1:length(distances_candidates_expanded)];
        r = recall(best_ids[permutation_expanded], true_neighbors[:,query_id], top_k);
        average += r;
        end
    return average / n_examples
    end

extra_factor = 10
top_k = 100
avg_recall = compute_mean_recall_extra(X_te_vecs[:, 1:100], PQcodes_shared, true_neighbors, P_shared, extra_factor, top_k, cos_sim) 

0.5043000000000002

The average recall when considering 100 queries and finetunning is around 0.5.

We check that we obtain perfect recall with the exact search and the cosine distance:

In [24]:
function cos_dist(y::Array{T}, X::Array{T}, j) where T
    dist = cossim(y,X[:,j])
    return dist
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, X, j)     
    end
    return distances
end

linear_scann_exact (generic function with 1 method)

In [25]:
exact_distances = linear_scann_exact(cos_dist, query, X_tr_vecs)
top_k_pq_cos = sortperm(exact_distances)[length(exact_distances)-top_k+1:length(exact_distances)];

@show recall(top_k_pq_cos, query_true_neighbors, top_k);

recall(top_k_pq_cos, query_true_neighbors, top_k) = 1.0


### L1 distance

We also check the results using plain L1 distance

In [26]:
query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
query_code = Int16.(vec(query_code))

PQcodes_int8 = Int16.(PQcodes_shared);

pq_distances = linear_scann_exact(abs_dist, query_code, PQcodes_int8)
top_k_pq_l1 = sortperm(pq_distances)[1:top_k];

@show recall(top_k_pq_l1, query_true_neighbors, top_k);

recall(top_k_pq_l1, query_true_neighbors, top_k) = 0.33


#### Top_k_pq_cos vs. top_k_pq_l1

In [29]:
inter_top = intersect(top_k_pq_cos, top_k_pq_l1)

58-element Vector{Int64}:
  100207
  689946
 1051873
  327721
  547335
  117206
  292552
  904717
  427586
  660542
       ⋮
  914909
  989331
  926742
  402217
 1133490
  995276
   93439
   97479
  660282

58 common indices in top100

In [30]:
intersect(inter_top, query_true_neighbors)

27-element Vector{Int64}:
  100207
  547335
  979283
  265227
  647844
   51822
  817758
  368656
  834700
  264603
       ⋮
  242798
  914909
  989331
  402217
 1133490
  995276
   93439
   97479
  660282

27 elements out of the total 33 that l1/cos dists get right are common.

#### Top_k_expansion + finetunning

In [31]:
query_id = 5
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100

query_code = encode_shared(euclidean, query, P_shared)
query_code = Int16.(vec(query_code))

PQcodes_int8 = Int16.(PQcodes_shared);
best_ids = linear_scann_exact_pq(abs_dist, query_code, PQcodes_int8, top_k, 10);

#linear_scann_exact(euclidean_mat, query, view(X_tr_vecs,:,best_ids))
distances_candidates_expanded = linear_scann_exact(euclidean_mat2, query, X_tr_vecs[:,best_ids]);
permutation_expanded = sortperm(distances_candidates_expanded)[1:top_k];
@show recall(best_ids[permutation_expanded], query_true_neighbors, top_k);

recall(best_ids[permutation_expanded], query_true_neighbors, top_k) = 0.38


In [32]:
function compute_mean_recall_extra(X, PQcodes_shared, true_neighbors, P_shared, extra_factor, top_k::Int, dist::Function)

    n_features, n_examples = size(X)
    PQcodes_int8 = Int16.(PQcodes_shared);
    average = 0.
    for query_id in 1:n_examples
        query = X[:, query_id]
        query_code = encode_shared(euclidean, query, P_shared)
        query_code = Int16.(vec(query_code))
        best_ids = linear_scann_exact_pq(dist, query_code, PQcodes_int8, top_k, extra_factor);
        distances_candidates_expanded = linear_scann_exact(euclidean_mat2, query, X_tr_vecs[:,best_ids]);
        permutation_expanded = sortperm(distances_candidates_expanded)[1:top_k]
        r = recall(best_ids[permutation_expanded], true_neighbors[:,query_id], top_k);
        average += r;
        end
    return average / n_examples
    end

extra_factor = 10
top_k = 100
avg_recall = compute_mean_recall_extra(X_te_vecs[:, 1:100], PQcodes_shared, true_neighbors, P_shared, extra_factor, top_k, abs_dist) 

0.4922000000000001

The final recall is similar including the finetunning.

In [33]:
@inline function Euclidean0(x, query)
    res = zero(eltype(x))
    @inbounds @fastmath  for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return sqrt(res)
end

function linear_scann_exact(dist, query, X)

    n_features, n_examples = size(X)
    distances = Array{Float32}(undef, n_examples)
    
    @inbounds for j in 1:n_examples
        distances[j] = dist(query, view(X,:,j))    
    end
    return distances
end


linear_scann_exact (generic function with 1 method)

In [34]:
exact_distances = linear_scann_exact(Euclidean0, query, X_tr_vecs)
top_k_pq_l1 = sortperm(exact_distances)[1:top_k];

@show recall(top_k_pq_l1, query_true_neighbors, top_k);

recall(top_k_pq_l1, query_true_neighbors, top_k) = 0.37


Notice that we do not get perfect recall in exact search when using l1 distance

## Standard Quantization with ADC table

Given the poor recall results obtained with the previous approach, we apply standard quantization to our dataset to see if we get improvements:

In [3]:
### Function definition

function compute_ADC(query, prototypes, dist)
    n_clusters, n_features = size(P)
    
    ADC_table = Array{Float32}(undef, n_clusters, n_features)
    
    for j in 1:n_features
        for p in 1:n_clusters
            ADC_table[p,j] = dist(query[j], @view prototypes[p,j] )
        end
    end
    return  ADC_table
end

function SEuclidean0(x, query)
    res = zero(eltype(x))
    @inbounds   for j in eachindex(x)
        aux = (query[j] - x[j])
        res += aux * aux
    end
    return res
end

@inline function euclidean(x::T,y::T) where {T}
    aux::T = (x - y)
    return sqrt(aux * aux)
end


function find_closest_coordinate(dist, 
                                 value::T,
                                 vector) where T
    best_coordinate = 1
    min_distance::T = typemax(T)
    for (j,x) in enumerate(vector)
        current_dist = dist(x, value)
        if current_dist < min_distance
            best_coordinate = j
            min_distance = current_dist
        end
    end
    return best_coordinate
end

function encode(dist, vector, prototypes)
    closest_prototypes = Array{Int8}(undef, n_features, 1);
    @inbounds for (j,x) in enumerate(vector)
        closest_prototypes[j] = find_closest_coordinate(dist, x, prototypes[:,j])
    end
    return closest_prototypes
end

function adc_dist(query_code, x_code,  adc_table)
    res = zero(eltype(adc_table))
    @inbounds @fastmath for j in eachindex(x_code)
        res+= adc_table[x_code[j], j]
    end
    return res
end

function linear_scann(query, PQcodes, adc_table, P)
    
    query_code = encode(sqeuclidean, query, P)
    n_features, n_examples = size(PQcodes)
    distances = Array{eltype(query)}(undef, n_examples)
    
    @inbounds @fastmath for j in 1:n_examples
        distances[j] = adc_dist(query_code,  view(PQcodes,:,j) ,  adc_table)    
    end
    return distances
end

### Loading the dataset


path = joinpath(homedir(), "TFM", "ann-benchmarks", "glove-100-angular.hdf5")
f = h5open(path, "r")

X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
true_neighbors = read(f["neighbors"])
true_distances = read(f["distances"])

true_neighbors .= true_neighbors .+ 1;

n_features, n_examples = size(X_tr_vecs)

#Feature clusterization

n_clusters = 32
R_per_feature = []
for j in  1:n_features
    println("Feature ", j)
    R = kmeans(X_tr_vecs[[j],:], n_clusters; maxiter=200)
    push!(R_per_feature, R)
end

#Encoding vectors
P = hcat([r.centers[:] for r in R_per_feature]...);
PQcodes = Array{Int32}(undef, n_features, n_examples);
for j in 1:n_examples
    PQcodes[:,j] = encode(euclidean, X_tr_vecs[:,j] , P)  
end

# Linearscann +  recall
query_id = 1
query = X_te_vecs[:,query_id];
query_true_neighbors = true_neighbors[:,query_id]
top_k = 100


adc_table = compute_ADC(query, P, SEuclidean0);
distances = linear_scann(query, PQcodes, adc_table, P)
top_k_pq = sortperm(distances)[1:top_k];
@show recall(top_k_pq, query_true_neighbors, top_k);

Feature 1
Feature 

2


Feature 3
Feature 

4


Feature 5
Feature 

6


Feature 7
Feature 

8


Feature 9
Feature 

10


Feature 11
Feature 

12


Feature 13
Feature 

14


Feature 15
Feature 

16


Feature 17
Feature 

18


Feature 19
Feature 

20


Feature 21
Feature 

22


Feature 23
Feature 

24


Feature 25
Feature 

26


Feature 27
Feature 

28


Feature 29
Feature 

30


Feature 31
Feature 

32


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 33
Feature 

34


Feature 35
Feature 

36


Feature 37
Feature 

38


Feature 39
Feature 

40


Feature 41
Feature 

42


Feature 43
Feature 

44


Feature 45
Feature 

46


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 47
Feature 

48


Feature 49
Feature 

50


Feature 51
Feature 

52


Feature 53
Feature 

54


Feature 55
Feature 

56


Feature 57
Feature 

58


Feature 59
Feature 

60


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 61
Feature 

62


Feature 63
Feature 

64


Feature 65
Feature 

66


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 67
Feature 

68


Feature 69


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 70


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 71
Feature 

72


Feature 73


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 74
Feature 

75


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 76
Feature 

77


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 78


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 79
Feature 

80


Feature 81
Feature 

82


Feature 83


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 84
Feature 

85


Feature 86
Feature 

87


Feature 88


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 89


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 90
Feature 

91


Feature 92


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 93


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


Feature 94
Feature 

95


Feature 96
Feature 

97


Feature 98
Feature 

99


Feature 100


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188
└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


└ @ Clustering /home/adegaray/.julia/packages/Clustering/tt9vc/src/kmeans.jl:188


recall(top_k_pq, query_true_neighbors, top_k) = 0.35


Same recall as before. How to apply cosine distance here??? Due to our specific type of quantization (1d subvectors), each element of the ADC table is calculated as the distance between 2 scalars, so there is no point on using any type of angular distance. 