# Read SIFT 1 million 

In [1]:
path = joinpath(homedir(), "Datasets", "SIFT1M",
    "sift-128-euclidean.hdf5")

"/Users/davidbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5"

In [2]:
using HDF5

In [3]:
f = h5open(path, "r")

🗂️ HDF5.File: (read-only) /Users/davidbuchaca/Datasets/SIFT1M/sift-128-euclidean.hdf5
├─ 🏷️ distance
├─ 🔢 distances
├─ 🔢 neighbors
├─ 🔢 test
└─ 🔢 train

In [4]:
X_tr_vecs = read(f["train"])
X_te_vecs = read(f["test"]);
neighbors = read(f["neighbors"])
distances = read(f["distances"])

@show size(X_tr_vecs)
@show size(X_te_vecs)
@show size(neighbors)
@show size(distances)

size(X_tr_vecs) = (128, 1000000)
size(X_te_vecs) = (128, 10000)
size(neighbors) = (100, 10000)
size(distances) = (100, 10000)


(100, 10000)

### Mean Squared Error

In [5]:
using Statistics

In [152]:
function MSE(X, query)
    d = (query .- X) .* (query .- X)
    res = d /length(query);
    return sum(res, dims=1)
end

MSE (generic function with 1 method)

In [173]:
function MSE_2(X, query)
    n_features, n_examples = size(X)
    result = zeros(n_examples)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            aux = (query[j] .- X[j,m])
            res += aux * aux
        end
        result[m] = res/n_features
    end
    return result
end

MSE_2 (generic function with 1 method)

In [174]:
query = X_te_vecs[:,1];

In [175]:
using BenchmarkTools

In [179]:
@benchmark MSE(X_te_vecs, query)

BenchmarkTools.Trial: 707 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.326 ms[22m[39m … [35m52.709 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 84.70%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m6.383 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m7.055 ms[22m[39m ± [32m 6.344 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m15.75% ± 14.95%

  [39m▅[39m▃[39m [39m▂[39m▁[39m▇[34m█[39m[32m▆[39m[39m▃[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m█[39m█[39m█[34m█[3

In [180]:
@benchmark MSE_2(X_te_vecs, query)

BenchmarkTools.Trial: 3442 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.161 ms[22m[39m … [35m  2.891 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.365 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.440 ms[22m[39m ± [32m220.855 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▆[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m█[34m [39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m▅[39m▂[39m▁[39m▂[39m▃[39

In [None]:
@benchmark MSE(X_tr_vecs, query)

In [None]:
@benchmark MSE_2(X_tr_vecs, query)

In [9]:
function top_k(X, query)
    distances = MSE(X, query)
    top_k_indices = sortperm(mse_dists)
    return top_k_indices
end

top_k (generic function with 1 method)

In [11]:
@benchmark top_k(X_te_vecs, query)

BenchmarkTools.Trial: 598 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.864 ms[22m[39m … [35m60.283 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 84.85%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m7.485 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m8.387 ms[22m[39m ± [32m 6.359 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m13.78% ± 15.13%

  [39m [39m [39m [39m [39m [39m [39m█[34m▁[39m[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▄[39m▄[39m▃[39m▃[39m▃[39m█[39m█[3

In [12]:
size(X_te_vecs)

(128, 10000)

In [13]:
@benchmark top_k(X_tr_vecs, query)

BenchmarkTools.Trial: 6 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m727.858 ms[22m[39m … [35m991.942 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.23% … 14.20%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m825.413 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 9.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m841.582 ms[22m[39m ± [32m107.257 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m10.12% ±  5.80%

  [39m█[39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[

In [14]:
function top_k_2(X, query, k )
    distances = MSE(X, query)
    top_k_indices = partialsortperm(mse_dists, 1:k)
    return top_k_indices
end

top_k_2 (generic function with 1 method)

In [15]:
@benchmark MSE(X_tr_vecs, query)

BenchmarkTools.Trial: 7 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m741.655 ms[22m[39m … [35m879.469 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.20% … 17.83%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m817.105 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m8.89%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m805.183 ms[22m[39m ± [32m 48.260 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m9.57% ±  5.45%

  [39m█[39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m█[39m [39m [39m█[39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[39m

In [16]:
function MSE(X, query)
    d = (query .- X) .* (query .- X)
    res = d /mean(d);
    return sum(res, dims=1)
end

function MSE_2(X, query)
    result = zeros(size(X, 2))
    n_features, n_examples = size(X)
    for m in 1:n_examples
        res = zero(eltype(X))
        for j in 1:n_features
            res += (query[j] .- X[j,m]) * (query[j] - X[j,m])
        end
        result[m] = res/n_features
    end
    return result
end

MSE_2 (generic function with 1 method)

In [17]:
 MSE(X_te_vecs, query)

1×10000 Matrix{Float32}:
 0.0  63.749  179.758  52.4943  …  161.907  64.9257  127.166  96.1275

In [30]:
distances[:,1] 

100-element Vector{Float32}:
 232.87122
 234.71472
 243.98976
 255.46037
 256.31427
 258.86288
 261.24127
 264.28015
 267.2845
 268.06903
 270.82098
 271.17706
 271.25818
   ⋮
 288.1302
 288.151
 288.23428
 288.4025
 288.47357
 288.49435
 288.62952
 288.63992
 288.68842
 288.75076
 288.81656
 288.90482

In [36]:
MSE_2(X_te_vecs, query)

10000-element Vector{Float64}:
    0.0
 1157.09375
 3262.75
  952.8125
 2840.6484375
 1564.7421875
  949.953125
 2060.0
 2916.0234375
 2930.375
 1766.53125
 3001.5390625
 1457.4453125
    ⋮
 1989.6640625
 2857.4453125
 1517.515625
 1574.015625
 1944.9453125
 1570.890625
 1905.3125
 3282.9296875
 2938.734375
 1178.453125
 2308.15625
 1744.7890625

In [20]:
MSE_2(X_te_vecs, query)

10000-element Vector{Float64}:
    0.0
 1157.09375
 3262.75
  952.8125
 2840.6484375
 1564.7421875
  949.953125
 2060.0
 2916.0234375
 2930.375
 1766.53125
 3001.5390625
 1457.4453125
    ⋮
 1989.6640625
 2857.4453125
 1517.515625
 1574.015625
 1944.9453125
 1570.890625
 1905.3125
 3282.9296875
 2938.734375
 1178.453125
 2308.15625
 1744.7890625

In [21]:
 MSE(X_te_vecs, query)



1×10000 Matrix{Float32}:
 0.0  63.749  179.758  52.4943  …  161.907  64.9257  127.166  96.1275