In [1]:
using Clustering
using BenchmarkTools

"""
    euclidean_mat(y, X, j)

Return the squared Euclidean distance between the vector `y` and column `j` of the
matrix `X`, i.e. the sum over `k` of `(X[k, j] - y[k])^2`. No square root is taken.

# Arguments
- `y`: vector to compare against.
- `X`: matrix whose column `j` is compared with `y`.
- `j`: index of the column of `X` to use.

# Throws
- `BoundsError` if `X` cannot be indexed with every index of `y` in column `j`.
"""
function euclidean_mat(y, X, j)
    # The loop below runs under `@inbounds`, so validate the index range once up front.
    @boundscheck checkbounds(X, axes(y, 1), j)
    # Start from the promoted element type so the accumulator does not change type
    # when `X` and `y` have different element types.
    res = zero(promote_type(eltype(X), eltype(y)))
    @inbounds @fastmath @simd for k in eachindex(y)
        partial = X[k, j] - y[k]
        res += partial * partial
    end
    return res
end

"""
    transform(X, R::KmeansResult)

Assign every column of `X` to the nearest column of `R.centers`, where "nearest" means
the smallest squared Euclidean distance as computed by `euclidean_mat`.

# Arguments
- `X`: matrix with one example per column (`n_features × n_examples`).
- `R`: k-means result whose `centers` field holds one center per column.

# Returns
A `Vector{Int32}` of length `size(X, 2)`; entry `n` is the position of the center
closest to `X[:, n]`. An entry stays `0` if no distance compared below the initial
maximum (for example when there are no centers).

# Throws
- `DimensionMismatch` if `R.centers` and `X` do not have the same number of rows.
"""
function transform(X, R::KmeansResult)
    centers = R.centers
    n_features, n_examples = size(X)
    # The distance kernel indexes `X` without bounds checks, so validate shapes here.
    size(centers, 1) == n_features || throw(DimensionMismatch(
        "centers have $(size(centers, 1)) rows but X has $n_features rows"))
    dist_type = promote_type(eltype(X), eltype(centers))
    cluster_assignments = Vector{Int32}(undef, n_examples)
    for n in eachindex(cluster_assignments)
        min_dist = typemax(dist_type)
        cluster_assignment = Int32(0)
        for (j, c) in enumerate(eachcol(centers))
            dist = euclidean_mat(c, X, n)
            if dist < min_dist
                min_dist = dist
                # Convert explicitly so the variable keeps a single concrete type.
                cluster_assignment = Int32(j)
            end
        end
        cluster_assignments[n] = cluster_assignment
    end
    return cluster_assignments
end


# Benchmark problem size: feature dimension, number of examples, number of clusters.
n_features, n_examples, n_clusters = 128, 1_000_000, 256

# Random Float32 data laid out with one example per column.
X = rand(Float32, n_features, n_examples)
# Fit k-means for at most 10 iterations, printing per-iteration progress.
R = kmeans(X, n_clusters; maxiter=10, display=:iter)
# Time the assignment of every column of X to its nearest center.
@time transform(X, R)

  Iters               objv        objv-change | affected 
-------------------------------------------------------------
      0       1.577635e+07
      1       1.001412e+07      -5.762229e+06 |      256
      2       9.998556e+06      -1.556700e+04 |      256
      3       9.993770e+06      -4.786000e+03 |      256
      4       9.990310e+06      -3.460000e+03 |      256
      5       9.987529e+06      -2.781000e+03 |      256
      6       9.985262e+06      -2.267000e+03 |      256
      7       9.983358e+06      -1.904000e+03 |      256
      8       9.981742e+06      -1.616000e+03 |      256
      9       9.980356e+06      -1.386000e+03 |      256
     10       9.979148e+06      -1.208000e+03 |      256
K-means terminated without convergence after 10 iterations (objv = 9.979148e6)
 10.754456 seconds (768.02 M allocations: 19.083 GiB, 13.75% gc time, 0.20% compilation time)


1000000-element Vector{Int32}:
 154
 153
 226
  58
  54
  72
  66
 232
 165
  37
  72
  81
 207
   ⋮
 256
  27
  87
 250
 218
   8
 227
 107
  69
 214
  91
  42

In [2]:
 using Base.Threads

In [3]:
# Show how many threads this Julia session has available for `Threads.@threads`.
nthreads()

10

In [4]:
using Clustering
using BenchmarkTools

"""
    euclidean_mat(y, X, j)

Return the squared Euclidean distance between the vector `y` and column `j` of the
matrix `X`, i.e. the sum over `k` of `(X[k, j] - y[k])^2`. No square root is taken.

# Arguments
- `y`: vector to compare against.
- `X`: matrix whose column `j` is compared with `y`.
- `j`: index of the column of `X` to use.

# Throws
- `BoundsError` if `X` cannot be indexed with every index of `y` in column `j`.
"""
function euclidean_mat(y, X, j)
    # The loop below runs under `@inbounds`, so validate the index range once up front.
    @boundscheck checkbounds(X, axes(y, 1), j)
    # Start from the promoted element type so the accumulator does not change type
    # when `X` and `y` have different element types.
    res = zero(promote_type(eltype(X), eltype(y)))
    @inbounds @fastmath @simd for k in eachindex(y)
        partial = X[k, j] - y[k]
        res += partial * partial
    end
    return res
end

"""
    transform(X, R::KmeansResult)

Assign every column of `X` to the nearest column of `R.centers`, where "nearest" means
the smallest squared Euclidean distance as computed by `euclidean_mat`. The loop over
the columns of `X` is split across threads with `Threads.@threads`.

# Arguments
- `X`: matrix with one example per column (`n_features × n_examples`).
- `R`: k-means result whose `centers` field holds one center per column.

# Returns
A `Vector{Int32}` of length `size(X, 2)`; entry `n` is the position of the center
closest to `X[:, n]`. An entry stays `0` if no distance compared below the initial
maximum (for example when there are no centers).

# Throws
- `DimensionMismatch` if `R.centers` and `X` do not have the same number of rows.
"""
function transform(X, R::KmeansResult)
    centers = R.centers
    n_features, n_examples = size(X)
    # The distance kernel indexes `X` without bounds checks, so validate shapes here.
    size(centers, 1) == n_features || throw(DimensionMismatch(
        "centers have $(size(centers, 1)) rows but X has $n_features rows"))
    dist_type = promote_type(eltype(X), eltype(centers))
    cluster_assignments = Vector{Int32}(undef, n_examples)
    # Each iteration writes only its own slot of the output, so iterations are independent.
    Threads.@threads for n in eachindex(cluster_assignments)
        min_dist = typemax(dist_type)
        cluster_assignment = Int32(0)
        for (j, c) in enumerate(eachcol(centers))
            dist = euclidean_mat(c, X, n)
            if dist < min_dist
                min_dist = dist
                # Convert explicitly so the variable keeps a single concrete type.
                cluster_assignment = Int32(j)
            end
        end
        cluster_assignments[n] = cluster_assignment
    end
    return cluster_assignments
end


# Single quotes delimit a one-character literal in Julia; an interpolated message
# needs a double-quoted string.
println("\nExecution with $(nthreads())")
# Benchmark problem size: feature dimension, number of examples, number of clusters.
n_features = 128
n_examples = 1_000_000
n_clusters = 256

# Random Float32 data laid out with one example per column.
X = rand(Float32, n_features, n_examples)
# Fit k-means for at most 10 iterations, printing per-iteration progress.
R = kmeans(X, n_clusters; maxiter=10, display=:iter)
# Time the threaded assignment of every column of X to its nearest center.
@time transform(X, R)

  Iters               objv        objv-change | affected 
-------------------------------------------------------------
      0       1.585152e+07
      1       1.001047e+07      -5.841051e+06 |      256
      2       9.998334e+06      -1.213700e+04 |      256
      3       9.993661e+06      -4.673000e+03 |      256
      4       9.990203e+06      -3.458000e+03 |      256
      5       9.987422e+06      -2.781000e+03 |      256
      6       9.985127e+06      -2.295000e+03 |      256
      7       9.983198e+06      -1.929000e+03 |      256
      8       9.981552e+06      -1.646000e+03 |      256
      9       9.980137e+06      -1.415000e+03 |      256
     10       9.978900e+06      -1.237000e+03 |      256
K-means terminated without convergence after 10 iterations (objv = 9.9789e6)
  6.282619 seconds (767.95 M allocations: 19.080 GiB, 18.74% gc time, 0.60% compilation time)


1000000-element Vector{Int32}:
 212
  75
 173
   3
 155
 186
 136
 125
 109
 135
  76
 252
 185
   ⋮
 106
 252
 188
  47
  53
  16
  97
  77
 210
  12
 146
 241