# Performance Checks

## `argmin`

Test the performance of `argmin` against "by hand" loops to find the minimum value in an array

In [20]:
using BenchmarkTools
using LoopVectorization

### Plain top level container

In [21]:
v = rand(Float64, 1000);

In [22]:
typeof(v)

Vector{Float64}[90m (alias for [39m[90mArray{Float64, 1}[39m[90m)[39m

In [23]:
a::Int = 0
@benchmark begin
    a = argmax(v)
    # println(a)
end

BenchmarkTools.Trial: 10000 samples with 7 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m4.554 μs[22m[39m … [35m 17.208 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m4.577 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m4.811 μs[22m[39m ± [32m421.508 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m█[34m▃[39m[39m [39m [39m [39m [39m [39m▆[39m▁[32m [39m[39m [39m [39m▅[39m▁[39m [39m [39m [39m▁[39m [39m▁[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m▆[39m▅[39m▆[3

In [24]:
@code_typed argmax(v)

CodeInfo(
[90m1 ──[39m %1  = Base.arraysize(A, 1)[36m::Int64[39m
[90m│   [39m %2  = Base.slt_int(%1, 0)[36m::Bool[39m
[90m│   [39m %3  = Core.ifelse(%2, 0, %1)[36m::Int64[39m
[90m│   [39m %4  = Base.slt_int(%3, 0)[36m::Bool[39m
[90m│   [39m %5  = Core.ifelse(%4, 0, %3)[36m::Int64[39m
[90m│   [39m %6  = Base.slt_int(%5, 1)[36m::Bool[39m
[90m└───[39m       goto #3 if not %6
[90m2 ──[39m       goto #4
[90m3 ──[39m       goto #4
[90m4 ┄─[39m %10 = φ (#2 => true, #3 => false)[36m::Bool[39m
[90m│   [39m %11 = φ (#3 => 1)[36m::Int64[39m
[90m│   [39m %12 = φ (#3 => 1)[36m::Int64[39m
[90m│   [39m %13 = φ (#2 => true)[36m::Bool[39m
[90m└───[39m       goto #5
[90m5 ──[39m       goto #7 if not %10
[90m6 ──[39m       goto #8
[90m7 ──[39m %17 = Base.arrayref(true, A, %11)[36m::Float64[39m
[90m└───[39m       goto #8
[90m8 ┄─[39m %19 = φ (#6 => %13, #7 => false)[36m::Bool[39m
[90m│   [39m %20 = φ (#7 => %17)[36m::Float64[39m
[90m│   [

As the previous result is so fast, see if varying the size of the array in the loop is what causes the dramatic slowdown that I observe in my own code

In [33]:
@benchmark begin
    setup=(v = rand(Float64, 1000))
    while length(v) > 0
        a = argmin(v)
        deleteat!(v, a)
        # pop!(v)
    end
end

BenchmarkTools.Trial: 2086 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.325 ms[22m[39m … [35m 2.714 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.389 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.397 ms[22m[39m ± [32m42.246 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m▂[39m▄[39m▁[39m▆[39m▄[39m▆[39m▄[39m▇[34m█[39m[39m▅[32m▃[39m[39m▄[39m▄[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▃[39m▄[39m▄[39m▄[39m▆[39m▇[39m█

In [27]:
v = rand(Float64, 1000)
@benchmark begin
    vmax = 0.0
    imax = 0
    for (i, val) in enumerate(v)
        if val>vmax
            vmax=val
            imax=i
        end
    end
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m62.917 μs[22m[39m … [35m 1.833 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 95.45%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m65.583 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m71.619 μs[22m[39m ± [32m85.110 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m6.21% ±  5.02%

  [39m [39m█[39m [39m [39m▂[39m [39m [39m [34m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m█[39m▆[39m▅[39m█[39m▄

Fast minimum finder, using loop vectorisation (this is the fastest, right?)

In [32]:
@benchmark begin
    setup=(v = rand(Float64, 1000))
    while length(v) > 0
        vmin = typemax(Float64)
        imin = 0
        @turbo for i ∈ eachindex(v)
            newmin = v[i] < vmin
            vmin = newmin ? v[i] : vmin
            imin = newmin ? i : imin
        end
        deleteat!(v, imin)
    end
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m128.500 μs[22m[39m … [35m230.166 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m131.166 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m134.344 μs[22m[39m ± [32m  7.358 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m▂[39m█[39m▃[39m [39m [34m [39m[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m█[39m█[3

### Container in an immutable struct

In [9]:
struct ImmutableBox
    v::Vector{Float64}
    a::Vector{Float64}
    b::Vector{Float64}
    c::Float64
end


In [10]:
imute = ImmutableBox(rand(Float64, 1000), rand(Float64, 1000), rand(Float64, 1000), 42)

ImmutableBox([0.7852649776564672, 0.08007834916926904, 0.0019378912961869332, 0.3294635095685463, 0.710876626855232, 0.03655727329116887, 0.8368889272137412, 0.40920768024629306, 0.37679892401869797, 0.18928419721655443  …  0.32629479999342303, 0.018424125832673055, 0.6857664081701026, 0.8037823632529453, 0.855559642599393, 0.22739556879755218, 0.5394034859993598, 0.8086541688586756, 0.23381947067118358, 0.4126994820821538], [0.4002270191222379, 0.6278645477164089, 0.43477789773188447, 0.05845714664235435, 0.014181850232540039, 0.6095619749024112, 0.43210651373944753, 0.2379141137836588, 0.6493261836738421, 0.9794316093158966  …  0.5331530208234162, 0.09913757859745875, 0.23200388492921376, 0.47975352522809755, 0.23543266330319645, 0.1443234708455703, 0.9492791129444226, 0.9645776392444547, 0.3713072160866312, 0.7070034667495301], [0.7008084766950523, 0.7945237219274326, 0.21156532940045802, 0.9340936209411682, 0.8806617571544448, 0.1161875074915869, 0.7419697477734761, 0.1575274829768

In [11]:
@benchmark begin
    a::Int = 0
    a = argmin(imute.v)
end

BenchmarkTools.Trial: 10000 samples with 7 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m4.637 μs[22m[39m … [35m  7.905 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m4.667 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m4.791 μs[22m[39m ± [32m329.919 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m█[34m█[39m[39m▃[39m [39m [39m [32m [39m[39m [39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m▁[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m█[39m█[39m▇[3

In [12]:
@benchmark begin
    vmax = 0.0
    imax = 0
    for (i, val) in enumerate(imute.v)
        if val>vmax
            vmax=val
            imax=i
        end
    end
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m60.500 μs[22m[39m … [35m 1.603 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 94.64%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m61.958 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m67.008 μs[22m[39m ± [32m69.087 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m5.30% ±  4.90%

  [39m▅[39m█[39m▅[39m▃[39m▇[34m▇[39m[39m▄[39m▃[39m [39m [39m [39m▁[39m▃[39m▃[39m▂[39m▃[39m▃[39m▃[39m▃[39m▂[39m▁[39m▁[32m [39m[39m [39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▂[39m▁[39m▁[39m [39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[39m█[39m█[34m█

### Container in an mutable struct

In [13]:
struct MutableBox
    v::Vector{Float64}
    a::Vector{Float64}
    b::Vector{Float64}
    c::Float64
end

In [14]:
mmute = MutableBox(rand(Float64, 1000), rand(Float64, 1000), rand(Float64, 1000), 42)

MutableBox([0.519633787012468, 0.3486207233469808, 0.8054483777512783, 0.3900382630632965, 0.8105390664557072, 0.6565281700899827, 0.2688797822938934, 0.9607026505645238, 0.20291561672771485, 0.3946203610226229  …  0.814443603658637, 0.5711738744398518, 0.08039399182837781, 0.251396158760979, 0.6882038904048347, 0.5566088989109352, 0.7462654286577912, 0.4730980823918238, 0.823744012905345, 0.8239927264595112], [0.47516576779742503, 0.21575688250759728, 0.005685967838184203, 0.6421581648833953, 0.9809256939900968, 0.9710528423068715, 0.8614203902715024, 0.42522996146322267, 0.44403984784724915, 0.15642055414568778  …  0.4623319849914367, 0.27385601303883056, 0.7306482184848083, 0.019780459418902607, 0.6763242464673606, 0.9716905583535896, 0.6100590201069898, 0.9489812766007643, 0.14557815594514467, 0.2860250074310653], [0.7701376407297199, 0.1810317289099408, 0.33380204120790613, 0.5628488835108562, 0.8174761276479636, 0.5625813741115501, 0.3812383598026875, 0.5283987700057734, 0.479281

In [15]:
@benchmark begin
    a::Int = 0
    a = argmin(mmute.v)
end

BenchmarkTools.Trial: 10000 samples with 7 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m4.637 μs[22m[39m … [35m  7.780 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m4.673 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m4.771 μs[22m[39m ± [32m306.963 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▇[34m█[39m[39m▅[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m█[39m█[39m▆[3

In [16]:
@benchmark begin
    vmax = 0.0
    imax = 0
    for (i, val) in enumerate(mmute.v)
        if val>vmax
            vmax=val
            imax=i
        end
    end
end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m59.500 μs[22m[39m … [35m 1.581 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 94.94%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m60.958 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m65.932 μs[22m[39m ± [32m68.447 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m5.34% ±  4.90%

  [39m▄[39m█[39m▄[39m [39m▇[34m▇[39m[39m▃[39m▄[39m [39m [39m [39m [39m▁[39m▂[39m▂[39m▂[39m▃[39m▃[39m▃[39m▂[39m▁[39m [32m [39m[39m [39m [39m [39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▂[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[39m█[39m█[34m█