In [7]:
using LoopVectorization, BenchmarkTools


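For simple loops like a dot product, LoopVectorization.jl's most important optimization is handling loop tails efficiently, i.e. the leftover iterations that remain when the array length is not a multiple of the SIMD vector width: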



In [8]:

"""
    mydot(a, b)

Return the sum of `a[i] * b[i]` over the shared indices of `a` and `b`.

The accumulator starts at `0.0`. `eachindex(a, b)` throws if the two arrays do not have
the same indices, which is what makes the `@inbounds` annotation safe. `@simd` lets the
compiler reorder the floating-point additions so the loop can be vectorized.
"""
function mydot(a, b)
    acc = 0.0
    @inbounds @simd for idx in eachindex(a, b)
        acc += a[idx] * b[idx]
    end
    return acc
end

"""
    mydotavx(a, b)

Return the sum of `a[i] * b[i]` over the shared indices of `a` and `b`, with the loop
handed to LoopVectorization's `@turbo` macro instead of `@inbounds @simd`.

The accumulator starts at `0.0`.
"""
function mydotavx(a, b)
    acc = 0.0
    @turbo for idx in eachindex(a, b)
        acc += a[idx] * b[idx]
    end
    return acc
end

# Two random Float64 vectors of length 256 used as dot-product inputs.
a = rand(256); b = rand(256);

In [4]:

# Time both dot products on the 256-element vectors. `$` interpolates the global
# variables into the benchmark so that global-variable lookup is not part of the timing.
@btime mydot($a, $b)

@btime mydotavx($a, $b) # performance is similar


  23.249 ns (0 allocations: 0 bytes)
  16.429 ns (0 allocations: 0 bytes)


58.46030998684942

In [5]:

# 255 is one element short of 256, so the length no longer splits evenly into SIMD chunks.
a = rand(255); b = rand(255);

@btime mydot($a, $b) # one element shorter: `@simd` now has leftover (remainder) iterations, and it is slow

@btime mydotavx($a, $b) # performance remains mostly unchanged.


  34.661 ns (0 allocations: 0 bytes)
  14.999 ns (0 allocations: 0 bytes)


71.31644264039625

Fancier loops, such as the nested loops of a matrix multiplication, can be optimized as well:

In [9]:
"""
    mygemm!(C, A, B)

Overwrite `C[m, n]` with `sum(A[m, k] * B[k, n] for k in axes(A, 2))` for every
`m ∈ axes(A, 1)` and `n ∈ axes(B, 2)`, using a plain triple loop. Returns `nothing`.

The arithmetic runs under `@fastmath`, so floating-point operations may be reordered.

# Throws
- `BoundsError`: if `C` does not contain the indices `(axes(A, 1), axes(B, 2))`, or if
  `B` does not contain the indices `(axes(A, 2), axes(B, 2))`.
"""
function mygemm!(C, A, B)
    # The loop nest runs under `@inbounds`, so every index it will touch is validated
    # once up front; otherwise mismatched shapes would read or write out of bounds.
    checkbounds(C, axes(A, 1), axes(B, 2))
    checkbounds(B, axes(A, 2), axes(B, 2))
    @inbounds @fastmath for m ∈ axes(A, 1), n ∈ axes(B, 2)
        Cmn = zero(eltype(C))
        for k ∈ axes(A, 2)
            Cmn += A[m, k] * B[k, n]
        end
        C[m, n] = Cmn
    end
    return nothing
end

mygemm! (generic function with 1 method)

In [10]:
"""
    mygemmavx!(C, A, B)

Overwrite `C[m, n]` with `sum(A[m, k] * B[k, n] for k in axes(A, 2))` for every
`m ∈ axes(A, 1)` and `n ∈ axes(B, 2)`, with the loop nest handed to LoopVectorization's
`@turbo` macro.

# Throws
- `BoundsError`: if `C` does not contain the indices `(axes(A, 1), axes(B, 2))`, or if
  `B` does not contain the indices `(axes(A, 2), axes(B, 2))`.
"""
function mygemmavx!(C, A, B)
    # `@turbo` does not bounds-check array accesses, so every index the loop nest will
    # touch is validated once up front instead of risking out-of-bounds memory access.
    checkbounds(C, axes(A, 1), axes(B, 2))
    checkbounds(B, axes(A, 2), axes(B, 2))
    @turbo for m ∈ axes(A, 1), n ∈ axes(B, 2)
        Cmn = zero(eltype(C))
        for k ∈ axes(A, 2)
            Cmn += A[m, k] * B[k, n]
        end
        C[m, n] = Cmn
    end
end

mygemmavx! (generic function with 1 method)

In [11]:
# Matrix dimensions: A is M×K, B is K×N and the product C is M×N.
# NOTE(review): presumably chosen odd so the loops have SIMD remainders — confirm.
M, K, N = 191, 189, 171;

In [15]:
# Three uninitialized M×N output buffers, one for each implementation being compared.
C1 = Matrix{Float64}(undef, M, N);
C2 = similar(C1); 
C3 = similar(C1);
# Inputs with normally distributed random entries: A is M×K and B is K×N.
A = randn(M, K); 
B = randn(K, N);

In [16]:
# Benchmark the `@turbo` matrix multiply; the product is written into C1.
@benchmark mygemmavx!($C1, $A, $B)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  237.173 μs …  1.550 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     240.360 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   251.788 μs ± 34.283 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  [39m█[34m▆[39m[39m▄[39m▄[39m▄[39m▃[32m▂[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m█[39m█

In [17]:
# Benchmark the `@inbounds @fastmath` matrix multiply; the product is written into C2.
@benchmark mygemm!($C2, $A, $B)

BenchmarkTools.Trial: 765 samples with 1 evaluation.
 Range (min … max):  5.891 ms …  17.351 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     6.402 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   6.518 ms ± 665.667 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  [39m [39m▁[39m▁[39m▄[39m█[39m▄[39m▂[39m▃[39m▂[39m▁[34m▅[39m[39m▂[32m▁[39m[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▇[39m█[39m█[39m█[39m█[39m█[39m

In [18]:
using LinearAlgebra, Test


In [19]:
 # Both hand-written kernels must agree elementwise up to floating-point tolerance (`≈`).
 @test all(C1 .≈ C2)

[32m[1mTest Passed[22m[39m
  Expression: all(C1 .≈ C2)

In [21]:
# Restrict BLAS to one thread, then display the BLAS configuration.
# NOTE(review): presumably so `mul!` is compared single-threaded against the loops — confirm.
BLAS.set_num_threads(1); BLAS.get_config()

LinearAlgebra.BLAS.LBTConfig
Libraries: 
└ [ILP64] libopenblas64_.dylib

In [22]:
# Benchmark LinearAlgebra's in-place `mul!` on the same inputs; the product goes into C3.
@benchmark mul!($C3, $A, $B)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  298.538 μs … 737.886 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     306.343 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   327.389 μs ±  43.791 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  [39m█[39m▅[34m▅[39m[39m▄[39m▃[39m▃[39m▃[39m▃[39m▃[32m▃[39m[39m▃[39m▂[39m▂[39m▂[39m▂[39m▂[39m▂[39m▂[39m▂[39m▁[39m▂[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[34m█[3

In [23]:
# The `@turbo` result must also match the output of `mul!` up to floating-point tolerance.
@test all(C1 .≈ C3)

[32m[1mTest Passed[22m[39m
  Expression: all(C1 .≈ C3)