<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#High-perfomance-code-for-everybody" data-toc-modified-id="High-perfomance-code-for-everybody-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>High perfomance code for everybody</a></span><ul class="toc-item"><li><span><a href="#1)-Summing-a-series" data-toc-modified-id="1)-Summing-a-series-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>1) Summing a series</a></span></li><li><span><a href="#2)-Controlling-outliers-(or-clipping-gradients)" data-toc-modified-id="2)-Controlling-outliers-(or-clipping-gradients)-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>2) Controlling outliers (or clipping gradients)</a></span></li><li><span><a href="#3)-Computing-a-quantity-depending-on-a-&quot;custom-criteria&quot;" data-toc-modified-id="3)-Computing-a-quantity-depending-on-a-&quot;custom-criteria&quot;-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>3) Computing a quantity depending on a "custom criteria"</a></span></li><li><span><a href="#4)-L2-norm" data-toc-modified-id="4)-L2-norm-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>4) L2 norm</a></span></li><li><span><a href="#Notes-on-SIMD" data-toc-modified-id="Notes-on-SIMD-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Notes on SIMD</a></span></li></ul></li></ul></div>

# High perfomance code for everybody 
#### (Tell those C, Fortran bullies to go away)
#### (Even better, show them you code runs as fast as their code )
#### (Disclaimer -> Usual response: "this benchmark is not representative" )

In [70]:
using BenchmarkTools

## 1) Summing a series 

In [71]:
function one_over_n(n)
    aux = 0.
    for i in 1:n
        aux += 1/i
    end
    return aux
end

one_over_n (generic function with 1 method)

In [72]:
@btime aux = one_over_n(10^6)

  911.235 μs (0 allocations: 0 bytes)


14.392726722864989

In [73]:
one_over_n(10^6)

14.392726722864989

In [5]:
#@code_native one_over_n(100)

In [74]:
@fastmath function one_over_n2(n)
    aux = 0.
    
    for i in 1:n
        aux += 1/i
    end
    return aux
end

one_over_n2 (generic function with 1 method)

In [75]:
@btime aux = one_over_n2(10^6)

  569.550 μs (0 allocations: 0 bytes)


14.392726722865742

## 2) Controlling outliers (or clipping gradients)

In [95]:
function clip!(x, a=0, b= 1)
    for i in 1:length(x)
        if x[i] < a
            x[i] = a
        elseif x[i] > b
            x[i] = b
        end
    end
end

clip! (generic function with 3 methods)

In [96]:
x = randn(10^7);

In [97]:
@btime clip!(x,0,1)

  25.674 ms (0 allocations: 0 bytes)


We can also use 'vectorized expressions'

In [98]:
@inbounds function clip_vectorized!(x, a=0, b= 1)
    x[x.>1.] .= a
    x[x.<0.] .= b
end

clip_vectorized! (generic function with 3 methods)

In [99]:
@btime clip_vectorized!(x,0,1)

  11.479 ms (11 allocations: 2.39 MiB)


0-element view(::Vector{Float64}, Int64[]) with eltype Float64

#### maybe in float32 this is faster

In [100]:
function clip_float32!( x::Array{Float32}, a=Float32(0.), b=Float32( 1.))
    
   @inbounds for i in 1:length(x)
        if x[i] < a
            x[i] = a
        elseif x[i] > b
            x[i] = b
        end
    end
end

clip_float32! (generic function with 3 methods)

In [101]:
x = randn(Float32, 10^7);

In [102]:
@btime clip_float32!(x)

  2.231 ms (0 allocations: 0 bytes)


#### Do you know (because you are a good programmer) that you will acces only "acceptable" positions  in the array?
#### Tell the compiler! -> @inbounds

In [103]:
function clip2!(x::Array{T}, a::T=T(0.), b::T= T(1.)) where T<: Number
    @inbounds for i in 1:length(x)
        if x[i] < a
            x[i] = a
        elseif x[i] > b
            x[i] = b
        end
    end
end

clip2! (generic function with 3 methods)

In [104]:
x = randn(Float32, 10^7);

In [105]:
@btime clip2!(x)

  2.262 ms (0 allocations: 0 bytes)


In [101]:
x = randn(Float32, 10^9);

In [102]:
@btime clip2!(x)

  238.255 ms (0 allocations: 0 bytes)


## 3) Computing a quantity depending on a "custom criteria"

In [106]:
@fastmath function compute_taxes(salary::T) where T<:Number
    if salary < 18000
        return salary * 0.2
    elseif  18000 < salary < 25000
        return salary * 0.3
    elseif   25000 < salary < 70000
        return salary * 0.4
    else
        return salary * 0.5
    end
end        

compute_taxes (generic function with 1 method)

In [107]:
salaries = Array(round.(rand(10^6,1)* 100000));

In [108]:
@btime taxes = [compute_taxes(x) for x in salaries ];

  877.168 μs (3 allocations: 7.63 MiB)


## 4) L2 norm

In [109]:
len = 1_000_000;

x = randn(Float32, len);
y = randn(Float32, len);

In [110]:
@btime sum((x .- y).^2)/length(x)

  670.367 μs (12 allocations: 3.81 MiB)


1.9995135f0

#### For loop

In [170]:
function l2_squared(x::Array{T},y::Array{T}) where T <:Number
    norm = T(0.)
    for i in 1:length(x)
        norm = norm + (x[i] - y[i])^2
    end
    return norm/length(x)
end

l2_squared (generic function with 2 methods)

In [171]:
@btime l2_squared(x,y)

  921.672 μs (1 allocation: 16 bytes)


1.997657f0

#### SIMD

In [115]:
function l2_squared_inbounds_simd(x::Array{T},y::Array{T}) where T <:Number
    norm = 0.
    n = length(x)
    @simd for i in 1:n
        @inbounds norm += (x[i] - y[i])^2
        end

    return norm/length(x)
end

l2_squared_inbounds_simd (generic function with 1 method)

In [116]:
@btime l2_squared_inbounds_simd(x,y)

  231.006 μs (1 allocation: 16 bytes)


1.9995134630243814

## Notes on SIMD

Sometimes you don't need to do anything to benefit from simd native code!

In [118]:
f(a,b) = (a[1]+b[1], a[2]+b[2], a[3]+b[3], a[4]+b[4])

f (generic function with 1 method)

In [119]:
f((1,2,3,4),(1,2,3,4))

(2, 4, 6, 8)

In [120]:
@code_llvm f((1,2,3,4),(1,2,3,4))

[90m;  @ In[118]:1 within `f`[39m
[95mdefine[39m [36mvoid[39m [93m@julia_f_3242[39m[33m([39m[33m[[39m[33m4[39m [0mx [36mi64[39m[33m][39m[0m* [95mnoalias[39m [95mnocapture[39m [95msret[39m[33m([39m[33m[[39m[33m4[39m [0mx [36mi64[39m[33m][39m[33m)[39m [0m%0[0m, [33m[[39m[33m4[39m [0mx [36mi64[39m[33m][39m[0m* [95mnocapture[39m [95mnonnull[39m [95mreadonly[39m [95malign[39m [33m8[39m [95mdereferenceable[39m[33m([39m[33m32[39m[33m)[39m [0m%1[0m, [33m[[39m[33m4[39m [0mx [36mi64[39m[33m][39m[0m* [95mnocapture[39m [95mnonnull[39m [95mreadonly[39m [95malign[39m [33m8[39m [95mdereferenceable[39m[33m([39m[33m32[39m[33m)[39m [0m%2[33m)[39m [0m#0 [33m{[39m
[91mtop:[39m
[90m; ┌ @ int.jl:87 within `+`[39m
   [0m%3 [0m= [96m[1mbitcast[22m[39m [33m[[39m[33m4[39m [0mx [36mi64[39m[33m][39m[0m* [0m%1 [95mto[39m [33m<[39m[33m4[39m [0mx [36mi64[39m[33m>[39m[0m*
   [0m%4

#### another example

In [60]:
function my_sum(x::Vector{T}) where {T}
    s = zero(T)
    @simd for x_k in x
        s += x_k
    end
    return s
end

my_sum (generic function with 2 methods)

In [66]:
x = [1,2,3,4,5,6,7,8]
#x = Float64.(x)
@code_llvm my_sum(x)

[90m;  @ In[60]:1 within `my_sum`[39m
[95mdefine[39m [36mi64[39m [93m@julia_my_sum_2389[39m[33m([39m[33m{[39m[33m}[39m[0m* [95mnonnull[39m [95malign[39m [33m16[39m [95mdereferenceable[39m[33m([39m[33m40[39m[33m)[39m [0m%0[33m)[39m [0m#0 [33m{[39m
[91mtop:[39m
[90m;  @ In[60]:3 within `my_sum`[39m
[90m; ┌ @ simdloop.jl:71 within `macro expansion`[39m
[90m; │┌ @ simdloop.jl:51 within `simd_inner_length`[39m
[90m; ││┌ @ array.jl:215 within `length`[39m
     [0m%1 [0m= [96m[1mbitcast[22m[39m [33m{[39m[33m}[39m[0m* [0m%0 [95mto[39m [33m{[39m [36mi8[39m[0m*[0m, [36mi64[39m[0m, [36mi16[39m[0m, [36mi16[39m[0m, [36mi32[39m [33m}[39m[0m*
     [0m%2 [0m= [96m[1mgetelementptr[22m[39m [95minbounds[39m [33m{[39m [36mi8[39m[0m*[0m, [36mi64[39m[0m, [36mi16[39m[0m, [36mi16[39m[0m, [36mi32[39m [33m}[39m[0m, [33m{[39m [36mi8[39m[0m*[0m, [36mi64[39m[0m, [36mi16[39m[0m, [36mi16[39m[0m, 

   [0m%value_phi2 [0m= [96m[1mphi[22m[39m [36mi64[39m [33m[[39m [33m0[39m[0m, [91m%top[39m [33m][39m[0m, [33m[[39m [0m%19[0m, [91m%middle.block[39m [33m][39m[0m, [33m[[39m [0m%22[0m, [91m%L10[39m [33m][39m
[90m; └[39m
[90m;  @ In[60]:6 within `my_sum`[39m
  [96m[1mret[22m[39m [36mi64[39m [0m%value_phi2
[33m}[39m


In [62]:
code_llvm(my_sum, Tuple{Vector{Int}}, debuginfo=:none)

[95mdefine[39m [36mi64[39m [93m@julia_my_sum_2330[39m[33m([39m[33m{[39m[33m}[39m[0m* [95mnonnull[39m [95malign[39m [33m16[39m [95mdereferenceable[39m[33m([39m[33m40[39m[33m)[39m [0m%0[33m)[39m [0m#0 [33m{[39m
[91mtop:[39m
  [0m%1 [0m= [96m[1mbitcast[22m[39m [33m{[39m[33m}[39m[0m* [0m%0 [95mto[39m [33m{[39m [36mi8[39m[0m*[0m, [36mi64[39m[0m, [36mi16[39m[0m, [36mi16[39m[0m, [36mi32[39m [33m}[39m[0m*
  [0m%2 [0m= [96m[1mgetelementptr[22m[39m [95minbounds[39m [33m{[39m [36mi8[39m[0m*[0m, [36mi64[39m[0m, [36mi16[39m[0m, [36mi16[39m[0m, [36mi32[39m [33m}[39m[0m, [33m{[39m [36mi8[39m[0m*[0m, [36mi64[39m[0m, [36mi16[39m[0m, [36mi16[39m[0m, [36mi32[39m [33m}[39m[0m* [0m%1[0m, [36mi64[39m [33m0[39m[0m, [36mi32[39m [33m1[39m
  [0m%3 [0m= [96m[1mload[22m[39m [36mi64[39m[0m, [36mi64[39m[0m* [0m%2[0m, [95malign[39m [33m8[39m
  [0m%.not [0m= [96m[1micmp