# High performance code for everybody
#### (Tell those C, Fortran bullies to go away)
#### (Even better, show them your code runs as fast as their code)
#### (Disclaimer -> Usual response: "this benchmark is not representative" )

In [1]:
using BenchmarkTools

## 1) Summing a series 

In [20]:
"""
    one_over_n(n)

Return the partial harmonic sum `1/1 + 1/2 + ... + 1/n`, accumulated from the
first term to the last starting from `0.0`. The range `1:n` is empty when
`n < 1`, in which case `0.0` is returned.
"""
function one_over_n(n)
    total = 0.0
    for k in 1:n
        total = total + inv(k)
    end
    return total
end

one_over_n (generic function with 1 method)

In [21]:
#@code_llvm one_over_n(100)

In [22]:
@time aux = one_over_n(1000_000)

  0.013060 seconds (1.03 k allocations: 56.556 KiB)


14.392726722864989

In [4]:
@time aux = one_over_n(10^6)

  0.008208 seconds (186 allocations: 12.020 KiB)


14.392726722864989

In [5]:
"""
    one_over_n2(n)

Return the partial harmonic sum `1/1 + 1/2 + ... + 1/n`, with the accumulation
wrapped in `@fastmath`.

`@fastmath` relaxes strict IEEE semantics, so the last digits of the result may
differ from a plain left-to-right summation. Returns `0.0` when `n < 1`.
"""
function one_over_n2(n)
    aux = 0.0
    for i in 1:n
        # `1./i` is ambiguous (literal `1.` vs. dotted operator `./`) and is a
        # syntax error on Julia 1.0+; spell the literal and operator separately.
        @fastmath aux += 1.0 / i
    end
    return aux
end

one_over_n2 (generic function with 1 method)

In [8]:
@time aux = one_over_n2(10^6)

  0.002902 seconds (6 allocations: 192 bytes)


14.392726722865742

## 2) Controlling outliers (or clipping gradients)

In [9]:
"""
    clip!(x, a=0, b=1)

Clamp every element of `x` in place: values below `a` are replaced by `a`,
values above `b` are replaced by `b`, everything else is left untouched.

The lower bound is tested first, so if `a > b` an element below `a` becomes `a`.
Returns `nothing`.
"""
function clip!(x, a=0, b=1)
    # `eachindex` yields the array's own valid indices, so this also works for
    # arrays whose indices do not start at 1.
    for i in eachindex(x)
        xi = x[i]
        if xi < a
            x[i] = a
        elseif xi > b
            x[i] = b
        end
    end
    return nothing
end

clip! (generic function with 3 methods)

In [10]:
x = randn(10^7);

In [16]:
@time clip!(x,0,1)

  0.012954 seconds (4 allocations: 160 bytes)


In [12]:
#x .>1.

In [14]:
@time begin
    # Masked clipping via logical indexing. Assigning one scalar to many
    # positions needs broadcast assignment (`.=`); plain `=` is an error on
    # Julia 1.0+.
    x[x .> 1.0] .= 1.0
    x[x .< 0.0] .= 0.0
end

  0.093905 seconds (12.49 k allocations: 3.058 MiB)


0.0

#### Do you know (because you are a good programmer) that you will access only "acceptable" positions in the array?
#### Tell the compiler! -> @inbounds

In [17]:
"""
    clip2!(x, a=0., b=1.)

Clamp every element of `x` in place to the interval `[a, b]`, with bounds
checking disabled inside the loop.

Values below `a` become `a`; otherwise values above `b` become `b`.
Returns `nothing`.
"""
function clip2!(x, a=0., b=1.)
    # Indices come from `eachindex(x)`, so they are valid for `x` whatever its
    # index range is; that is what makes `@inbounds` safe here.
    @inbounds for i in eachindex(x)
        xi = x[i]
        if xi < a
            x[i] = a
        elseif xi > b
            x[i] = b
        end
    end
    return nothing
end

clip2! (generic function with 3 methods)

In [19]:
x = randn(10^7);
@time clip2!(x)

  0.011296 seconds (4 allocations: 160 bytes)


#### maybe in float32 this is faster

In [27]:
"""
    clip_float32!(x::Array{Float32}, a=0f0, b=1f0)

In-place clamp of a `Float32` array: entries smaller than `a` are overwritten
with `a`, otherwise entries larger than `b` are overwritten with `b`.
Bounds checks are skipped inside the loop. Returns `nothing`.
"""
function clip_float32!(x::Array{Float32}, a=0f0, b=1f0)
    @inbounds for k in eachindex(x)
        v = x[k]
        if v < a
            x[k] = a
        elseif v > b
            x[k] = b
        end
    end
    return nothing
end

clip_float32! (generic function with 3 methods)

In [31]:
x = randn(10^7);
x = Array{Float32}(randn(10^7));

In [32]:
@time clip_float32!(x)

  0.005938 seconds (4 allocations: 160 bytes)


## 3) Computing a quantity depending on a "custom criterion"

In [33]:
"""
    compute_taxes(salary)

Return the tax owed on `salary` under a four-bracket flat-rate scheme:

| salary range         | rate |
|:---------------------|:-----|
| `salary < 18000`     | 20%  |
| `18000 ≤ s < 25000`  | 30%  |
| `25000 ≤ s < 70000`  | 40%  |
| `salary ≥ 70000`     | 50%  |

The rate applies to the whole salary (not marginally).
"""
function compute_taxes(salary)
    # Each branch only needs the upper bound: the lower bound is implied by the
    # previous test having failed. This also puts the exact boundary values
    # 18000 and 25000 in the bracket that starts there, rather than letting them
    # fall through to the 50% case.
    if salary < 18000
        return salary * 0.2
    elseif salary < 25000
        return salary * 0.3
    elseif salary < 70000
        return salary * 0.4
    else
        return salary * 0.5
    end
end

compute_taxes (generic function with 1 method)

In [34]:
salaries = Array{Int64}(round.(rand(10^6,1)* 100000));

In [39]:
@time taxes = [compute_taxes(x) for x in salaries ];

  0.032189 seconds (14.22 k allocations: 8.293 MB)


In [1222]:
# Double a `Float64`. The type annotation restricts this method to `Float64`
# arguments only.
function f(x::Float64)
    return 2 * x
end

f (generic function with 1 method)

In [1223]:
# Call the `f` defined in the previous cell (no `g` is defined in this notebook).
f(5.)

10.0

In [1227]:
f.([2.,3.,4.])

3-element Array{Float64,1}:
 4.0
 6.0
 8.0

In [1228]:
map(f,[2.,3.,4.])

3-element Array{Float64,1}:
 4.0
 6.0
 8.0

In [267]:
# Let us consider the worst case scenario: 
salaries = Array{Int64}(round.( [1_000_000 for x in 1:10^6]));

In [40]:
@time taxes = [compute_taxes.(x) for x in salaries ];

  0.038335 seconds (14.42 k allocations: 8.301 MB)


### 3.1) Allocating memory of taxes before

Make your code cache-friendly! (What, do you have 20 MB of cache on your chip?)


In [41]:
taxes =  Array{Float64}(zeros(10^6));

In [42]:
"""
    compute_taxes!(taxes, salaries)

Fill the preallocated `taxes` with the tax owed on each entry of `salaries`.

The `k`-th salary produced by iterating `salaries` is written to `taxes[k]`.
Rates: 20% below 18000, 30% from 18000 up to (not including) 25000, 40% from
25000 up to (not including) 70000, and 50% from 70000 upwards.
Returns `nothing`.
"""
function compute_taxes!(taxes, salaries)
    for (i, salary) in enumerate(salaries)
        # Only upper bounds are tested; the lower bound follows from the earlier
        # branches having failed. Boundary salaries (exactly 18000 or 25000)
        # therefore land in the bracket that starts there, not in the 50% case.
        if salary < 18000
            taxes[i] = salary * 0.2
        elseif salary < 25000
            taxes[i] = salary * 0.3
        elseif salary < 70000
            taxes[i] = salary * 0.4
        else
            taxes[i] = salary * 0.5
        end
    end
    return nothing
end

compute_taxes! (generic function with 1 method)

In [45]:
@time compute_taxes!(taxes, salaries)

  0.008840 seconds (4 allocations: 160 bytes)


In [44]:
@time Array{Float64}(zeros(10^6));

  0.001995 seconds (13 allocations: 7.630 MB)


### 3.2) Parallelizing computations using pmap

In [46]:
nprocs()

1

In [47]:
addprocs(3)

3-element Array{Int64,1}:
 2
 3
 4

In [1099]:
nprocs()

4

In [48]:
salaries = Array{Int64}(round.(rand(10^8,1)* 100000));

In [49]:
# Array method of `compute_taxes`, defined on every process via `@everywhere`
# so that it can be handed to `pmap`.
#
# Allocates and returns a `Float64` vector with one tax value per salary.
# Rates: 20% below 18000, 30% from 18000 up to (not including) 25000, 40% from
# 25000 up to (not including) 70000, and 50% from 70000 upwards.
@everywhere function compute_taxes(salaries::Array)
    taxes = zeros(length(salaries))

    for (i, salary) in enumerate(salaries)
        # Only upper bounds are tested, so the exact boundary values 18000 and
        # 25000 fall in the bracket that starts there instead of the 50% case.
        if salary < 18000
            taxes[i] = salary * 0.2
        elseif salary < 25000
            taxes[i] = salary * 0.3
        elseif salary < 70000
            taxes[i] = salary * 0.4
        else
            taxes[i] = salary * 0.5
        end
    end
    return taxes
end

In [50]:
n = length(salaries)
n_processors = nprocs()

4

In [51]:
@time taxes = [compute_taxes.(x) for x in salaries ];

  1.541745 seconds (14.42 k allocations: 763.611 MB, 3.98% gc time)


In [52]:
@time begin
# Start index of each chunk plus a final sentinel `n + 1`. Integer arithmetic
# keeps every boundary an exact `Int` even when `n` is not a multiple of
# `n_processors` (converting a fractional range step with `Int(x)` throws
# `InexactError`).
splits_ind = [1 + div(k * n, n_processors) for k in 0:n_processors]
# Chunk j covers splits_ind[j] : splits_ind[j+1] - 1.
salaries_array_splits = [salaries[x:y-1] for (x,y) in zip(splits_ind[1:end-1], splits_ind[2:end])]
res = pmap(compute_taxes, salaries_array_splits);
end

  9.757134 seconds (2.36 M allocations: 1.602 GB, 7.92% gc time)


4-element Array{Array{Float64,1},1}:
 [560.0,38323.0,19864.4,22562.0,41017.0,11195.6,42160.0,17138.8,2431.2,646.8  …  10930.0,7171.5,43781.5,731.0,39896.5,11030.8,7277.4,44105.5,45533.5,44511.0]     
 [43753.5,49345.0,45546.5,3072.4,7373.7,18070.8,6044.1,22417.6,37228.0,27215.6  …  728.0,6014.4,26492.0,42813.0,20387.6,40406.0,25297.6,17805.2,13121.6,14817.2]  
 [26897.6,22170.0,13570.4,215.4,3570.0,47587.0,13770.0,47917.0,47487.5,1106.2  …  2782.6,19195.6,19119.6,2796.2,24722.4,46706.5,21642.4,44148.0,45297.0,37669.5]  
 [19928.0,40097.5,3296.4,17265.6,41168.0,42896.5,26649.2,23357.2,11830.8,12434.4  …  2276.0,45668.0,12277.6,39909.5,372.8,12738.8,35806.5,15559.2,41534.0,44426.0]

## 4) L2 norm

In [53]:
len = 1000_000;

x = randn(len);
y = randn(len);

In [55]:
@time sum((x - y).^2)/length(x)

  0.165496 seconds (6.08 k allocations: 15.550 MB, 72.10% gc time)


1.9988731097019454

In [29]:
@benchmark sum((x - y).^2)/length(x)

BenchmarkTools.Trial: 
  memory estimate:  15.26 MiB
  allocs estimate:  31
  --------------
  minimum time:     5.391 ms (0.00% GC)
  median time:      8.729 ms (24.36% GC)
  mean time:        8.748 ms (23.47% GC)
  maximum time:     17.909 ms (26.64% GC)
  --------------
  samples:          564
  evals/sample:     1

#### For loop

In [56]:
"""
    l2_squared(x::Array{Float64}, y::Array{Float64})

Mean of the squared element-wise differences between `x` and `y`, i.e.
`sum((x[i] - y[i])^2) / length(x)`, computed with a plain bounds-checked loop
over the indices of `x`.
"""
function l2_squared(x::Array{Float64},y::Array{Float64})
    acc = 0.0
    for k in eachindex(x)
        d = x[k] - y[k]
        acc += d * d
    end
    return acc / length(x)
end

l2_squared (generic function with 1 method)

In [58]:
@time l2_squared(x,y)

  0.002203 seconds (5 allocations: 176 bytes)


1.998873109701994

#### Inbounds

In [59]:
"""
    l2_squared_inbounds(x::Array{Float64}, y::Array{Float64})

Mean of the squared element-wise differences between `x` and `y`, with bounds
checking disabled in the loop.

Throws `DimensionMismatch` if `x` and `y` do not have the same number of
elements. The check is what makes `@inbounds` safe: without it a shorter `y`
would be read past its end.
"""
function l2_squared_inbounds(x::Array{Float64},y::Array{Float64})
    n = length(x)
    n == length(y) ||
        throw(DimensionMismatch("x has $n elements but y has $(length(y))"))
    acc = 0.
    @inbounds for i in 1:n
        acc += (x[i] - y[i])^2
    end
    return acc/n
end

l2_squared_inbounds (generic function with 1 method)

In [62]:
@time l2_squared_inbounds(x,y)

  0.001414 seconds (5 allocations: 176 bytes)


1.998873109701994

In [None]:
"""
    mytime(x)

Call the zero-argument callable `x` once and return the elapsed wall-clock
time in seconds, measured with `time()`.

NOTE(review): the original cell was left unfinished (no `end`, `t0` unused);
treating `x` as a callable is an assumption about the intended behaviour, since
an already-evaluated value cannot be timed. Confirm with the author.
"""
function mytime(x)
    t0 = time()
    x()
    return time() - t0
end


In [186]:
@benchmark l2_squared_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     994.200 μs (0.00% GC)
  median time:      1.068 ms (0.00% GC)
  mean time:        1.126 ms (0.00% GC)
  maximum time:     4.023 ms (0.00% GC)
  --------------
  samples:          4293
  evals/sample:     1

#### SIMD

In [64]:
"""
    l2_squared_inbounds_simd(x::Array{Float64}, y::Array{Float64})

Mean of the squared element-wise differences between `x` and `y`, with the
loop annotated `@inbounds @simd`.

`@simd` permits the compiler to reorder the accumulation, so the last digits
may differ from a strictly sequential sum. Throws `DimensionMismatch` if `x`
and `y` do not have the same number of elements; that check is what makes
`@inbounds` safe.
"""
function l2_squared_inbounds_simd(x::Array{Float64},y::Array{Float64})
    n = length(x)
    n == length(y) ||
        throw(DimensionMismatch("x has $n elements but y has $(length(y))"))
    acc = 0.
    @inbounds @simd for i in 1:n
        acc += (x[i] - y[i])^2
    end

    return acc/n
end

l2_squared_inbounds_simd (generic function with 1 method)

In [66]:
@time l2_squared_inbounds_simd(x,y)

  0.001201 seconds (5 allocations: 176 bytes)


1.9988731097019496

In [180]:
@benchmark l2_squared_inbounds_simd(x,y)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     812.198 μs (0.00% GC)
  median time:      894.543 μs (0.00% GC)
  mean time:        962.800 μs (0.00% GC)
  maximum time:     4.048 ms (0.00% GC)
  --------------
  samples:          4989
  evals/sample:     1

### Float 32

In [23]:
len = 1000_000
srand(1234)
x32 = Array{Float32}(randn(len));
y32 = Array{Float32}(randn(len));

"""
    l2_squared_inbounds_simd(x::Array{Float32}, y::Array{Float32})

`Float32` method: mean of the squared element-wise differences between `x` and
`y`, with the loop annotated `@inbounds @simd`.

The accumulator is a `Float64`, so the result is returned as `Float64` even
though the inputs are `Float32`. Throws `DimensionMismatch` if `x` and `y` do
not have the same number of elements; that check is what makes `@inbounds`
safe.
"""
function l2_squared_inbounds_simd(x::Array{Float32},y::Array{Float32})
    n = length(x)
    n == length(y) ||
        throw(DimensionMismatch("x has $n elements but y has $(length(y))"))
    acc = 0.
    @inbounds @simd for i in 1:n
        acc += (x[i] - y[i])^2
    end

    return acc/n
end

l2_squared_inbounds_simd (generic function with 2 methods)

In [27]:
@time l2_squared_inbounds_simd(x32,y32)

  0.000712 seconds (5 allocations: 176 bytes)


1.9997882696165643

In [None]:
# `{}` is not valid Julia syntax; `Any[]` is the empty untyped container that
# `{}` used to denote in very old Julia versions.
# NOTE(review): if a key => value store was intended, use `Dict()` instead.
θ = Any[]

In [72]:
η = 3

3

In [73]:
ϵ = 23

23

In [76]:
ϵ + 23


46

In [83]:
Float64 <: Real

true

In [82]:
@code_llvm p(2) 


define i64 @julia_p_63580(i64) #0 !dbg !5 {
top:
  %1 = shl i64 %0, 1
  %2 = add i64 %1, 3
  ret i64 %2
}


In [80]:

# Affine map x ↦ 2x + 3 (used in this notebook as the target of `@code_llvm`).
function p(x)
    return 2 * x + 3
end

p (generic function with 1 method)

In [24]:
# Integer method of `test1`: returns twice its argument.
test1(x::Int) = x * 2

test1 (generic function with 1 method)

In [27]:
# String method of `test1`: returns the argument concatenated with itself.
test1(x::String) = string(x, x)

test1 (generic function with 2 methods)

In [28]:
test1(23)

46

In [26]:
test1("a")

"aa"