# High perfomance code for everybody 
#### (Tell those C, Fortran bullies to go away)
#### (Even better, show them you code runs as fast as their code )
#### (Disclaimer -> Usual response: "this benchmark is not representative" )

In [3]:
using BenchmarkTools

## 1) Summing a series 

In [41]:
function one_over_n(n)
    aux = 0.
    for i in 1:n
        aux += 1/i
    end
    return aux
end

one_over_n (generic function with 1 method)

In [49]:
@time aux = one_over_n(1000_000)

  0.004884 seconds (5 allocations: 176 bytes)


14.392726722864989

In [142]:
@time aux = one_over_n(10^6)

  0.005252 seconds (6 allocations: 192 bytes)


14.392726722864989

In [154]:
function one_over_n2(n)
    aux = 0.
    for i in 1:n
        @fastmath aux += 1./i
    end
    return aux
end

one_over_n2 (generic function with 1 method)

In [158]:
@time aux = one_over_n2(10^6)

  0.004459 seconds (6 allocations: 192 bytes)


14.392726722864989

## 2) Controlling outliers (or clipping gradients)

In [1198]:
function clip!( x, a=0, b= 1)
    for i in 1:length(x)
        if x[i] < a
            x[i] = a
        elseif x[i] > b
            x[i] = b
        end
    end
end

clip! (generic function with 3 methods)

In [1199]:
x = randn(10^7);

In [1202]:
@time clip!(x,0,1)

  0.012998 seconds (4 allocations: 160 bytes)


In [1203]:
x .>1.

10000000-element BitArray{1}:
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
     ⋮
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false

In [904]:
@time  begin
    x[x.>1.] = 1.
    x[x.<0.] = 0.
end

  0.111007 seconds (32.15 k allocations: 3.961 MB)


0.0

#### Do you know (because you are a good programmer) that you will acces only "acceptable" positions  in the array?
#### Tell the compiler! -> @inbounds

In [1204]:
function clip2!( x, a=0., b= 1.)
    @inbounds for i in 1:length(x)
        if x[i] < a
            x[i] = a
        elseif x[i] > b
            x[i] = b
        end
    end
end

clip2! (generic function with 3 methods)

In [1206]:
x = randn(10^7);
@time clip2!(x)

  0.012076 seconds (4 allocations: 160 bytes)


#### maybe in float32 this is faster

In [1207]:
function clip_float32!( x::Array{Float32}, a=Float32(0.), b=Float32( 1.))
    
   @inbounds for i in 1:length(x)
        if x[i] < a
            x[i] = a
        elseif x[i] > b
            x[i] = b
        end
    end
end

clip_float32! (generic function with 3 methods)

In [1208]:
x = randn(10^7);
x = Array{Float32}(randn(10^7));

In [1210]:
@time clip_float32_nosimd!(x)

  0.004193 seconds (4 allocations: 160 bytes)


## 3) Computing a quantity depending on a "custom criteria"

In [1211]:
function compute_taxes(salary)
    if salary < 18000
        return salary * 0.2
    elseif  18000 < salary < 25000
        return salary * 0.3
    elseif   25000 < salary < 70000
        return salary * 0.4
    else
        return salary * 0.5
    end
end        

compute_taxes (generic function with 3 methods)

In [1212]:
salaries = Array{Int64}(round.(rand(10^6,1)* 100000));

In [1215]:
@time taxes = [compute_taxes.(x) for x in salaries ];

  0.032079 seconds (14.46 k allocations: 8.343 MB)


In [1222]:
f(x::Float64) = 2x

f (generic function with 1 method)

In [1223]:
g(5.)

10.0

In [1227]:
f.([2.,3.,4.])

3-element Array{Float64,1}:
 4.0
 6.0
 8.0

In [1228]:
map(f,[2.,3.,4.])

3-element Array{Float64,1}:
 4.0
 6.0
 8.0

In [267]:
# Let us consider the worst case scenario: 
salaries = Array{Int64}(round.( [1_000_000 for x in 1:10^6]));

In [1231]:
@time taxes = [compute_taxes.(x) for x in salaries ];

  0.034641 seconds (14.45 k allocations: 8.303 MB)


### 3.1) Allocating memory of taxes before

Make your code cache friendly ! (what do you have 20 Mb of cache in your chip?)


In [1232]:
taxes =  Array{Float64}(zeros(10^6));

In [1233]:
function compute_taxes!( taxes, salaries)
    
    for (i, salary) in enumerate(salaries)
        if salary < 18000
            taxes[i] = salary * 0.2
        elseif  18000 < salary < 25000
            taxes[i] = salary * 0.3
        elseif   25000 < salary < 70000
            taxes[i] = salary * 0.4
        else
            taxes[i] = salary * 0.5
        end
    end
end      

compute_taxes! (generic function with 1 method)

In [1235]:
@time compute_taxes!(taxes, salaries)

  0.008861 seconds (4 allocations: 160 bytes)


In [259]:
@time Array{Float64}(zeros(10^6));

  0.002241 seconds (13 allocations: 7.630 MB)


### 3.2) paralelizing computations using pmap

In [1236]:
nprocs()

4

In [979]:
addprocs(3)

3-element Array{Int64,1}:
 2
 3
 4

In [1099]:
nprocs()

4

In [1100]:
salaries = Array{Int64}(round.(rand(10^8,1)* 100000));

In [1101]:
@everywhere function compute_taxes(salaries::Array)
    taxes = zeros(length(salaries))
    
    for (i, salary) in enumerate(salaries)
        if salary < 18000
            taxes[i] = salary * 0.2
        elseif  18000 < salary < 25000
            taxes[i] = salary * 0.3
        elseif   25000 < salary < 70000
            taxes[i] = salary * 0.4
        else
            taxes[i] = salary * 0.5
        end
    end
    return taxes
end     

In [1105]:
n = length(salaries)
n_processors = nprocs()

4

In [1104]:
@time taxes = [compute_taxes.(x) for x in salaries ];

  3.628292 seconds (14.59 k allocations: 763.622 MB, 0.49% gc time)


In [1103]:
@time begin
splits_ind = [Int(x) for x in 1:(n/n_processors):(n+1)]
salaries_array_splits = [salaries[x:y-1] for (x,y) in zip(splits_ind[1:end-1], splits_ind[2:end])]
res = pmap(compute_taxes, salaries_array_splits);
end

 10.140195 seconds (62.58 k allocations: 1.493 GB, 6.01% gc time)


4-element Array{Array{Float64,1},1}:
 [11441.2,1659.2,49470.5,26370.4,45633.0,18969.2,14466.0,14968.4,15699.2,44601.5  …  2143.4,40227.5,13803.2,3128.8,20832.4,24445.6,41667.5,2432.6,25048.8,37231.5]
 [16823.6,27015.2,245.0,42648.0,693.2,16959.2,24253.2,14373.6,2363.0,37932.5  …  22446.8,37655.5,1861.2,37260.5,16394.8,42229.5,12740.0,17375.6,6868.8,14959.2]   
 [19766.4,7179.0,40640.5,27244.8,41927.0,2963.8,25670.0,2650.8,1667.4,44551.5  …  890.8,45291.5,40100.0,24187.2,13953.2,16663.6,2681.0,19727.2,38687.5,48795.5]   
 [13319.6,23017.6,42234.0,14661.2,6112.8,5745.6,21870.4,17387.6,42004.0,7359.9  …  16957.6,24809.2,1408.2,27023.2,10074.8,25002.8,1841.2,6972.6,39987.5,27.6]     

## 4) L2 norm

In [28]:
len = 1000_000;

x = randn(len);
y = randn(len);

In [30]:
@time sum((x - y).^2)/length(x)

  0.047551 seconds (6.28 k allocations: 15.554 MiB, 16.05% gc time)


2.002433468450592

In [29]:
@benchmark sum((x - y).^2)/length(x)

BenchmarkTools.Trial: 
  memory estimate:  15.26 MiB
  allocs estimate:  31
  --------------
  minimum time:     5.391 ms (0.00% GC)
  median time:      8.729 ms (24.36% GC)
  mean time:        8.748 ms (23.47% GC)
  maximum time:     17.909 ms (26.64% GC)
  --------------
  samples:          564
  evals/sample:     1

#### For loop

In [5]:
function l2_squared(x::Array{Float64},y::Array{Float64})
    norm = 0.
    for i in 1:length(x)
        norm = norm + (x[i] - y[i])^2
    end
    return norm/length(x)
end

l2_squared (generic function with 1 method)

In [161]:
@time l2_squared(x,y)

  0.001501 seconds (5 allocations: 176 bytes)


2.00218357699933

#### Inbounds

In [183]:
function l2_squared_inbounds(x::Array{Float64},y::Array{Float64})
    norm = 0.
    @inbouds for i in 1:length(x)
               norm += (x[i] - y[i])^2
             end
    return norm/length(x)
end

LoadError: [91mUndefVarError: @inbouds not defined[39m

In [187]:
@time l2_squared_inbounds(x,y)

  0.001101 seconds (5 allocations: 176 bytes)


2.00218357699933

In [186]:
@benchmark l2_squared_inbounds(x,y)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     994.200 μs (0.00% GC)
  median time:      1.068 ms (0.00% GC)
  mean time:        1.126 ms (0.00% GC)
  maximum time:     4.023 ms (0.00% GC)
  --------------
  samples:          4293
  evals/sample:     1

#### SIMD

In [174]:
function l2_squared_inbounds_simd(x::Array{Float64},y::Array{Float64})
    norm = 0.
    n = length(x)
    @simd for i in 1:n
        @inbounds norm += (x[i] - y[i])^2
        end

    return norm/length(x)
end

l2_squared_inbounds_simd (generic function with 2 methods)

In [178]:
@time l2_squared_inbounds_simd(x,y)

  0.001026 seconds (5 allocations: 176 bytes)


2.0021835769993483

In [180]:
@benchmark l2_squared_inbounds_simd(x,y)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     812.198 μs (0.00% GC)
  median time:      894.543 μs (0.00% GC)
  mean time:        962.800 μs (0.00% GC)
  maximum time:     4.048 ms (0.00% GC)
  --------------
  samples:          4989
  evals/sample:     1

### Float 32

In [23]:
len = 1000_000
srand(1234)
x32 = Array{Float32}(randn(len));
y32 = Array{Float32}(randn(len));

function l2_squared_inbounds_simd(x::Array{Float32},y::Array{Float32})
    norm = 0.
    n = length(x)
    @inbounds @simd for i in 1:n
             norm += (x[i] - y[i])^2
        end

    return norm/length(x)
end

l2_squared_inbounds_simd (generic function with 2 methods)

In [27]:
@time l2_squared_inbounds_simd(x32,y32)

  0.000712 seconds (5 allocations: 176 bytes)


1.9997882696165643