# High performant Julia code


#### Devectorize and NumericExtensions packages

- https://github.com/lindahua/NumericExtensions.jl
- https://github.com/lindahua/Devectorize.jl


#### Write non vectorized code

- http://www.juliabloggers.com/fast-numeric-computation-in-julia/



#### Using SIMD instructions in Julia 
- http://ucidatascienceinitiative.github.io/IntroToJulia/Slides/HPCJulia#/

- http://www.juliabloggers.com/optimizing-julia-for-performance-a-practical-example/

- https://github.com/eschnett/SIMD.jl

In [61]:
using BenchmarkTools

## Speed up array comparisons

Making a function is that compares elementwise is faster

In [29]:
function compare(x,A)
       C = Array{Bool}(length(A))
       @inbounds for i in 1:length(A)
         C[i] = ifelse(A[i]>x, true, false)
       end
    return C
end

compare (generic function with 1 method)

In [59]:
A = collect(1:1000) + rand(1000);
cte  = 560.5
@btime $A .> 560.5
@btime $A .> $cte
@btime map( (l)-> l > 560.5, $A)
@btime map( (l)-> l > cte, $A);
@btime compare(threshold, $A);

  2.247 μs (5 allocations: 4.45 KiB)
  2.224 μs (5 allocations: 4.45 KiB)
  895.351 ns (2 allocations: 1.08 KiB)
  24.038 μs (1003 allocations: 16.72 KiB)
  416.939 ns (1 allocation: 1.06 KiB)


In [62]:
res = compare(threshold,A);
res2 =  map( (l)-> l > threshold, A);
res2 == res

true

## Integral squred

In [78]:
struct Point{X,Y}
    x::X
    y::Y
end
function integral_squared(v::Vector{Point{X,Y}}) where {X,Y}
    T = eltype(oneunit(X)*oneunit(Y)*oneunit(Y))
    area = zero(T)
    p1 = v[1]
    for i in 2:length(v)
        p2 = v[i]
        dx = p2.x - p1.x
        m = (p2.y - p1.y)/dx
        b = p1.y
        area += m/3*m*(dx*dx*dx) + b*m*(dx*dx) + b*b*(dx)
        p1 = p2
    end
    return area
end
# Test code
N = 1_000_000
pwl = [Point(x, sin(x)) for x in linspace(0, pi, N)];

In [71]:
Base.Test.@test integral_squared(pwl) ≈ pi/2  atol=0.00001

[1m[32mTest Passed[39m[22m

In [84]:
println("Benchmarking:")
@btime integral_squared($pwl);

Benchmarking:
  7.906 ms (0 allocations: 0 bytes)


In [77]:
function integral_squared2(v::Vector{Point{X,Y}}) where {X,Y}
    T = eltype(oneunit(X)*oneunit(Y)*oneunit(Y))
    area = zero(T)
    p1 = v[1]
    @inbounds for i in 2:length(v)
        p2 = v[i]
        dx = p2.x - p1.x
        m = (p2.y - p1.y) / dx
        b = p1.y
        area += @evalpoly(dx * m, b * b, b, 1 / 3) * dx
        p1 = p2
    end
    return area
end

integral_squared2 (generic function with 1 method)

In [83]:
println("Benchmarking 2:")
@btime integral_squared2($pwl);

Benchmarking 2:
  3.991 ms (0 allocations: 0 bytes)


In [85]:
function integral_squared3(v::Vector{Point{X,Y}}) where {X,Y}
    T = eltype(oneunit(X)*oneunit(Y)*oneunit(Y))
    area = zero(T)
    p1 = v[1]
    third = 1/3
    for i in 2:length(v)
        p2 = v[i]
        dx = p2.x - p1.x
        m = (p2.y - p1.y)
        b = p1.y
        area += dx * (m * m * third + b * (m + b))
        p1 = p2
    end
    return area
end

integral_squared3 (generic function with 1 method)

In [89]:
println("Benchmarking 3:")
@btime integral_squared3($pwl);

Benchmarking 3:
  1.795 ms (0 allocations: 0 bytes)


In [169]:
const third = 1/3
function integral_squared4(v::Vector{Point{X,Y}}) where {X,Y}
    area = zero(oneunit(X)*oneunit(Y)*oneunit(Y))
    p1 = v[1]
    @inbounds for i in 2:length(v)
        p2 = v[i]
        dx = p2.x - p1.x
        m  = p2.y - p1.y
        b  = p1.y
        area += (third*m*m + (m+b)*b)*dx
        p1 = p2
    end
    return area
end

integral_squared4 (generic function with 1 method)

In [170]:
println("Benchmarking 4:")
@btime integral_squared4($pwl);

Benchmarking 4:
  1.567 ms (0 allocations: 0 bytes)


In [173]:
const third = 1/3
function integral_squared4_simd(v::Vector{Point{X,Y}}) where {X,Y}
    area = zero(oneunit(X)*oneunit(Y)*oneunit(Y))
    p1 = v[1]
    @inbounds @simd for i in 1:length(v)
        p2 = v[i]
        dx = p2.x - p1.x
        m  = p2.y - p1.y
        b  = p1.y
        area += (third*m*m + (m+b)*b)*dx
        p1 = p2
    end
    return area
end

integral_squared4_simd (generic function with 1 method)

In [175]:
println("Benchmarking 4:")
@btime integral_squared4_simd($pwl);

Benchmarking 4:
  1.631 ms (0 allocations: 0 bytes)


## opencl

### using simd

In [None]:
pwl = [Point(x, sin(x)) for x in linspace(0, pi, N)];

In [153]:
using SIMD
function integral_squared5(x::Vector{T}, y::Vector{T}, ::Type{Vec{N, T}}) where {N, T}
    @assert length(x) == length(y)
    #@assert length(x) % N == 1
    area = Vec{N, T}(0)
    third = Vec{N, T}(1 / 3)
    for i = 1:N:(length(x)-1)
        x1 = vload(Vec{N, T}, x, i)
        y1 = vload(Vec{N, T}, y, i)
        x2 = vload(Vec{N, T}, x, i+1)
        y2 = vload(Vec{N, T}, y, i+1)
        dx = x2 - x1
        dy = y2 - y1
        area += dx * (dy * dy * third + y1 * y2)
        x1 = x2
        y1 = y2
    end
    return sum(area)
end

integral_squared5 (generic function with 1 method)

In [136]:
N = 1_000_001
x = map(Float32, collect(linspace(0, pi, N)))
y = sin.(x)
Base.Test.@test integral_squared5(x, y, Vec{8, Float32}) ≈ pi/2  atol=0.00001

[1m[32mTest Passed[39m[22m

In [154]:
N = 1_000_000
x = map(Float32, collect(linspace(0, pi, N)))
y = sin.(x);

In [155]:
println("Benchmarking 5:")
@btime integral_squared5(x, y, Vec{8, Float32})

Benchmarking 5:
  460.559 μs (1 allocation: 16 bytes)


1.570792f0

In [159]:
using Base.Threads
function integral_squared6{T}(x::Vector{T},y::Vector{T})
    area = zero(T)
    third = T(1/3)
    nsub = 512
    nouter = ceil(Int,length(x)/nsub)
    nt = nthreads()
    accs = [zeros(T,nsub) for i in 1:nt]
    @threads for i in 0:nouter-1
        @inbounds begin
        id = threadid()
        acc = accs[id]
        ioff = nsub * i
        ninner = ifelse(i==nouter-1,length(x)-ioff-1,nsub)
        @simd for j=1:ninner
            x1 = x[ioff+j]
            y1 = y[ioff+j]
            x2 = x[ioff+j+1]
            y2 = y[ioff+j+1]
            dx = x2 - x1
            dy = y2 - y1
            acc[j] += dx*(dy*dy*third + y1*y2)
        end
        end
    end
    area = sum(map(sum,accs))
    return area
end

integral_squared6 (generic function with 1 method)

In [167]:
println("Benchmarking 6:")
@btime integral_squared6($x, $y)

Benchmarking 6:
  515.509 μs (5 allocations: 2.41 KiB)


1.5707963f0

In [165]:
function integral_squared7{T}(x::Vector{T},y::Vector{T})
    area = zero(T)
    third = T(1/3)
    nsub = 512
    acc = accs[id]
    nouter = ceil(Int,length(x)/nsub)
    @inbounds begin
        @simd for j=1:length(x)
            x1 = x[j]
            y1 = y[j]
            x2 = x[j+1]
            y2 = y[j+1]
            dx = x2 - x1
            dy = y2 - y1
            acc[j] += dx*(dy*dy*third + y1*y2)
        end
    end
    return area
end

integral_squared7 (generic function with 1 method)

In [166]:
println("Benchmarking 7:")
@btime integral_squared7($x, $y)

Benchmarking 7:


LoadError: [91mUndefVarError: acc not defined[39m

## SIMD instructions

In [124]:
function inner( x, y )
    s = zero(eltype(x)) 
    for i=1:length(x)
    @inbounds s += x[i]*y[i] 
    end
    return s
end

function innersimd( x, y )
    s = zero(eltype(x))
    @simd for i=1:length(x)
        @inbounds s += x[i]*y[i] 
    end
    return s
end

function timeit( n, reps )
    x = rand(Float32,n)
    y = rand(Float32,n)
    s = zero(Float64)
    
    time = @elapsed for j in 1:reps 
        s+=inner(x,y)
    end

    println("GFlop/sec = ",2.0*n*reps/time*1E-9) 
    time = @elapsed for j in 1:reps
    s+=innersimd(x,y) 
    end

    println("GFlop/sec (SIMD) = ",2.0*n*reps/time*1E-9)
end

timeit (generic function with 1 method)

In [125]:
timeit(1000, 1000)

GFlop/sec = 2.2413483951945494
GFlop/sec (SIMD) = 11.847639357857947


In [126]:
using BenchmarkTools

n = 100
x = rand(Float32,n)
y = rand(Float32,n)
s = zero(Float32)


0.0f0

In [127]:
@btime innersimd(x,y)

  49.504 ns (1 allocation: 16 bytes)


24.495232f0

In [128]:
@btime inner(x,y)

  111.777 ns (1 allocation: 16 bytes)


24.495235f0

In [129]:
# If vectors are too small there is no difference
n = 10 
x = rand(Float32,n)
y = rand(Float32,n)
s = zero(Float32)

0.0f0

In [130]:
@btime innersimd(x,y)

  36.435 ns (1 allocation: 16 bytes)


1.2898246f0

In [131]:
@btime inner(x,y)

  36.747 ns (1 allocation: 16 bytes)


1.2898245f0

## simd summation

In [118]:

function summation(x)
    s = zero(x[1])
    for i=1:length(x)
        @inbounds s +=x[i]
    end
    return s
end

function summation_simd(x)
    s = zero(x[1])
    @simd for i=1:length(x)
        @inbounds s +=x[i]
    end
    return s
end

function summation_simd_elem(x)
    s = zero(x[1])
    @simd for x_k in x
        @inbounds s +=x_k
    end
    return s
end

summation_simd_elem (generic function with 1 method)

In [119]:
x = rand(Float32,1000);
xbig = rand(Float32,10000);

In [120]:
using BenchmarkTools

In [121]:
@btime summation($x)

  826.835 ns (0 allocations: 0 bytes)


498.49457f0

In [122]:
@btime summation_simd($x)

  104.394 ns (0 allocations: 0 bytes)


498.49463f0

In [123]:
@btime summation_simd_elem($x)

  102.560 ns (0 allocations: 0 bytes)


498.49463f0

#### Check if the code vectorizes

In [16]:
@code_llvm summation(x)


define float @julia_summation_61349(i8** dereferenceable(40)) #0 !dbg !5 {
top:
  %1 = getelementptr inbounds i8*, i8** %0, i64 1
  %2 = bitcast i8** %1 to i64*
  %3 = load i64, i64* %2, align 8
  %4 = icmp slt i64 %3, 1
  br i1 %4, label %L18, label %if.lr.ph

if.lr.ph:                                         ; preds = %top
  %5 = bitcast i8** %0 to float**
  %6 = load float*, float** %5, align 8
  br label %if

if:                                               ; preds = %if.lr.ph, %if
  %s.03 = phi float [ 0.000000e+00, %if.lr.ph ], [ %11, %if ]
  %"#temp#.02" = phi i64 [ 1, %if.lr.ph ], [ %7, %if ]
  %7 = add i64 %"#temp#.02", 1
  %8 = add i64 %"#temp#.02", -1
  %9 = getelementptr float, float* %6, i64 %8
  %10 = load float, float* %9, align 4
  %11 = fadd float %s.03, %10
  %12 = icmp eq i64 %"#temp#.02", %3
  br i1 %12, label %L18.loopexit, label %if

L18.loopexit:                                     ; preds = %if
  br label %L18

L18:                                             

We want to look for in the llvm code the prefic `vector` which does not appear in the code above. Notice that the code below contains `vector.body` suggesting that the compiler used `vector` instructions.


In [17]:
@code_native summation(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[10]
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 4
	movq	8(%rdi), %rax
	vxorps	%xmm0, %xmm0, %xmm0
	testq	%rax, %rax
	jle	L45
Source line: 5
	movq	(%rdi), %rcx
	vxorps	%xmm0, %xmm0, %xmm0
	nopl	(%rax,%rax)
L32:
	vaddss	(%rcx), %xmm0, %xmm0
Source line: 4
	addq	$4, %rcx
	decq	%rax
	jne	L32
Source line: 7
L45:
	popq	%rbp
	retq
	nop


In [18]:
@code_native summation_simd_elem(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[10]
Source line: 67
	movq	8(%rdi), %rax
Source line: 64
	movq	24(%rdi), %rdx
	xorl	%ecx, %ecx
Source line: 79
	testq	%rdx, %rdx
	cmovnsq	%rdx, %rcx
Source line: 68
	testq	%rax, %rax
	jle	L321
Source line: 79
	leaq	-1(%rcx), %rsi
	cmpq	%rdx, %rsi
	jae	L329
Source line: 50
	movq	(%rdi), %r9
Source line: 66
	leaq	96(%r9), %r8
	movq	%rax, %r10
	andq	$-32, %r10
	vxorps	%xmm0, %xmm0, %xmm0
	xorl	%edi, %edi
Source line: 50
	vxorps	%ymm1, %ymm1, %ymm1
Source line: 66
	jmp	L304
	nopw	%cs:(%rax,%rax)
Source line: 71
L80:
	testq	%rax, %rax
	jle	L304
Source line: 50
	cmpq	$32, %rax
	jae	L102
	xorl	%edx, %edx
	jmp	L263
L102:
	movq	%rax, %rdx
	andq	$-32, %rdx
	je	L261
	vblendps	$1, %ymm0, %ymm1, %ymm0 ## ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
	vxorps	%ymm2, %ymm2, %ymm2
Source line: 74
	movq	%r10, %rsi
	movq	%r8, %rcx
	vxorps	%ymm3, %ymm3, %ymm3
	vxorps	%ymm4, %ymm4, %ymm4
	nopl	(%rax,%rax)
Source line: 50
L144:
	vmovups	-96(%rcx), %xmm5
	vm

In [19]:
@code_llvm summation_simd_elem(x)


define float @julia_summation_simd_elem_61391(i8** dereferenceable(40)) #0 !dbg !5 {
top:
  %1 = getelementptr inbounds i8*, i8** %0, i64 1
  %2 = bitcast i8** %1 to i64*
  %3 = load i64, i64* %2, align 8
  %4 = icmp slt i64 %3, 1
  %5 = getelementptr i8*, i8** %0, i64 3
  %6 = bitcast i8** %5 to i64*
  %7 = load i64, i64* %6, align 8
  %8 = icmp slt i64 %7, 0
  %9 = select i1 %8, i64 0, i64 %7
  br i1 %4, label %top.split.us, label %top.top.split_crit_edge

top.top.split_crit_edge:                          ; preds = %top
  %10 = add i64 %9, -1
  %11 = icmp ult i64 %10, %7
  br i1 %11, label %top.split.split.us, label %oob

top.split.us:                                     ; preds = %top
  br i1 undef, label %L60, label %L60

top.split.split.us:                               ; preds = %top.top.split_crit_edge
  %12 = bitcast i8** %0 to float**
  %13 = load float*, float** %12, align 8
  br label %L7.outer.L7.outer.split_crit_edge.us45.outer

L19.preheader.us23:                        

# Loss between two minibatches

If we consider a minibatch to be an array whose columns are examples we can compute the loss over a whole minibatch of predictions and targets in a single for loop using SIMD instructions.

In [20]:
srand(1)
y_pred = rand(10,200);
y = rand(10,200);

In [21]:
# squared error
function se(x,y)
    dist = zero(eltype(x))
    for i in 1:length(x)
        @inbounds dist += (x[i] - y[i])^2
    end
    return dist
end

se (generic function with 1 method)

In [22]:
println(se(y_pred[:,1], y[:,1]) + se(y_pred[:,2], y[:,2]))
println(se(y_pred[1:20], y[1:20]))

3.843436662540462
3.843436662540462


In this example the loss can be computed equally 

- by summing over the losses for every example
- by summing over the loss of a single "superexample", the whole concatenated mimnibatch


In [23]:
@time se(y_pred,y)

  0.007600 seconds (2.00 k allocations: 114.390 KiB)


327.5997310650614

In [24]:
# If we do it iterating per exmaple (without creaing new arrays)
d = 0.
@time  for m in 1:200 d+=se(view(y_pred,:,m), view(y,:,m)) end
d

  0.096710 seconds (104.05 k allocations: 5.602 MiB)


327.5997310650613

In [25]:
# If we do it iterating per exmaple  (creating new arrays)
d = 0.
@time  for m in 1:200 d+=se(y_pred[:,m],y[:,m]) end
d

  0.000068 seconds (1.20 k allocations: 75.000 KiB)


327.5997310650613

##### Benchmark

In [26]:
srand(1)
y_pred = rand(Float32, 10,200);
y = rand(Float32, 10,200);

In [27]:
# squared error
function se(x,y)
    dist = zero(eltype(x))
    for i in 1:length(x)
        @inbounds dist += (x[i] - y[i])^2
    end
    return dist
end

se (generic function with 1 method)

In [28]:
# squared error
function se_simd(X, Y)
    dist = zero(eltype(X))
    @simd for i in 1:length(X)
        @inbounds dist += (X[i] - Y[i])^2
    end
    return dist
end

se_simd (generic function with 1 method)

In [29]:
println(se(y_pred,y))
println(se_simd(y_pred,y))

339.9466
339.94638


In [32]:
@benchmark se_simd(y_pred, y)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     247.741 ns (0.00% GC)
  median time:      266.622 ns (0.00% GC)
  mean time:        271.485 ns (0.25% GC)
  maximum time:     3.867 μs (88.79% GC)
  --------------
  samples:          10000
  evals/sample:     328

In [31]:
@benchmark  se(y_pred, y)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     1.569 μs (0.00% GC)
  median time:      1.571 μs (0.00% GC)
  mean time:        1.624 μs (0.00% GC)
  maximum time:     8.664 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

In [33]:
@benchmark begin 
               d = 0.; 
               for m in 1:200 
                   d+=se(view(y_pred,:,m),view(y,:,m))
               end
           end

BenchmarkTools.Trial: 
  memory estimate:  212.50 KiB
  allocs estimate:  10800
  --------------
  minimum time:     2.408 ms (0.00% GC)
  median time:      2.486 ms (0.00% GC)
  mean time:        2.545 ms (0.64% GC)
  maximum time:     4.676 ms (38.53% GC)
  --------------
  samples:          1963
  evals/sample:     1

#### Another example not as clear: involving a loss function that divides by a length

In [74]:
function mse(x,y)
    dist = zero(eltype(x))
    for i in 1:length(x)
        dist += (x[i] - y[i])^2
    end
    return dist/length(x)
end

mse (generic function with 1 method)

In [75]:
println(mse(y_pred[:,1], y[:,1]) + mse(y_pred[:,2], y[:,2]))
println(mse(y_pred[1:20], y[1:20]))

0.3395241
0.16976205


In this example it is not equivalent, yet we can easily find an alternative

In [76]:
println(se(y_pred[:,1], y[:,1])/10. + se(y_pred[:,2], y[:,2])/10.)
println(se(y_pred[1:20], y[1:20])/(10.))

0.33952410221099855
0.33952410221099855


Notice that the alternative version of the mse is much faster

In [81]:
se(y_pred,y)/10.

33.994659423828125

In [84]:
d = 0.
for m in 1:200 d+=mse(view(y_pred,:,m),view(y,:,m)) end
d

33.9946366250515

In [118]:
function mse_normalized_nfeatures_simd{T}(x::Array{T},y::Array{T},n_features)
    dist = zero(eltype(x))
    @simd for i in 1:length(x)
        @inbounds dist += (x[i] - y[i])^2
    end
    return dist/T(n_features)
end

mse_normalized_nfeatures_simd (generic function with 3 methods)

In [119]:
@benchmark mse_normalized_nfeatures_simd(y_pred, y, 10)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     341.599 ns (0.00% GC)
  median time:      343.507 ns (0.00% GC)
  mean time:        376.657 ns (0.00% GC)
  maximum time:     6.274 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     217

# Part 1

## Example montecarlo pi estimate

Let us play with an example from 



In [None]:
n_cores = 4

In [None]:
workers()

In [None]:
addprocs(4) 

In [None]:
workers()

In [None]:
@everywhere function compute_pi(N::Int)
    """
    Compute pi with a Monte Carlo simulation of N darts thrown in [-1,1]^2
    Returns estimate of pi
    """
    # counts number of points that have radial coordinate < 1, i.e. in circle
    n_landed_in_circle = 0  
    for i = 1:N
        x = rand() * 2 - 1  # uniformly distributed number on x-axis
        y = rand() * 2 - 1  # uniformly distributed number on y-axis

        r2 = x*x + y*y  # radius squared, in radial coordinates
        if r2 < 1.0
            n_landed_in_circle += 1
        end
    end

    return n_landed_in_circle / N * 4.0    
end

In [None]:
compute_pi(10)

@time compute_pi(1000_000_000)

#### Let us go parallel

In [None]:
N = Int(1_000_000_000)

In [None]:
result = pmap(compute_pi, [Int(N/n_cores) for core in 1:n_cores])

In [None]:
@time mean(pmap(compute_pi,[Int(N/n_cores) for core in 1:n_cores]))

In [None]:
function par_pi_computation(N::Int64; ncores::Int64=4)
    """
    Compute pi in parallel, over ncores cores, with a Monte Carlo simulation throwing N total darts
    """

    # compute sum of pi's estimated among all cores in parallel
    sum_of_pis = @parallel (+) for i=1:ncores
        compute_pi(Int(N / ncores))
    end

    return sum_of_pis / ncores  # average value
end

In [None]:
@time par_pi_computation(1000_000_000)

# Part 2 L2 norm
### Let us test the numpy-matlab way

In [1]:
#srand(1234)
len = 100000;

x = randn(len);
y = randn(len);

In [5]:
@time begin sum((x - y).^2)./length(x) end

  0.001707 seconds (12 allocations: 1.526 MB)


2.0019735660543594

In [None]:
print(@benchmark sum((x - y).^2)/length(x))

#### For loop 

In [None]:
function l2_squared(x::Array{Float64},y::Array{Float64})
    norm = 0.
    for i in 1:length(x)
        norm = norm + (x[i] - y[i])^2
    end
    return norm/length(x)
end

In [None]:
@time l2_squared(x,y)

In [None]:
print(@benchmark l2_squared(x,y))

#### Only inbounds does not make any improvements

In [6]:
function l2_squared_inbounds(x::Array{Float64},y::Array{Float64})
    norm = 0.
    @inbounds begin
    for i in 1:length(x)
         norm += (x[i] - y[i])^2
        end
    end
    return norm/length(x)
end

l2_squared_inbounds (generic function with 1 method)

In [7]:
@time l2_squared_inbounds(x,y)

  0.008037 seconds (2.32 k allocations: 105.927 KB)


2.0019735660543767

In [None]:
print(@benchmark l2_squared_inbounds(x,y))

#### improve speed l2_squared with simd

We will use now the @simd macro in a for loop. Notice that this does not make every loop faster. In particular, note that using SIMD implies that the order of operations within and across the loop might change. This macro tells the compiler that reordering will be safe before it attempts to parallelize a loop. Therefore, before adding @simd annotation to your code, you need to ensure that the loop has the following properties:

- All iterations of the loop are independent of each other.  No iteration of the loop uses a value from a previous iteration or waits for its completion.
   
   
- The arrays being operated upon within the loop do not overlap in memory.


-  The loop body is straight-line code without branches or function calls.


-   The number of iterations of the loop is obvious. In practical terms, this means that the loop should typically be expressed on the length of the arrays within it.


- The subscript (or index variable) within the loop changes by one for each iteration. In other words, the subscript is unit stride.


- Bounds checking is disabled for SIMD loops. (Bound checking can cause branches due to exceptional conditions.)


In [None]:
typeof(x)

In [None]:
function l2_squared_inbounds_simd(x::Array{Float64},y::Array{Float64})
    norm = 0.
    n = length(x)
    @inbounds @simd for i in 1:n
             norm += (x[i] - y[i])^2
        end

    return norm/length(x)
end

In [None]:
@time l2_squared_inbounds_simd(x,y)

In [None]:
print(@benchmark l2_squared_inbounds_simd(x,y))

#### SIMD instructions might benefit of lower precision floats

In [None]:
len = 100000
srand(1234)
x32 = Array{Float32}(randn(len));
y32 = Array{Float32}(randn(len));

function l2_squared_inbounds_simd(x::Array{Float32},y::Array{Float32})
    norm = 0.
    n = length(x)
    @inbounds @simd for i in 1:n
             norm += (x[i] - y[i])^2
        end

    return norm/length(x)
end

In [None]:
@time l2_squared_inbounds_simd(x32,y32)

In [None]:
using BenchmarkTools

In [None]:
print(@benchmark l2_squared_inbounds_simd(x32,y32))

#### Go to float 16 -> No improvement !

In [None]:
srand(1234)
len = 100000

x16 = Array{Float16}(randn(len));
y16 = Array{Float16}(randn(len));

function l2_squared_inbounds_simd(x::Array{Float16},y::Array{Float16})
    norm = 0.
    l = Float16(length(x))
    @inbounds @simd for i in 1:length(x)
             norm += (x[i] - y[i])^2
        end

    return norm/l
end

In [None]:
@time l2_squared_inbounds_simd(x16,y16)

In [None]:
print(@benchmark l2_squared_inbounds_simd(x16,y16))

# Using Yeppp for math operations 

I found this particulary uggly (having a Yeppp before evey operation is not pretty).


It would be nice to know how to create an alias and use all implementations from Yeppp without
writting Yeppp every time.

- http://www.yeppp.info/#arguments

In [34]:
using Yeppp 

In [38]:
@time Yeppp.sin(x);

LoadError: [91mMethodError: no method matching sin!(::Array{Float32,1}, ::Array{Float32,1})[0m
Closest candidates are:
  sin!(::Any) at /Users/macpro/.julia/v0.6/Yeppp/src/Yeppp.jl:212[39m

In [None]:
@time [sin(xi) for xi in x];

In [39]:
@time Yeppp.exp(x)/Yeppp.sum(x);

LoadError: [91mMethodError: no method matching exp!(::Array{Float32,1}, ::Array{Float32,1})[0m
Closest candidates are:
  exp!(::Any) at /Users/macpro/.julia/v0.6/Yeppp/src/Yeppp.jl:198[39m

In [None]:
@time exp(x)/sum(x);

# Parallel Accelerator

- https://github.com/IntelLabs/ParallelAccelerator.jl