In [1]:
# Julia package for benchmarking
using BenchmarkTools

# 0) Why Julia ?

Julia is a compiled language which allows users to write code that  runs really fast (as fast as C or Fortran). 

https://github.com/JuliaLang/julialang.github.com/blob/master/blog/_posts/moredots/More-Dots.ipynb

In [2]:
a = rand(10^6);

# C code

In [3]:
C_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    size_t i;
    for (i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

const Clib = tempname()   # make a temporary file

# compile to a shared library by piping C_code to gcc  (need to have gcc installed)
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code) 
end

# define a Julia function that calls the C function:
c_sum(X::Array{Float64}) = ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), length(X), X)

c_sum (generic function with 1 method)

In [4]:
c_sum(a)

500652.0359250248

In [5]:
# type \approx and then <TAB> to get the ≈ symbol
c_sum(a) ≈ sum(a) 

true

In [6]:
c_bench = @benchmark c_sum(a)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     921.011 μs (0.00% GC)
  median time:      1.111 ms (0.00% GC)
  mean time:        1.134 ms (0.00% GC)
  maximum time:     3.426 ms (0.00% GC)
  --------------
  samples:          4361
  evals/sample:     1

In [7]:
println("C: Fastest time was $(minimum(c_bench.times)/1e6) msecs.")

C: Fastest time was 0.921011 msecs.


# Handwritten sum in julia

In [8]:
function mysum(A)   
    s = 0.0  # s = zero(eltype(A))
    for a in A
        s += a
    end
    s
end

mysum (generic function with 1 method)

In [9]:
j_bench_hand = @benchmark mysum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     925.279 μs (0.00% GC)
  median time:      1.191 ms (0.00% GC)
  mean time:        1.327 ms (0.00% GC)
  maximum time:     30.891 ms (0.00% GC)
  --------------
  samples:          3717
  evals/sample:     1

In [17]:
# Looking at the compiled code
@code_native mysum(a)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[15]
Source line: 3
	movq	8(%rdi), %rax
	xorpd	%xmm0, %xmm0
	testq	%rax, %rax
	je	L53
	movq	24(%rdi), %rdx
	xorpd	%xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rdx, %rcx
	jae	L54
	movq	(%rdi), %rsi
Source line: 4
	addsd	(%rsi,%rcx,8), %xmm0
Source line: 3
	incq	%rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 6
L53:
	retq
L54:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 3
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	incq	%rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nopl	(%rax)


In [18]:
# Looking at the llvm code
@code_llvm mysum(a)


define double @julia_mysum_66080(i8**) #0 !dbg !5 {
top:
  %1 = getelementptr inbounds i8*, i8** %0, i64 1
  %2 = bitcast i8** %1 to i64*
  %3 = load i64, i64* %2, align 8
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %L14, label %if.lr.ph

if.lr.ph:                                         ; preds = %top
  %5 = getelementptr i8*, i8** %0, i64 3
  %6 = bitcast i8** %5 to i64*
  %7 = load i64, i64* %6, align 8
  %8 = bitcast i8** %0 to double**
  br label %if

if:                                               ; preds = %if.lr.ph, %idxend
  %s.06 = phi double [ 0.000000e+00, %if.lr.ph ], [ %16, %idxend ]
  %"#temp#.05" = phi i64 [ 1, %if.lr.ph ], [ %15, %idxend ]
  %9 = add i64 %"#temp#.05", -1
  %10 = icmp ult i64 %9, %7
  br i1 %10, label %idxend, label %oob

L14.loopexit:                                     ; preds = %idxend
  br label %L14

L14:                                              ; preds = %L14.loopexit, %top
  %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %16, %L14.loopex

## Making our fuction faster than C :D

In [25]:
function mysum(A)   
    s = 0.0  # s = zero(eltype(A))
     for a in A
        s += a
    end
    s
end

mysum (generic function with 1 method)

In [30]:
@benchmark mysum(a)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     924.751 μs (0.00% GC)
  median time:      1.143 ms (0.00% GC)
  mean time:        1.178 ms (0.00% GC)
  maximum time:     4.343 ms (0.00% GC)
  --------------
  samples:          4175
  evals/sample:     1

In [35]:
function mysum2(A)   
    s = 0.0  # s = zero(eltype(A))
    @simd for a in A
        s += a
    end
    s
end

mysum2 (generic function with 1 method)

In [37]:
s = 0.0
@macroexpand @simd for a_ in A
        s += a_
    end

quote  # simdloop.jl, line 65:
    let ##r#694 = A # simdloop.jl, line 66:
        for ##i#695 = Base.simd_outer_range(##r#694) # simdloop.jl, line 67:
            let ##n#696 = Base.simd_inner_length(##r#694, ##i#695) # simdloop.jl, line 68:
                if zero(##n#696) < ##n#696 # simdloop.jl, line 70:
                    let ##i#697 = zero(##n#696) # simdloop.jl, line 71:
                        while ##i#697 < ##n#696 # simdloop.jl, line 72:
                            local a_ = Base.simd_index(##r#694, ##i#695, ##i#697) # simdloop.jl, line 73:
                            begin  # In[37], line 3:
                                s += a_
                            end # simdloop.jl, line 74:
                            ##i#697 += 1 # simdloop.jl, line 75:
                            $(Expr(:simdloop))
                        end
                    end # simdloop.jl, line 79:
                    a_ = last(##r#694)
                end
            end
        end
    end # simdlo

In [29]:
@benchmark mysum2(a)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     408.952 μs (0.00% GC)
  median time:      608.913 μs (0.00% GC)
  mean time:        670.530 μs (0.00% GC)
  maximum time:     6.134 ms (0.00% GC)
  --------------
  samples:          7230
  evals/sample:     1

In [17]:
@code_native mysum2(a)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[12]
Source line: 67
	movq	8(%rdi), %rax
Source line: 64
	movq	24(%rdi), %rdx
	xorl	%ecx, %ecx
Source line: 79
	testq	%rdx, %rdx
	cmovnsq	%rdx, %rcx
Source line: 68
	testq	%rax, %rax
	jle	L206
Source line: 79
	leaq	-1(%rcx), %rsi
	cmpq	%rdx, %rsi
	jae	L210
Source line: 50
	movq	(%rdi), %r9
Source line: 66
	leaq	16(%r9), %r8
	movq	%rax, %r10
	andq	$-4, %r10
	pxor	%xmm0, %xmm0
	xorl	%edi, %edi
	jmp	L192
Source line: 71
L64:
	testq	%rax, %rax
	jle	L192
Source line: 50
	cmpq	$4, %rax
	jae	L79
	xorl	%edx, %edx
	jmp	L156
L79:
	movq	%rax, %rdx
	andq	$-4, %rdx
	je	L154
	movq	%xmm0, %xmm1            ## xmm1 = xmm0[0],zero
	pxor	%xmm0, %xmm0
Source line: 74
	movq	%r10, %rsi
	movq	%r8, %rcx
	nopw	%cs:(%rax,%rax)
Source line: 50
L112:
	movupd	-16(%rcx), %xmm2
	movupd	(%rcx), %xmm3
Source line: 4
	addpd	%xmm2, %xmm1
	addpd	%xmm3, %xmm0
Source line: 50
	addq	$32, %rcx
	addq	$-4, %rsi
	jne	L112
Source line: 4
	addpd	%xmm1, %xmm0
	haddpd	%x

In [20]:
j_bench_hand = @benchmark mysum2($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     352.776 μs (0.00% GC)
  median time:      403.750 μs (0.00% GC)
  mean time:        526.741 μs (0.00% GC)
  maximum time:     3.609 ms (0.00% GC)
  --------------
  samples:          8949
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

## Summing in julia using the sum method

In [22]:
@which sum(a)

In [23]:
j_bench = @benchmark sum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     372.941 μs (0.00% GC)
  median time:      421.894 μs (0.00% GC)
  mean time:        450.698 μs (0.00% GC)
  maximum time:     1.724 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [25]:
#mapreduce(identity, +, 1:10)