In [1]:
# Julia package for benchmarking
using BenchmarkTools

# 0) Why Julia ?

Julia is a compiled language which allows users to write code that  runs really fast (as fast as C or Fortran). 

In [2]:
a = rand(10^6);

# C code

In [3]:
C_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    size_t i;
    for (i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

const Clib = tempname()   # make a temporary file

# compile to a shared library by piping C_code to gcc  (need to have gcc installed)
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code) 
end

# define a Julia function that calls the C function:
c_sum(X::Array{Float64}) = ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), length(X), X)

c_sum (generic function with 1 method)

In [5]:
c_sum(a)

500044.8375432291

In [None]:
# type \approx and then <TAB> to get the ≈ symbol
c_sum(a) ≈ sum(a) 

In [27]:
c_bench = @benchmark c_sum(a)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     876.961 μs (0.00% GC)
  median time:      946.424 μs (0.00% GC)
  mean time:        1.005 ms (0.00% GC)
  maximum time:     3.466 ms (0.00% GC)
  --------------
  samples:          4826
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [14]:
println("C: Fastest time was $(minimum(c_bench.times)/1e6) msecs.")

C: Fastest time was 0.877922 msecs.


# Handwritten sum in julia

In [15]:
function mysum(A)   
    s = 0.0  # s = zero(eltype(A))
    for a in A
        s += a
    end
    s
end

mysum (generic function with 1 method)

In [16]:
j_bench_hand = @benchmark mysum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     881.691 μs (0.00% GC)
  median time:      938.196 μs (0.00% GC)
  mean time:        997.885 μs (0.00% GC)
  maximum time:     3.371 ms (0.00% GC)
  --------------
  samples:          4868
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [17]:
# Looking at the compiled code
@code_native mysum(a)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[15]
Source line: 3
	movq	8(%rdi), %rax
	xorpd	%xmm0, %xmm0
	testq	%rax, %rax
	je	L53
	movq	24(%rdi), %rdx
	xorpd	%xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rdx, %rcx
	jae	L54
	movq	(%rdi), %rsi
Source line: 4
	addsd	(%rsi,%rcx,8), %xmm0
Source line: 3
	incq	%rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 6
L53:
	retq
L54:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 3
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	incq	%rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nopl	(%rax)


In [18]:
# Looking at the llvm code
@code_llvm mysum(a)


define double @julia_mysum_66080(i8**) #0 !dbg !5 {
top:
  %1 = getelementptr inbounds i8*, i8** %0, i64 1
  %2 = bitcast i8** %1 to i64*
  %3 = load i64, i64* %2, align 8
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %L14, label %if.lr.ph

if.lr.ph:                                         ; preds = %top
  %5 = getelementptr i8*, i8** %0, i64 3
  %6 = bitcast i8** %5 to i64*
  %7 = load i64, i64* %6, align 8
  %8 = bitcast i8** %0 to double**
  br label %if

if:                                               ; preds = %if.lr.ph, %idxend
  %s.06 = phi double [ 0.000000e+00, %if.lr.ph ], [ %16, %idxend ]
  %"#temp#.05" = phi i64 [ 1, %if.lr.ph ], [ %15, %idxend ]
  %9 = add i64 %"#temp#.05", -1
  %10 = icmp ult i64 %9, %7
  br i1 %10, label %idxend, label %oob

L14.loopexit:                                     ; preds = %idxend
  br label %L14

L14:                                              ; preds = %L14.loopexit, %top
  %s.0.lcssa = phi double [ 0.000000e+00, %top ], [ %16, %L14.loopex

## Making our fuction faster than C :D

In [19]:
function mysum2(A)   
    s = 0.0  # s = zero(eltype(A))
    @simd for a in A
        s += a
    end
    s
end

mysum2 (generic function with 1 method)

In [20]:
j_bench_hand = @benchmark mysum2($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     352.776 μs (0.00% GC)
  median time:      403.750 μs (0.00% GC)
  mean time:        526.741 μs (0.00% GC)
  maximum time:     3.609 ms (0.00% GC)
  --------------
  samples:          8949
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

## Summing in julia using the sum method

In [22]:
@which sum(a)

In [23]:
j_bench = @benchmark sum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     372.941 μs (0.00% GC)
  median time:      421.894 μs (0.00% GC)
  mean time:        450.698 μs (0.00% GC)
  maximum time:     1.724 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [25]:
#mapreduce(identity, +, 1:10)