In [2]:
# Test data shared by every benchmark below: 10^7 random Float64 values.
# The trailing semicolon suppresses the notebook's display of the array.
a = rand(10^7);

In [3]:
using BenchmarkTools  # Julia package for benchmarking

# C code

In [24]:
# C source for a plain left-to-right summation loop, kept in a Julia string so
# that it can be piped straight into the compiler below.
C_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    size_t i;
    for (i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

# Base path of the shared library, without the platform extension.
# NOTE(review): `tempname()` only generates a unique path; the file itself is
# produced by gcc below. Declared `const` because the `ccall` below names the
# library through this global.
const Clib = tempname()   # temporary path for the compiled library

# compile to a shared library by piping C_code to gcc  (need to have gcc installed)
# `-xc` tells gcc the input is C and the trailing `-` makes it read the source
# from stdin; the output file is `Clib` plus the platform's shared-library
# extension (`Libdl.dlext`).
# NOTE(review): `Libdl` is not imported in the visible cells. On Julia >= 0.7
# this requires `using Libdl` first — confirm against the Julia version in use.
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code) 
end

# define a Julia function that calls the C function:
# `length(X)` is passed as the `size_t n` argument and the array itself as
# `double *X`; the C `double` return value comes back as a Julia `Float64`.
c_sum(X::Array{Float64}) = ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), length(X), X)



c_sum (generic function with 1 method)

In [25]:
c_sum(a)

4.999635364386268e6

In [26]:
c_sum(a) ≈ sum(a) # type \approx and then <TAB> to get the ≈ symbol

true

In [27]:
c_bench = @benchmark c_sum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     9.132 ms (0.00% GC)
  median time:      9.966 ms (0.00% GC)
  mean time:        10.337 ms (0.00% GC)
  maximum time:     15.622 ms (0.00% GC)
  --------------
  samples:          482
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [28]:
# `c_bench.times` holds the per-sample timings in nanoseconds; dividing the
# smallest one by 1e6 converts it to milliseconds for display.
println("C: Fastest time was $(minimum(c_bench.times)/1e6) msecs.")

C: Fastest time was 9.132067 msecs.


In [10]:
# Built-in sum in Julia

In [11]:
@which sum(a)

In [12]:
j_bench = @benchmark sum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.905 ms (0.00% GC)
  median time:      5.256 ms (0.00% GC)
  mean time:        5.439 ms (0.00% GC)
  maximum time:     9.513 ms (0.00% GC)
  --------------
  samples:          912
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# Handwritten sum in Julia

In [14]:
"""
    mysum(A)

Return the sum of the elements of `A`, accumulated one element at a time in
iteration order with a plain `for` loop.

The accumulator starts at the `Float64` literal `0.0`, so an empty `A` yields
`0.0` and the elements are promoted against `Float64` as they are added.
"""
function mysum(A)
    # Element-type-generic alternative for the start value: zero(eltype(A))
    total = 0.0
    for x in A
        total = total + x
    end
    return total
end

mysum (generic function with 1 method)

In [15]:
j_bench_hand = @benchmark mysum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     9.158 ms (0.00% GC)
  median time:      10.625 ms (0.00% GC)
  mean time:        11.189 ms (0.00% GC)
  maximum time:     17.106 ms (0.00% GC)
  --------------
  samples:          446
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [18]:
"""
    mysum2(A)

Return the sum of the elements of `A`, using the same accumulation loop as a
plain handwritten sum but annotated with `@simd`.

`@simd` allows the compiler to reorder the floating-point additions in order to
vectorize the loop, so the result may differ in the last bits from a strict
left-to-right summation. The accumulator starts at the `Float64` literal `0.0`.
"""
function mysum2(A)
    # Element-type-generic alternative for the start value: zero(eltype(A))
    total = 0.0
    @simd for x in A
        total = total + x
    end
    return total
end

mysum2 (generic function with 1 method)

In [19]:
# NOTE(review): this reuses the name `j_bench_hand`, so the benchmark result
# recorded earlier for `mysum` is overwritten by the `mysum2` result.
j_bench_hand = @benchmark mysum2($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.869 ms (0.00% GC)
  median time:      5.075 ms (0.00% GC)
  mean time:        5.220 ms (0.00% GC)
  maximum time:     8.121 ms (0.00% GC)
  --------------
  samples:          952
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [32]:

@code_native mysum(a)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[14]
Source line: 3
	movq	8(%rdi), %rax
	vxorpd	%xmm0, %xmm0, %xmm0
	testq	%rax, %rax
	je	L54
	movq	24(%rdi), %rdx
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rdx, %rcx
	jae	L55
	movq	(%rdi), %rsi
Source line: 4
	vaddsd	(%rsi,%rcx,8), %xmm0, %xmm0
Source line: 3
	addq	$1, %rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 6
L54:
	retq
L55:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 3
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	addq	$1, %rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nop
