In [2]:
a = rand(10^6);

In [3]:
using BenchmarkTools  # Julia package for benchmarking

# C code

In [15]:
# C source, held as a Julia string, for a plain loop that adds up the n doubles in X.
C_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    size_t i;
    for (i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

const Clib = tempname()   # temporary path, used below as the base name of the shared library

# Compile to a shared library by piping C_code to gcc (gcc must be installed).
# The output file is Clib plus the platform's shared-library extension (Libdl.dlext).
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code) 
end

# Define a Julia method that calls the compiled C function: the element count is passed
# as a Csize_t, the array as a Ptr{Float64}, and the C return value comes back as a Float64.
# The library is referred to by Clib, i.e. the path without the extension.
c_sum(X::Array{Float64}) = ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), length(X), X)



c_sum (generic function with 2 methods)

In [16]:
c_sum(a)

500003.8077172224

In [26]:
# type \approx and then <TAB> to get the ≈ symbol
c_sum(a) ≈ sum(a) 

true

In [23]:
c_bench = @benchmark c_sum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     921.145 μs (0.00% GC)
  median time:      1.086 ms (0.00% GC)
  mean time:        1.145 ms (0.00% GC)
  maximum time:     8.279 ms (0.00% GC)
  --------------
  samples:          4209
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [24]:
println("C: Fastest time was $(minimum(c_bench.times)/1e6) msecs.")

C: Fastest time was 0.921145 msecs.


# Handwritten sum in Julia

In [32]:
"""
    mysum(A)

Add up the elements of `A` one at a time, in iteration order, starting from `0.0`.

The accumulator starts as a `Float64` literal rather than `zero(eltype(A))`, so an
empty `A` yields `0.0`.
"""
function mysum(A)
    total = 0.0
    for x in A
        total = total + x
    end
    return total
end

mysum (generic function with 1 method)

In [33]:
j_bench_hand = @benchmark mysum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     925.433 μs (0.00% GC)
  median time:      1.024 ms (0.00% GC)
  mean time:        1.054 ms (0.00% GC)
  maximum time:     2.604 ms (0.00% GC)
  --------------
  samples:          4603
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [35]:
# Looking at the compiled code
@code_native mysum(a)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[32]
Source line: 3
	movq	8(%rdi), %rax
	vxorpd	%xmm0, %xmm0, %xmm0
	testq	%rax, %rax
	je	L54
	movq	24(%rdi), %rdx
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rdx, %rcx
	jae	L55
	movq	(%rdi), %rsi
Source line: 4
	vaddsd	(%rsi,%rcx,8), %xmm0, %xmm0
Source line: 3
	addq	$1, %rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 6
L54:
	retq
L55:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 3
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	addq	$1, %rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nop


In [36]:
# Looking at the llvm code
@code_llvm mysum(a)


define double @julia_mysum_63286(%jl_value_t*) #0 !dbg !5 {
top:
  %1 = getelementptr inbounds %jl_value_t, %jl_value_t* %0, i64 1
  %2 = bitcast %jl_value_t* %1 to i64*
  %3 = load i64, i64* %2, align 8
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %L14, label %if.lr.ph

if.lr.ph:                                         ; preds = %top
  %5 = getelementptr inbounds %jl_value_t, %jl_value_t* %0, i64 3, i32 0
  %6 = bitcast %jl_value_t** %5 to i64*
  %7 = load i64, i64* %6, align 8
  %8 = bitcast %jl_value_t* %0 to double**
  br label %if

if:                                               ; preds = %if.lr.ph, %idxend
  %s.06 = phi double [ 0.000000e+00, %if.lr.ph ], [ %16, %idxend ]
  %"#temp#.05" = phi i64 [ 1, %if.lr.ph ], [ %15, %idxend ]
  %9 = add i64 %"#temp#.05", -1
  %10 = icmp ult i64 %9, %7
  br i1 %10, label %idxend, label %oob

L14.loopexit:                                     ; preds = %idxend
  br label %L14

L14:                                              ; preds = %L14.lo

## Making our function faster than C :D

In [38]:
"""
    mysum2(A)

Add up the elements of `A` starting from `0.0`, with the loop annotated by `@simd`.

`@simd` gives the compiler permission to reorder the floating-point additions so the
loop can be vectorized; apart from that annotation the loop is the same as a plain
element-by-element accumulation.
"""
function mysum2(A)
    total = 0.0
    @simd for x in A
        total = total + x
    end
    return total
end

mysum2 (generic function with 1 method)

In [39]:
# Benchmark the @simd version; `$a` interpolates the array into the benchmark expression.
# NOTE(review): this rebinds j_bench_hand, so the mysum benchmark result stored under the
# same name in the earlier cell is no longer reachable afterwards — confirm that is intended.
j_bench_hand = @benchmark mysum2($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     431.442 μs (0.00% GC)
  median time:      436.264 μs (0.00% GC)
  mean time:        449.759 μs (0.00% GC)
  maximum time:     1.324 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

## Summing in Julia using the built-in sum function

In [30]:
mapreduce(identity, +, 1:50)

1275

In [11]:
@which sum(a)

In [40]:
j_bench = @benchmark sum($a)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     441.568 μs (0.00% GC)
  median time:      450.363 μs (0.00% GC)
  mean time:        469.606 μs (0.00% GC)
  maximum time:     1.905 ms (0.00% GC)
  --------------
  samples:          9968
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%