
## Simd examples


Run 
```
include(joinpath(dirname(JULIA_HOME),"share","julia","build_sysimg.jl")); build_sysimg(force=true)
```
if julia is not build in source.


In [11]:
using BenchmarkTools

In [1]:

function mysum(a::Vector)
    total = zero(eltype(a))
    for x in a
        total += x
    end
    return total
end

function mysum_simd(a::Vector)
    total = zero(eltype(a))
    @simd for x in a
        total += x
    end
    return total
end

mysum_simd (generic function with 1 method)

In [3]:
x = rand(Float64 , 100000);

In [4]:
@code_native mysum_simd(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[1]
Source line: 67
	movq	8(%rdi), %rax
Source line: 64
	movq	24(%rdi), %rdx
	xorl	%ecx, %ecx
Source line: 79
	testq	%rdx, %rdx
	cmovnsq	%rdx, %rcx
Source line: 68
	testq	%rax, %rax
	jle	L305
Source line: 79
	leaq	-1(%rcx), %rsi
	cmpq	%rdx, %rsi
	jae	L313
Source line: 50
	movq	(%rdi), %r9
Source line: 66
	leaq	96(%r9), %r8
	movq	%rax, %r10
	andq	$-16, %r10
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%edi, %edi
Source line: 50
	vxorpd	%ymm1, %ymm1, %ymm1
Source line: 66
	jmp	L288
	nopw	%cs:(%rax,%rax)
Source line: 71
L80:
	testq	%rax, %rax
	jle	L288
Source line: 50
	cmpq	$16, %rax
	jae	L102
	xorl	%edx, %edx
	jmp	L253
L102:
	movq	%rax, %rdx
	andq	$-16, %rdx
	je	L251
	vblendpd	$1, %ymm0, %ymm1, %ymm0 ## ymm0 = ymm0[0],ymm1[1,2,3]
	vxorpd	%ymm2, %ymm2, %ymm2
Source line: 74
	movq	%r10, %rsi
	movq	%r8, %rcx
	vxorpd	%ymm3, %ymm3, %ymm3
	vxorpd	%ymm4, %ymm4, %ymm4
	nopl	(%rax,%rax)
Source line: 50
L144:
	vmovupd	-96(%rcx), %xmm5
	vmovupd	-64

In [5]:
@code_native mysum(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[1]
Source line: 4
	movq	8(%rdi), %rax
	vxorpd	%xmm0, %xmm0, %xmm0
	testq	%rax, %rax
	je	L50
	movq	(%rdi), %rdx
	movq	24(%rdi), %rsi
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rsi, %rcx
	jae	L51
Source line: 5
	vaddsd	(%rdx,%rcx,8), %xmm0, %xmm0
Source line: 4
	incq	%rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 7
L50:
	retq
L51:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 4
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	incq	%rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nopl	(%rax)


In [12]:
@benchmark mysum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.049 μs (0.00% GC)
  median time:      81.257 μs (0.00% GC)
  mean time:        80.911 μs (0.00% GC)
  maximum time:     204.574 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [15]:
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     21.971 μs (0.00% GC)
  median time:      23.424 μs (0.00% GC)
  mean time:        23.785 μs (0.00% GC)
  maximum time:     106.059 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [16]:
x = rand(Float32 , 100000);

In [17]:
@benchmark mysum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.045 μs (0.00% GC)
  median time:      81.218 μs (0.00% GC)
  mean time:        80.775 μs (0.00% GC)
  maximum time:     231.363 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [19]:
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     10.932 μs (0.00% GC)
  median time:      11.602 μs (0.00% GC)
  mean time:        11.816 μs (0.00% GC)
  maximum time:     78.097 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

### Example cost function simd

In [22]:
y =  rand(Float32 , 100000);
y_hat =  rand(Float32 , 100000);

In [36]:
function MSE_simd{T}(y::Vector{T},y_pred::Vector{T})
   cost = zero(eltype(y))
    
   @simd for i in 1:length(y)
       @inbounds cost += (y[i] - y_pred[i])^2
   end
   return sqrt(cost)
end

function MSE{T}(y::Vector{T},y_pred::Vector{T})
   cost = zero(eltype(y))
    
    @inbounds for i in 1:length(y)
        cost += (y[i] - y_pred[i])^2
    end
    return sqrt(cost)
end

MSE (generic function with 1 method)

In [49]:
@benchmark MSE(y,y_hat)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.087 μs (0.00% GC)
  median time:      81.340 μs (0.00% GC)
  mean time:        82.346 μs (0.00% GC)
  maximum time:     310.952 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [289]:
@benchmark MSE_simd(y, y_hat)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     21.102 μs (0.00% GC)
  median time:      22.396 μs (0.00% GC)
  mean time:        22.892 μs (0.00% GC)
  maximum time:     295.311 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

### Test views

In [291]:
X = ones(200,5)
x = Array(1:200);
X2 = x.+X



200×5 Array{Float64,2}:
   2.0    2.0    2.0    2.0    2.0
   3.0    3.0    3.0    3.0    3.0
   4.0    4.0    4.0    4.0    4.0
   5.0    5.0    5.0    5.0    5.0
   6.0    6.0    6.0    6.0    6.0
   7.0    7.0    7.0    7.0    7.0
   8.0    8.0    8.0    8.0    8.0
   9.0    9.0    9.0    9.0    9.0
  10.0   10.0   10.0   10.0   10.0
  11.0   11.0   11.0   11.0   11.0
  12.0   12.0   12.0   12.0   12.0
  13.0   13.0   13.0   13.0   13.0
  14.0   14.0   14.0   14.0   14.0
   ⋮                              
 190.0  190.0  190.0  190.0  190.0
 191.0  191.0  191.0  191.0  191.0
 192.0  192.0  192.0  192.0  192.0
 193.0  193.0  193.0  193.0  193.0
 194.0  194.0  194.0  194.0  194.0
 195.0  195.0  195.0  195.0  195.0
 196.0  196.0  196.0  196.0  196.0
 197.0  197.0  197.0  197.0  197.0
 198.0  198.0  198.0  198.0  198.0
 199.0  199.0  199.0  199.0  199.0
 200.0  200.0  200.0  200.0  200.0
 201.0  201.0  201.0  201.0  201.0

In [286]:
X = ones(200,5)
x = Array(1:200);
X2 = x.+X

println(X2[1:5,:],"\n")

v = view(X2,1,:)
for i in 1:size(X2,1)
    v = view(X2,i,:)
    X2[i,:] += v
end

println(X2[1:5,:])



[2.0 2.0 2.0 2.0 2.0; 3.0 3.0 3.0 3.0 3.0; 4.0 4.0 4.0 4.0 4.0; 5.0 5.0 5.0 5.0 5.0; 6.0 6.0 6.0 6.0 6.0]

[4.0 4.0 4.0 4.0 4.0; 6.0 6.0 6.0 6.0 6.0; 8.0 8.0 8.0 8.0 8.0; 10.0 10.0 10.0 10.0 10.0; 12.0 12.0 12.0 12.0 12.0]


In [287]:
X2

200×5 Array{Float64,2}:
   4.0    4.0    4.0    4.0    4.0
   6.0    6.0    6.0    6.0    6.0
   8.0    8.0    8.0    8.0    8.0
  10.0   10.0   10.0   10.0   10.0
  12.0   12.0   12.0   12.0   12.0
  14.0   14.0   14.0   14.0   14.0
  16.0   16.0   16.0   16.0   16.0
  18.0   18.0   18.0   18.0   18.0
  20.0   20.0   20.0   20.0   20.0
  22.0   22.0   22.0   22.0   22.0
  24.0   24.0   24.0   24.0   24.0
  26.0   26.0   26.0   26.0   26.0
  28.0   28.0   28.0   28.0   28.0
   ⋮                              
 380.0  380.0  380.0  380.0  380.0
 382.0  382.0  382.0  382.0  382.0
 384.0  384.0  384.0  384.0  384.0
 386.0  386.0  386.0  386.0  386.0
 388.0  388.0  388.0  388.0  388.0
 390.0  390.0  390.0  390.0  390.0
 392.0  392.0  392.0  392.0  392.0
 394.0  394.0  394.0  394.0  394.0
 396.0  396.0  396.0  396.0  396.0
 398.0  398.0  398.0  398.0  398.0
 400.0  400.0  400.0  400.0  400.0
 402.0  402.0  402.0  402.0  402.0

In [282]:
X = ones(200,5)
x = Array(1:200);
X2 = x.+X

println(X2[1:5,:],"\n")

v = view(X2,1,:)
for i in 1:size(X2,1)
    v .= view(X2,i,:)
    X2[i,:] += v
end

println(X2[1:5,:])


[2.0 2.0 2.0 2.0 2.0; 3.0 3.0 3.0 3.0 3.0; 4.0 4.0 4.0 4.0 4.0; 5.0 5.0 5.0 5.0 5.0; 6.0 6.0 6.0 6.0 6.0]

[201.0 201.0 201.0 201.0 201.0; 6.0 6.0 6.0 6.0 6.0; 8.0 8.0 8.0 8.0 8.0; 10.0 10.0 10.0 10.0 10.0; 12.0 12.0 12.0 12.0 12.0]


In [283]:
X2

200×5 Array{Float64,2}:
 201.0  201.0  201.0  201.0  201.0
   6.0    6.0    6.0    6.0    6.0
   8.0    8.0    8.0    8.0    8.0
  10.0   10.0   10.0   10.0   10.0
  12.0   12.0   12.0   12.0   12.0
  14.0   14.0   14.0   14.0   14.0
  16.0   16.0   16.0   16.0   16.0
  18.0   18.0   18.0   18.0   18.0
  20.0   20.0   20.0   20.0   20.0
  22.0   22.0   22.0   22.0   22.0
  24.0   24.0   24.0   24.0   24.0
  26.0   26.0   26.0   26.0   26.0
  28.0   28.0   28.0   28.0   28.0
   ⋮                              
 380.0  380.0  380.0  380.0  380.0
 382.0  382.0  382.0  382.0  382.0
 384.0  384.0  384.0  384.0  384.0
 386.0  386.0  386.0  386.0  386.0
 388.0  388.0  388.0  388.0  388.0
 390.0  390.0  390.0  390.0  390.0
 392.0  392.0  392.0  392.0  392.0
 394.0  394.0  394.0  394.0  394.0
 396.0  396.0  396.0  396.0  396.0
 398.0  398.0  398.0  398.0  398.0
 400.0  400.0  400.0  400.0  400.0
 402.0  402.0  402.0  402.0  402.0