
## Simd examples


Run 
```
include(joinpath(dirname(JULIA_HOME),"share","julia","build_sysimg.jl")); build_sysimg(force=true)
```
if julia is not build in source.


In [1]:
using BenchmarkTools
using SIMD
Base.@pure simdwidth(::Type{T}) where {T} = Int(256/8/sizeof(T))

simdwidth (generic function with 1 method)

In [13]:
println("simdwitdth Float16: ", simdwidth(Float16))
println("simdwitdth Float32: ", simdwidth(Float32))
println("simdwitdth Float64: ", simdwidth(Float64))

println("\nsimdwitdth Int16: ", simdwidth(Int16))
println("simdwitdth Int32: ", simdwidth(Int32))
println("simdwitdth Int64: ", simdwidth(Int64))

simdwitdth Float16: 16
simdwitdth Float32: 8
simdwitdth Float64: 4

simdwitdth Int16: 16
simdwitdth Int32: 8
simdwitdth Int64: 4


In [23]:
function mysum(a::Vector)
    total = zero(eltype(a))
    for x in a
        total += x
    end
    return total
end

function mysum_simd(a::Vector)
    total = zero(eltype(a))
    @simd for x in a
        total += x
    end
    return total
end

mysum_simd (generic function with 1 method)

In [24]:
x = rand(Float64 , 100000);

In [25]:
@code_native mysum_simd(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[23]
Source line: 67
	movq	8(%rdi), %rax
Source line: 64
	movq	24(%rdi), %rdx
	xorl	%ecx, %ecx
Source line: 79
	testq	%rdx, %rdx
	cmovnsq	%rdx, %rcx
Source line: 68
	testq	%rax, %rax
	jle	L206
Source line: 79
	leaq	-1(%rcx), %rsi
	cmpq	%rdx, %rsi
	jae	L210
Source line: 50
	movq	(%rdi), %r9
Source line: 66
	leaq	16(%r9), %r8
	movq	%rax, %r10
	andq	$-4, %r10
	pxor	%xmm0, %xmm0
	xorl	%edi, %edi
	jmp	L192
Source line: 71
L64:
	testq	%rax, %rax
	jle	L192
Source line: 50
	cmpq	$4, %rax
	jae	L79
	xorl	%edx, %edx
	jmp	L156
L79:
	movq	%rax, %rdx
	andq	$-4, %rdx
	je	L154
	movq	%xmm0, %xmm1            ## xmm1 = xmm0[0],zero
	pxor	%xmm0, %xmm0
Source line: 74
	movq	%r10, %rsi
	movq	%r8, %rcx
	nopw	%cs:(%rax,%rax)
Source line: 50
L112:
	movupd	-16(%rcx), %xmm2
	movupd	(%rcx), %xmm3
Source line: 13
	addpd	%xmm2, %xmm1
	addpd	%xmm3, %xmm0
Source line: 50
	addq	$32, %rcx
	addq	$-4, %rsi
	jne	L112
Source line: 13
	addpd	%xmm1, %xmm0
	haddpd	

In [26]:
@code_native mysum(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[23]
Source line: 4
	movq	8(%rdi), %rax
	xorpd	%xmm0, %xmm0
	testq	%rax, %rax
	je	L50
	movq	(%rdi), %rdx
	movq	24(%rdi), %rsi
	xorpd	%xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rsi, %rcx
	jae	L51
Source line: 5
	addsd	(%rdx,%rcx,8), %xmm0
Source line: 4
	incq	%rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 7
L50:
	retq
L51:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 4
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	incq	%rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nopl	(%rax)


In [27]:
@benchmark mysum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     91.016 μs (0.00% GC)
  median time:      91.069 μs (0.00% GC)
  mean time:        96.762 μs (0.00% GC)
  maximum time:     527.553 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [22]:
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     24.119 μs (0.00% GC)
  median time:      24.479 μs (0.00% GC)
  mean time:        26.302 μs (0.00% GC)
  maximum time:     123.887 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [44]:
x = rand(Float32 , 100000);
sizeof(x)

400000

In [17]:
@benchmark mysum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.045 μs (0.00% GC)
  median time:      81.218 μs (0.00% GC)
  mean time:        80.775 μs (0.00% GC)
  maximum time:     231.363 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [32]:
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     12.128 μs (0.00% GC)
  median time:      13.784 μs (0.00% GC)
  mean time:        15.891 μs (0.00% GC)
  maximum time:     167.141 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [34]:
x = rand(Float16, 100000);

In [35]:
# TO CHECK
# much slower than Float32, why?
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     1.235 ms (0.00% GC)
  median time:      1.315 ms (0.00% GC)
  mean time:        1.465 ms (0.00% GC)
  maximum time:     3.632 ms (0.00% GC)
  --------------
  samples:          3399
  evals/sample:     1

In [41]:
sizeof(x)

200000

In [45]:
#?sizeof

### indices

In [51]:
A = ones(5,6,7)

5×6×7 Array{Float64,3}:
[:, :, 1] =
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0

[:, :, 2] =
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0

[:, :, 3] =
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0

[:, :, 4] =
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0

[:, :, 5] =
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0

[:, :, 6] =
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0

### Example cost function simd

In [22]:
y =  rand(Float32 , 100000);
y_hat =  rand(Float32 , 100000);

In [36]:
function MSE_simd{T}(y::Vector{T},y_pred::Vector{T})
   cost = zero(eltype(y))
   @simd for i in 1:length(y)
       @inbounds cost += (y[i] - y_pred[i])^2
   end
   return sqrt(cost)
end

function MSE{T}(y::Vector{T},y_pred::Vector{T})
   cost = zero(eltype(y))
    
    @inbounds for i in 1:length(y)
        cost += (y[i] - y_pred[i])^2
    end
    return sqrt(cost)
end

MSE (generic function with 1 method)

In [49]:
@benchmark MSE(y,y_hat)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.087 μs (0.00% GC)
  median time:      81.340 μs (0.00% GC)
  mean time:        82.346 μs (0.00% GC)
  maximum time:     310.952 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [289]:
@benchmark MSE_simd(y, y_hat)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     21.102 μs (0.00% GC)
  median time:      22.396 μs (0.00% GC)
  mean time:        22.892 μs (0.00% GC)
  maximum time:     295.311 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

### cross entropy minibatch

In [58]:
T = Float32
p_y_given_x = rand(T, 10, 256);
onehot_y= zeros(T, 10, 256);


In [59]:
onehot_y

10×256 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

(10, 256)

### median pooling

In [46]:
#https://discourse.julialang.org/t/make-this-code-fast-median-pooling/6405

In [8]:
@inline function median5_swap(a,b,c,d,e)
    # https://github.com/JeffreySarnoff/SortingNetworks.jl/blob/master/src/swapsort.jl
    a,b = minmax(a,b)
    c,d = minmax(c,d)
    a,c = minmax(a,c)
    b,d = minmax(b,d)
    c,e = minmax(e,c)
    max(c, min(e,b))
end

@inline median5(args...) = median5_swap(args...)

function medmedpool55!(out::AbstractMatrix, img::AbstractMatrix)
    @assert size(out, 1) >= size(img, 1) ÷ 5
    @assert size(out, 2) >= size(img, 2) ÷ 5
    @inbounds for j ∈ indices(out)[2]
        @simd for i ∈ indices(out)[1]
            x11 = img[5i-4, 5j-4]
            x21 = img[5i-3, 5j-4]
            x31 = img[5i-2, 5j-4]
            x41 = img[5i-1, 5j-4]
            x51 = img[5i-0, 5j-4]
            
            x12 = img[5i-4, 5j-3]
            x22 = img[5i-3, 5j-3]
            x32 = img[5i-2, 5j-3]
            x42 = img[5i-1, 5j-3]
            x52 = img[5i-0, 5j-3]
            
            x13 = img[5i-4, 5j-2]
            x23 = img[5i-3, 5j-2]
            x33 = img[5i-2, 5j-2]
            x43 = img[5i-1, 5j-2]
            x53 = img[5i-0, 5j-2]
            
            x14 = img[5i-4, 5j-1]
            x24 = img[5i-3, 5j-1]
            x34 = img[5i-2, 5j-1]
            x44 = img[5i-1, 5j-1]
            x54 = img[5i-0, 5j-1]
            
            x15 = img[5i-4, 5j-0]
            x25 = img[5i-3, 5j-0]
            x35 = img[5i-2, 5j-0]
            x45 = img[5i-1, 5j-0]
            x55 = img[5i-0, 5j-0]
            
            y1 = median5(x11,x12,x13,x14,x15)
            y2 = median5(x21,x22,x23,x24,x25)
            y3 = median5(x31,x32,x33,x34,x35)
            y4 = median5(x41,x42,x43,x44,x45)
            y5 = median5(x51,x52,x53,x54,x55)
            
            z = median5(y1,y2,y3,y4,y5)
            out[i,j] = z
        end
    end
    out
end

medmedpool55! (generic function with 1 method)

In [5]:
using BenchmarkTools
imgs = randn(Float32, 1024,1024, 10)
img = view(imgs, :,:,1)
out = similar(img, size(img) .÷ 5)
@benchmark medmedpool55!(out, img)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     8.746 ms (0.00% GC)
  median time:      10.786 ms (0.00% GC)
  mean time:        11.430 ms (0.00% GC)
  maximum time:     47.553 ms (0.00% GC)
  --------------
  samples:          437
  evals/sample:     1

In [13]:
size(imgs),size([rand(T,N) for _ in 1:6])

((1024, 1024, 10), (6,))

In [None]:
Base.@pure simdwidth(::Type{T}) where {T} = Int(256/8/sizeof(T))

@inline function median3(a,b,c)
    max(min(a,b), min(c,max(a,b)))
end

@inline function median5(a,b,c,d,e)
    # https://stackoverflow.com/questions/480960/code-to-calculate-median-of-five-in-c-sharp
    f=max(min(a,b),min(c,d))
    g=min(max(a,b),max(c,d))
    median3(e,f,g)
end

@noinline function median5_vectors!(out, a,b,c,d,e)
    K = simdwidth(eltype(out))
    N = length(out)
    T = eltype(out)
    V = Vec{K,T}
    @assert mod(N,K) == 0

    @inbounds for i in 1:K:N
        va = vload(V,a, i)
        vb = vload(V,b, i)
        vc = vload(V,c, i)
        vd = vload(V,d, i)
        ve = vload(V,e, i)
        vo = median5(va,vb,vc,vd,ve)
        vstore(vo,out, i)
    end
    out
end

using BenchmarkTools
T = UInt8
T = Float32
N = 10^6
N = N ÷ simdwidth(T) * simdwidth(T)
out, a,b,c,d,e = [rand(T,N) for _ in 1:6]
@benchmark median5_vectors!(out, a,b,c,d,e)


In [47]:
simdwidth(Float16)

16

### Test views

In [304]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

10×5 Array{Float64,2}:
  2.0   2.0   2.0   2.0   2.0
  3.0   3.0   3.0   3.0   3.0
  4.0   4.0   4.0   4.0   4.0
  5.0   5.0   5.0   5.0   5.0
  6.0   6.0   6.0   6.0   6.0
  7.0   7.0   7.0   7.0   7.0
  8.0   8.0   8.0   8.0   8.0
  9.0   9.0   9.0   9.0   9.0
 10.0  10.0  10.0  10.0  10.0
 11.0  11.0  11.0  11.0  11.0

In [305]:
v = view(X2,1,:)
for i in 1:size(X2,1)
    v = view(X2,i,:)
    X2[i,:] += v
end
X2

10×5 Array{Float64,2}:
  4.0   4.0   4.0   4.0   4.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [302]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

10×5 Array{Float64,2}:
  2.0   2.0   2.0   2.0   2.0
  3.0   3.0   3.0   3.0   3.0
  4.0   4.0   4.0   4.0   4.0
  5.0   5.0   5.0   5.0   5.0
  6.0   6.0   6.0   6.0   6.0
  7.0   7.0   7.0   7.0   7.0
  8.0   8.0   8.0   8.0   8.0
  9.0   9.0   9.0   9.0   9.0
 10.0  10.0  10.0  10.0  10.0
 11.0  11.0  11.0  11.0  11.0

In [303]:
v = view(X2,1,:)
for i in 1:size(X2,1)
    v .= view(X2,i,:)
    X2[i,:] += v
end

X2

10×5 Array{Float64,2}:
 11.0  11.0  11.0  11.0  11.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [325]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

v = deepcopy(view(X2,1,:))
for i in 1:size(X2,1)
    v .= view(X2,i,:)
    X2[i,:] += v
end
X2

10×5 Array{Float64,2}:
  4.0   4.0   4.0   4.0   4.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [324]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

v = zeros(X2[1,:])
for i in 1:size(X2,1)
    v .= view(X2,i,:)
    X2[i,:] += v
end
X2

10×5 Array{Float64,2}:
  4.0   4.0   4.0   4.0   4.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [315]:
@time v = deepcopy(view(X2,1,:))

  0.000056 seconds (37 allocations: 1.813 KiB)


5-element SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true}:
 4.0
 4.0
 4.0
 4.0
 4.0

In [None]:
@time v = deepcopy(view(X2,1,:))

In [313]:
@time auxiliar = view(X2,1,:)

  0.000043 seconds (21 allocations: 512 bytes)


5-element SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true}:
 4.0
 4.0
 4.0
 4.0
 4.0