In [1]:
"""
    myfunc(a, b)

Return `a - b` when `a > b`, and `a + b` otherwise.
"""
function myfunc(a, b)
    return a > b ? a - b : a + b
end

myfunc (generic function with 1 method)

In [2]:
x = rand(1_000_000);
myfunc.(x, 2.)

1000000-element Array{Float64,1}:
 2.271677857127897 
 2.080272181861851 
 2.4434634917141627
 2.9805796127286985
 2.3191752312029523
 2.812277243337994 
 2.0366754594679346
 2.6286370461069577
 2.668748086416778 
 2.744535861207713 
 2.678223119620112 
 2.848598400930946 
 2.4525590326233675
 ⋮                 
 2.814048247673465 
 2.1078465581719765
 2.9442976616479504
 2.8165610835410853
 2.7654094328624046
 2.01369123842691  
 2.883660511951012 
 2.1416111535634115
 2.8384509199451693
 2.680449936361655 
 2.2001273015642764
 2.735075431751395 

In [3]:
using BenchmarkTools

In [None]:
@btime myfunc.(x, 2.);

In [5]:
"""
    myfuncloop(x, value)

Explicit-loop equivalent of broadcasting `myfunc.(x, value)`.

For every element `xk` of `x`, store `xk - value` when `xk > value` and
`xk + value` otherwise.

# Arguments
- `x`: collection of numbers; its `eltype` and `length` determine the result.
- `value`: scalar compared against, and combined with, each element.

# Returns
A new zero-initialised `Vector{eltype(x)}` of length `length(x)` holding the results.
"""
function myfuncloop(x, value)
    result = zeros(eltype(x), length(x))

    for (k, xk) in enumerate(x)
        # Same branch direction as `myfunc`: subtract when the element is larger.
        if xk > value
            result[k] = xk - value
        else
            result[k] = xk + value
        end
    end
    return result
end

myfuncloop (generic function with 1 method)

In [6]:
x = rand(1000_000);

In [7]:
@btime myfuncloop(x, 2.);

  3.743 ms (2 allocations: 7.63 MiB)


In [8]:
"""
    myfuncloop2(x, value)

Loop version of `myfunc.(x, value)` that skips zero-initialisation of the output
and disables bounds checking inside the loop.

For every element `xk` of `x`, store `xk - value` when `xk > value` and
`xk + value` otherwise.

# Arguments
- `x`: indexable collection of numbers (indexed with `1:length(x)`).
- `value`: scalar compared against, and combined with, each element.

# Returns
A new `Vector{eltype(x)}` of length `length(x)` holding the results.
"""
function myfuncloop2(x, value)
    result = Array{eltype(x)}(undef, length(x))

    @inbounds for k in eachindex(result)
        xk = x[k]
        # Assign rather than accumulate: `result` was allocated with `undef`, so its
        # previous contents are arbitrary and must never be read.
        if xk > value
            result[k] = xk - value
        else
            result[k] = xk + value
        end
    end
    return result
end

myfuncloop2 (generic function with 1 method)

In [9]:
@btime myfuncloop2(x, 2.);

  3.035 ms (2 allocations: 7.63 MiB)


In [10]:
"""
    myfuncloop3(x, value)

Apply `myfunc(x[k], value)` to every element of `x` in an explicit loop and
return the results in a freshly allocated `Vector{eltype(x)}`.

Bounds checking is disabled inside the loop.
"""
function myfuncloop3(x, value)
    out = Vector{eltype(x)}(undef, length(x))

    @inbounds for idx in eachindex(out)
        out[idx] = myfunc(x[idx], value)
    end
    return out
end

myfuncloop3 (generic function with 1 method)

In [11]:
@btime myfuncloop3(x, 2.);

  2.520 ms (2 allocations: 7.63 MiB)


In [12]:
"""
    myfuncloop4(x, value)

Loop over `x` and store `x[k] - value` when `x[k] > value`, otherwise
`x[k] + value`, into a freshly allocated `Vector{eltype(x)}`.

Bounds checking is disabled inside the loop.
"""
function myfuncloop4(x, value)
    out = Vector{eltype(x)}(undef, length(x))

    @inbounds for idx in eachindex(out)
        if x[idx] > value
            out[idx] = x[idx] - value
        else
            out[idx] = x[idx] + value
        end
    end
    return out
end

myfuncloop4 (generic function with 1 method)

### using simd

In [13]:
"""
    myfuncloop5(x, value)

Same element-wise rule as `myfuncloop4` (`x[k] - value` when `x[k] > value`,
otherwise `x[k] + value`), with the loop additionally annotated with `@simd`.

Returns a freshly allocated `Vector{eltype(x)}` of length `length(x)`.
"""
function myfuncloop5(x, value)
    out = Vector{eltype(x)}(undef, length(x))

    @inbounds @simd for idx in eachindex(out)
        xi = x[idx]
        out[idx] = xi > value ? xi - value : xi + value
    end
    return out
end

myfuncloop5 (generic function with 1 method)

In [14]:
@btime myfuncloop5(x, 2.);

  2.620 ms (2 allocations: 7.63 MiB)


In [115]:
#code_warntypetive myfuncloop5(x, 2.)

In [193]:
# First attempt at vectorizing the conditional update by indexing with a VecRange.
# The output recorded for this cell is a TypeError (see the notes in the loop).
T = eltype(x)
value = T(2)
result =  Array{T}(undef, length(x))  # `undef`: the contents are not initialized
# NOTE(review): `N` is not assigned in this cell; it has to exist in the session already.
lane = VecRange{N}(0)

@inbounds for i in 1:N:length(x)
    # The comparison on a VecRange slice produces a vector of Bools (the recorded
    # error reports Vec{8,Bool}), which the scalar ternary `? :` cannot branch on.
    # NOTE(review): `k` is not defined in this cell (the loop variable is `i`), and
    # `+=` reads `result` before anything has been stored in it.
    result[lane + i] += x[lane+i] > value ? x[k+lane] - value : x[k+lane] +value
end

TypeError: TypeError: non-boolean (Vec{8,Bool}) used in boolean context

In [194]:
N = 8
lane = VecRange{N}(1)

VecRange{8}(1)

In [149]:
xs = ones(100);

In [153]:
typeof(xs[lane])

Vec{8,Float64}

## simdify example

https://github.com/eschnett/SIMD.jl/pull/37

In [102]:
using SIMD

In [103]:
using Random
rng = MersenneTwister(1234);
T = Float32
N = 4
x = 8 * rand(rng, T, 100);
res = zeros(T,100);
x[1:5]

5-element Array{Float32,1}:
 0.08616257
 7.0902348 
 5.547675  
 1.8028164 
 1.6655989 

In [104]:
v = vload(Vec{N, Float32}, x, 1)

<4 x Float32>[0.08616257, 7.0902348, 5.547675, 1.8028164]

In [105]:
value = 2
aux = vifelse(v > value, v - value, v + value)

<4 x Float32>[2.0861626, 5.0902348, 3.5476751, 3.8028164]

In [106]:
res[1:10]

10-element Array{Float32,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [107]:
N           = 4
lane        = VecRange{N}(0)
v = vload(Vec{N, Float32}, x, 1)
res[lane+1] = vifelse(v > value, v - value, v + value)

<4 x Float32>[2.0861626, 5.0902348, 3.5476751, 3.8028164]

In [108]:
res[1:10]

10-element Array{Float32,1}:
 2.0861626
 5.0902348
 3.5476751
 3.8028164
 0.0      
 0.0      
 0.0      
 0.0      
 0.0      
 0.0      

In [109]:
x[1:5]

5-element Array{Float32,1}:
 0.08616257
 7.0902348 
 5.547675  
 1.8028164 
 1.6655989 

In [110]:
myfunc.(x[1:N], value)

4-element Array{Float32,1}:
 2.0861626
 5.0902348
 3.5476751
 3.8028164

In [111]:
res[1:N]

4-element Array{Float32,1}:
 2.0861626
 5.0902348
 3.5476751
 3.8028164

Going for the next slice

We have seen how the first 4 elements have been modified with a single instruction

In [112]:
N

4

In [113]:
v = vload(Vec{N, Float32}, x, N+1)
res[lane+N+1] = vifelse(v > value, v - value, v + value)

<4 x Float32>[3.6655989, 0.33736324, 2.7522717, 1.8484411]

In [114]:
res[1:10]

10-element Array{Float32,1}:
 2.0861626 
 5.0902348 
 3.5476751 
 3.8028164 
 3.6655989 
 0.33736324
 2.7522717 
 1.8484411 
 0.0       
 0.0       

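We can check that the result matches `myfunc` for the first 8 elements (the last two of the ten compared entries have not been filled in yet, hence the two `false` values).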

In [115]:
res[1:10] .== myfunc.(x[1:10], value)

10-element BitArray{1}:
  true
  true
  true
  true
  true
  true
  true
  true
 false
 false

We can do it one last time.

In [116]:
v = vload(Vec{N, Float32}, x, 2*N+1)
res[lane+2*N+1] = vifelse(v > value, v - value, v + value)

<4 x Float32>[0.05822563, 2.6386538, 2.3352118, 2.515665]

In [117]:
res[1:12] .== myfunc.(x[1:12], value)

12-element BitArray{1}:
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true

#### make a function

In [163]:
"""
    myfunc_simd(x::Vector{T}, value::T, ::Type{Vec{N,T}}) where {N, T}

Explicitly vectorized version of `myfunc.(x, value)`: process `x` in slices of `N`
elements and, lane by lane, store `x - value` where `x > value` and `x + value`
elsewhere.

# Arguments
- `x`: input vector; its length must be a multiple of `N`.
- `value`: scalar threshold/offset, of the same element type as `x`.
- `Vec{N,T}`: SIMD vector type selecting the slice width `N`.

# Returns
A new `Vector{T}` of length `length(x)`.

# Throws
- `AssertionError` if `length(x)` is not a multiple of `N`.
"""
function myfunc_simd(x::Vector{T}, value::T, ::Type{Vec{N,T}}) where {N, T}
    # The `@inbounds` loop below relies on every slice being a full N-wide slice.
    @assert length(x) % N == 0
    result = Array{T}(undef, length(x))
    lane   = VecRange{N}(0)
    @inbounds for i in 1:N:length(x)
        x_vslice = vload(Vec{N, T}, x, i) # i = N*k + 1 where k = 0, 1, 2, ...
        # Compare against `value` (not a literal) so the threshold follows the argument.
        result[lane + i] = vifelse(x_vslice > value, x_vslice - value, x_vslice + value)
    end
    return result
end

myfunc_simd (generic function with 1 method)

In [164]:
r = myfunc_simd(x, Float32(value), Vec{4,Float32});

In [165]:
value

2

In [166]:
x[1:8]

8-element Array{Float32,1}:
 0.4665209 
 0.73454046
 0.6330079 
 0.11965513
 0.63845193
 0.6390426 
 0.1782192 
 0.7924398 

In [167]:
r[1:8]

8-element Array{Float32,1}:
 2.4665208
 2.7345405
 2.633008 
 2.1196551
 2.638452 
 2.6390426
 2.1782193
 2.79244  

In [168]:
myfunc.(x,Float32(value))[1:8]

8-element Array{Float32,1}:
 2.4665208
 2.7345405
 2.633008 
 2.1196551
 2.638452 
 2.6390426
 2.1782193
 2.79244  

In [169]:
r[1:30] .== myfunc.(x,Float32(value))[1:30]

30-element BitArray{1}:
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
    ⋮
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true
 true

In [175]:
x = rand(Float32,1_000_000);

In [190]:
"""
    myfunc_simd(x::Vector{T}, value::T, ::Type{Vec{N,T}}) where {N, T}

Explicitly vectorized version of `myfunc.(x, value)`: process `x` in slices of `N`
elements and, lane by lane, store `x - value` where `x > value` and `x + value`
elsewhere.

# Arguments
- `x`: input vector; its length must be a multiple of `N`.
- `value`: scalar threshold/offset, of the same element type as `x`.
- `Vec{N,T}`: SIMD vector type selecting the slice width `N`.

# Returns
A new `Vector{T}` of length `length(x)`.

# Throws
- `AssertionError` if `length(x)` is not a multiple of `N`.
"""
function myfunc_simd(x::Vector{T}, value::T, ::Type{Vec{N,T}}) where {N, T}
    # The `@inbounds` loop below relies on every slice being a full N-wide slice.
    @assert length(x) % N == 0
    result = Array{T}(undef, length(x))
    lane   = VecRange{N}(0)
    @inbounds for i in 1:N:length(x)
        x_vslice = vload(Vec{N, T}, x, i) # i = N*k + 1 where k = 0, 1, 2, ...
        # Compare against `value` (not a literal) so the threshold follows the argument.
        result[lane + i] = vifelse(x_vslice > value, x_vslice - value, x_vslice + value)
    end
    return result
end

myfunc_simd (generic function with 1 method)

In [176]:
result_1 = myfunc.(x,1);

In [189]:
result_2 = myfunc_simd(x, Float32(1), Vec{8,Float32});

In [191]:
result_1 == result_2

true

In [201]:
@btime myfunc.(x,1);

  1.360 ms (4 allocations: 3.81 MiB)


In [202]:
@btime myfunc_simd(x, Float32(1), Vec{8,Float32});

  693.396 μs (2 allocations: 3.81 MiB)


The assert may cost some time, so here is the same function without it (the timings measured below differ by only about 1%):

In [195]:
"""
    myfunc_simd2(x::Vector{T}, value::T, ::Type{Vec{N,T}}) where {N, T}

Variant of `myfunc_simd` without the length assertion. `x` is processed in slices
of `N` elements; lane by lane, `x - value` is stored where `x > value` and
`x + value` elsewhere. Elements left over when `length(x)` is not a multiple of
`N` are handled one at a time with the same rule.

# Arguments
- `x`: input vector of any length.
- `value`: scalar threshold/offset, of the same element type as `x`.
- `Vec{N,T}`: SIMD vector type selecting the slice width `N`.

# Returns
A new `Vector{T}` of length `length(x)`.
"""
function myfunc_simd2(x::Vector{T}, value::T, ::Type{Vec{N,T}}) where {N, T}
    n      = length(x)
    result = Array{T}(undef, n)
    lane   = VecRange{N}(0)
    nfull  = n - n % N  # last index covered by complete N-wide slices
    @inbounds for i in 1:N:nfull
        x_vslice = vload(Vec{N, T}, x, i) # i = N*k + 1 where k = 0, 1, 2, ...
        # Compare against `value` (not a literal) so the threshold follows the argument.
        result[lane + i] = vifelse(x_vslice > value, x_vslice - value, x_vslice + value)
    end
    # Scalar tail: without it, a partial last slice would be read and written out of
    # bounds under `@inbounds`.
    @inbounds for i in (nfull + 1):n
        xi = x[i]
        result[i] = xi > value ? xi - value : xi + value
    end
    return result
end

myfunc_simd2 (generic function with 1 method)

In [196]:
@btime myfunc_simd2(x, Float32(1), Vec{8,Float32});

  684.196 μs (2 allocations: 3.81 MiB)


### vadd example

In [161]:
using SIMD

In [162]:
"""
    vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}

Add `ys` to `xs` in place, `N` elements at a time, using `VecRange{N}` indexing.

Both vectors must have the same length, and that length must be a multiple of `N`;
otherwise an `AssertionError` is raised. Returns `nothing`.
"""
function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
    @assert length(ys) == length(xs)
    @assert length(xs) % N == 0
    offsets = VecRange{N}(0)
    @inbounds for start in 1:N:length(xs)
        slice = offsets + start  # the N consecutive indices start, …, start + N - 1
        xs[slice] = xs[slice] + ys[slice]
    end
    return nothing
end

vadd! (generic function with 1 method)

In [176]:
using Random
rng = MersenneTwister(1234);

x = rand(rng, Float32,100_000)
y = ones(Float32,100_000);

In [177]:
c = x + y;
c[1:5]

5-element Array{Float32,1}:
 1.0107703
 1.8862793
 1.6934594
 1.225352 
 1.2081999

In [165]:
@time vadd!(x,y,Vec{4,Float32})

  0.035450 seconds (34.16 k allocations: 1.608 MiB)


In [167]:
x[1:5]

5-element Array{Float32,1}:
 1.0107703
 1.8862793
 1.6934594
 1.225352 
 1.2081999

Do the operation in broadcast mode

In [182]:
rng = MersenneTwister(1234);
x = rand(rng, Float32,100_000)
y = ones(Float32,100_000);

In [183]:
@time x .= x + y;

  0.000509 seconds (8 allocations: 390.891 KiB)


In [184]:
x[1:5]

5-element Array{Float32,1}:
 1.0107703
 1.8862793
 1.6934594
 1.225352 
 1.2081999

timing 

In [185]:
@btime vadd!(x,y,Vec{32,Float32})

  29.416 μs (0 allocations: 0 bytes)


In [186]:
@btime x .= x + y;

  79.169 μs (4 allocations: 390.73 KiB)
