<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Vectorizing-code" data-toc-modified-id="Vectorizing-code-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Vectorizing code</a></span></li><li><span><a href="#Structs-of-arrays-vs-arrays-of-structs" data-toc-modified-id="Structs-of-arrays-vs-arrays-of-structs-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Structs of arrays vs arrays of structs</a></span><ul class="toc-item"><li><span><a href="#Code-for-any-numerical-element--type" data-toc-modified-id="Code-for-any-numerical-element--type-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Code for any numerical element  type</a></span></li><li><span><a href="#Adding-@simd" data-toc-modified-id="Adding-@simd-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Adding <code>@simd</code></a></span></li></ul></li><li><span><a href="#Struct-of-static-arrays" data-toc-modified-id="Struct-of-static-arrays-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Struct of static arrays</a></span><ul class="toc-item"><li><span><a href="#Another-version" data-toc-modified-id="Another-version-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Another version</a></span></li></ul></li></ul></div>

In [2]:
using BenchmarkTools

# Vectorizing code

# Structs of arrays vs arrays of structs

In [3]:

struct ComplexAoS
  real::Float64
  imag::Float64
end

struct ComplexSoA
  real::Vector{Float64}
  imag::Vector{Float64}
end

N = 1000
arrAoS = [ComplexAoS(rand(),rand()) for i in 1:N]
arrSoA = ComplexSoA(rand(N),rand(N))

ComplexSoA([0.516461, 0.469966, 0.219975, 0.788054, 0.211134, 0.438849, 0.264036, 0.291488, 0.362056, 0.614113  …  0.724571, 0.0794315, 0.670029, 0.778371, 0.987755, 0.900173, 0.393771, 0.757748, 0.0419546, 0.701018], [0.684288, 0.352884, 0.0522607, 0.722401, 0.156825, 0.627917, 0.729753, 0.721233, 0.289239, 0.45116  …  0.477383, 0.972271, 0.766809, 0.381621, 0.725287, 0.428688, 0.0923931, 0.741511, 0.458806, 0.816328])

In [4]:
# Array of Structs
arrAoS[1]

ComplexAoS(0.4524271922870875, 0.5183473176684523)

In [5]:
# Struct of arrays
arrSoA

ComplexSoA([0.516461, 0.469966, 0.219975, 0.788054, 0.211134, 0.438849, 0.264036, 0.291488, 0.362056, 0.614113  …  0.724571, 0.0794315, 0.670029, 0.778371, 0.987755, 0.900173, 0.393771, 0.757748, 0.0419546, 0.701018], [0.684288, 0.352884, 0.0522607, 0.722401, 0.156825, 0.627917, 0.729753, 0.721233, 0.289239, 0.45116  …  0.477383, 0.972271, 0.766809, 0.381621, 0.725287, 0.428688, 0.0923931, 0.741511, 0.458806, 0.816328])

There are two versions to do this

In [6]:
function normSoA(x)
  out = 0.0
  for i in 1:length(x.real)
    @inbounds out += sqrt(x.real[i]^2 + x.imag[i]^2)
  end
  out
end

function normAoS(x)
  out = 0.0
  for i in 1:length(x)
    @inbounds out += sqrt(x[i].real^2 + x[i].imag^2)
  end
  out
end

normAoS (generic function with 1 method)

In [7]:
@btime normAoS($arrAoS);

  4.255 μs (0 allocations: 0 bytes)


In [8]:
@btime normSoA($arrSoA);

  4.256 μs (0 allocations: 0 bytes)


## Code for any numerical element  type

In [9]:

struct ComplexAoS_T{T}
  real::T
  imag::T
end
struct ComplexSoA_T{T}
  real::Vector{T}
  imag::Vector{T}
end

N = 1000
arrAoS = [ComplexAoS_T{Float32}(rand(),rand()) for i in 1:N]
arrSoA = ComplexSoA_T{Float32}(rand(N),rand(N))

ComplexSoA_T{Float32}(Float32[0.145308, 0.264719, 0.680704, 0.265778, 0.240314, 0.371777, 0.571783, 0.659315, 0.163938, 0.387764  …  0.516052, 0.587327, 0.0729806, 0.586128, 0.962991, 0.675849, 0.806112, 0.0721002, 0.352715, 0.304008], Float32[0.452959, 0.126943, 0.43612, 0.749509, 0.736819, 0.83007, 0.965935, 0.963667, 0.0590758, 0.466277  …  0.303555, 0.70734, 0.407744, 0.331652, 0.519078, 0.387351, 0.602793, 0.929894, 0.581023, 0.861837])

In [10]:
eltype(arrAoS[1].real)

Float32

In [11]:
eltype(arrAoS)

ComplexAoS_T{Float32}

In [12]:
arrAoS[1]

ComplexAoS_T{Float32}(0.36881328f0, 0.72662365f0)

In [42]:
typeof(arrAoS[1].real)

Float32

In [14]:
function normSoA(x::ComplexSoA_T)
    out = zero(eltype(x.real))
    for i in 1:length(x.real)
        @inbounds out += sqrt(x.real[i]^2 + x.imag[i]^2)
    end
    return out
end

function normAoS(x::ComplexAoS_T)
    out = zero(typeof(x[1].real))
    for i in 1:length(x)
       @inbounds out += sqrt(x[i].real^2 + x[i].imag^2)
    end
    return out
end


normAoS (generic function with 2 methods)

In [15]:
@btime normSoA($arrSoA);

  2.129 μs (0 allocations: 0 bytes)


In [16]:
@btime normAoS($arrAoS);

  2.280 μs (0 allocations: 0 bytes)


## Adding `@simd`

Struct of Arrays are good when you want to SIMD across loop iterations.
In this case it is straight forward to turn each access to `x.real[i]` and `x.imag[i]`  into a load of an entire SIMD vector of them, etc for each subsequent operation. This makes sense when your arrays  are of length bigger or equal to `SIMD_WIDTH{T}`.


In [60]:
struct ComplexAoS_T{T}
  real::T
  imag::T
end

struct ComplexSoA_T{T}
  real::Vector{T}
  imag::Vector{T}
end

N = 1000
arrAoS = [ComplexAoS_T{Float32}(rand(),rand()) for i in 1:N]
arrSoA = ComplexSoA_T{Float32}(rand(N),rand(N))

ComplexSoA_T{Float32}(Float32[0.819406, 0.866079, 0.700487, 0.975528, 0.898571, 0.407571, 0.992901, 0.305007, 0.0740585, 0.361329  …  0.60909, 0.700431, 0.0809155, 0.163068, 0.354285, 0.496126, 0.749361, 0.221263, 0.415495, 0.0538045], Float32[0.585698, 0.921505, 0.501492, 0.383083, 0.975917, 0.758238, 0.661168, 0.0497383, 0.5924, 0.395759  …  0.504708, 0.536726, 0.81607, 0.645291, 0.882591, 0.00839603, 0.506446, 0.84836, 0.268614, 0.926723])

In [114]:
function normSoA(x::ComplexSoA_T)
    out = zero(eltype(x.real))
    @simd for i in 1:length(x.real)
        @inbounds out += sqrt(x.real[i]^2 + x.imag[i]^2)
    end
    return out
end

function normAoS(x::Array{ComplexAoS_T})
    out = zero(typeof(x[1].real))
    @simd  for i in 1:length(x)
       @inbounds out += sqrt(x[i].real^2 + x[i].imag^2)
    end
    return out
end


normAoS (generic function with 3 methods)

In [115]:
@btime normSoA($arrSoA);

  600.478 ns (0 allocations: 0 bytes)


In [129]:
@btime normAoS($arrAoS);

  2.280 μs (0 allocations: 0 bytes)


## Another version

In [109]:
function normAoS_vec(x)
    out = zero(typeof(x[1].real))
    
    @simd  for i in 1:length(x)
       @inbounds out +=   sqrt(x[i].real^2 + x[i].imag^2)
    end
    return out
end

normAoS_vec (generic function with 1 method)

In [110]:
normAoS_muladd(arrAoS)

749.87445f0

In [111]:
normAoS(arrAoS)

749.8745259158313

In [113]:
@btime  normAoS_vec($arrAoS);

  2.140 μs (0 allocations: 0 bytes)
