
## Simd examples


Run 
```
include(joinpath(dirname(JULIA_HOME),"share","julia","build_sysimg.jl")); build_sysimg(force=true)
```
if Julia was not built from source.


In [2]:
using BenchmarkTools

[1m[36mINFO: [39m[22m[36mRecompiling stale cache file /Users/macpro/.julia/lib/v0.6/HDF5.ji for module HDF5.
[39m[1m[36mINFO: [39m[22m[36mRecompiling stale cache file /Users/macpro/.julia/lib/v0.6/JLD.ji for module JLD.
[39m

In [3]:
"""
    mysum(a::Vector)

Return the sum of the elements of `a`, accumulated one element at a time starting
from `zero(eltype(a))`. This is the plain scalar loop that `mysum_simd` is
compared against.
"""
function mysum(a::Vector)
    acc = zero(eltype(a))
    for elem in a
        acc = acc + elem
    end
    return acc
end

"""
    mysum_simd(a::Vector)

Return the sum of the elements of `a`, starting from `zero(eltype(a))`.

The loop carries the `@simd` annotation, which allows the compiler to reorder the
additions of the reduction; for floating-point element types the result may
therefore differ from `mysum` in the last bits.
"""
function mysum_simd(a::Vector)
    acc = zero(eltype(a))
    @simd for elem in a
        acc += elem
    end
    return acc
end

mysum_simd (generic function with 1 method)

In [4]:
x = rand(Float64 , 100000);

In [5]:
@code_native mysum_simd(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[3]
Source line: 67
	movq	8(%rdi), %rax
Source line: 64
	movq	24(%rdi), %rdx
	xorl	%ecx, %ecx
Source line: 79
	testq	%rdx, %rdx
	cmovnsq	%rdx, %rcx
Source line: 68
	testq	%rax, %rax
	jle	L305
Source line: 79
	leaq	-1(%rcx), %rsi
	cmpq	%rdx, %rsi
	jae	L313
Source line: 50
	movq	(%rdi), %r9
Source line: 66
	leaq	96(%r9), %r8
	movq	%rax, %r10
	andq	$-16, %r10
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%edi, %edi
Source line: 50
	vxorpd	%ymm1, %ymm1, %ymm1
Source line: 66
	jmp	L288
	nopw	%cs:(%rax,%rax)
Source line: 71
L80:
	testq	%rax, %rax
	jle	L288
Source line: 50
	cmpq	$16, %rax
	jae	L102
	xorl	%edx, %edx
	jmp	L253
L102:
	movq	%rax, %rdx
	andq	$-16, %rdx
	je	L251
	vblendpd	$1, %ymm0, %ymm1, %ymm0 ## ymm0 = ymm0[0],ymm1[1,2,3]
	vxorpd	%ymm2, %ymm2, %ymm2
Source line: 74
	movq	%r10, %rsi
	movq	%r8, %rcx
	vxorpd	%ymm3, %ymm3, %ymm3
	vxorpd	%ymm4, %ymm4, %ymm4
	nopl	(%rax,%rax)
Source line: 50
L144:
	vmovupd	-96(%rcx), %xmm5
	vmovupd	-64

In [6]:
@code_native mysum(x)

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[3]
Source line: 3
	movq	8(%rdi), %rax
	vxorpd	%xmm0, %xmm0, %xmm0
	testq	%rax, %rax
	je	L50
	movq	(%rdi), %rdx
	movq	24(%rdi), %rsi
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%ecx, %ecx
	nopw	(%rax,%rax)
L32:
	cmpq	%rsi, %rcx
	jae	L51
Source line: 4
	vaddsd	(%rdx,%rcx,8), %xmm0, %xmm0
Source line: 3
	incq	%rcx
	cmpq	%rcx, %rax
	jne	L32
Source line: 6
L50:
	retq
L51:
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 3
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	incq	%rcx
	movq	%rcx, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nopl	(%rax)


In [7]:
@benchmark mysum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.029 μs (0.00% GC)
  median time:      81.211 μs (0.00% GC)
  mean time:        80.979 μs (0.00% GC)
  maximum time:     187.244 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [8]:
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     21.962 μs (0.00% GC)
  median time:      23.467 μs (0.00% GC)
  mean time:        23.644 μs (0.00% GC)
  maximum time:     91.863 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [9]:
x = rand(Float32 , 100000);
sizeof(x)

400000

In [10]:
@benchmark mysum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.042 μs (0.00% GC)
  median time:      81.218 μs (0.00% GC)
  mean time:        80.648 μs (0.00% GC)
  maximum time:     243.196 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [11]:
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     10.893 μs (0.00% GC)
  median time:      11.197 μs (0.00% GC)
  mean time:        11.635 μs (0.00% GC)
  maximum time:     65.019 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [12]:
x = rand(Float16, 100000);

In [13]:
# TODO: check why this is much slower than Float32.
# Can we make Float16 faster than Float32?
@benchmark mysum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     981.829 μs (0.00% GC)
  median time:      1.036 ms (0.00% GC)
  mean time:        1.043 ms (0.00% GC)
  maximum time:     1.844 ms (0.00% GC)
  --------------
  samples:          4778
  evals/sample:     1

### Example cost function simd

In [22]:
y =  rand(Float32 , 100000);
y_hat =  rand(Float32 , 100000);

In [36]:
"""
    MSE_simd(y::Vector{T}, y_pred::Vector{T}) where {T}

Return the square root of the sum of `(y[i] - y_pred[i])^2` over `i in 1:length(y)`,
accumulated in type `T` inside an `@simd` loop.

Despite the name, the sum is not divided by `length(y)`: the value is the
Euclidean norm of the residual, not a mean.

Throws a `DimensionMismatch` if `y_pred` has fewer elements than `y`; the loop
indexes both arrays under `@inbounds`, so a shorter `y_pred` would otherwise be
read out of bounds.
"""
function MSE_simd(y::Vector{T}, y_pred::Vector{T}) where {T}
    n = length(y)
    length(y_pred) >= n || throw(DimensionMismatch(
        "y_pred has $(length(y_pred)) elements but y has $n"))
    cost = zero(T)
    @simd for i in 1:n
        @inbounds cost += (y[i] - y_pred[i])^2
    end
    return sqrt(cost)
end

"""
    MSE(y::Vector{T}, y_pred::Vector{T}) where {T}

Return the square root of the sum of `(y[i] - y_pred[i])^2` over `i in 1:length(y)`,
accumulated in type `T` with a plain (non-`@simd`) loop.

Despite the name, the sum is not divided by `length(y)`: the value is the
Euclidean norm of the residual, not a mean.

Throws a `DimensionMismatch` if `y_pred` has fewer elements than `y`; the loop
indexes both arrays under `@inbounds`, so a shorter `y_pred` would otherwise be
read out of bounds.
"""
function MSE(y::Vector{T}, y_pred::Vector{T}) where {T}
    n = length(y)
    length(y_pred) >= n || throw(DimensionMismatch(
        "y_pred has $(length(y_pred)) elements but y has $n"))
    cost = zero(T)
    @inbounds for i in 1:n
        cost += (y[i] - y_pred[i])^2
    end
    return sqrt(cost)
end

MSE (generic function with 1 method)

In [49]:
@benchmark MSE(y,y_hat)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     77.087 μs (0.00% GC)
  median time:      81.340 μs (0.00% GC)
  mean time:        82.346 μs (0.00% GC)
  maximum time:     310.952 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [289]:
@benchmark MSE_simd(y, y_hat)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     21.102 μs (0.00% GC)
  median time:      22.396 μs (0.00% GC)
  mean time:        22.892 μs (0.00% GC)
  maximum time:     295.311 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

### cross entropy minibatch

In [15]:
T = Float32
p_y_given_x = rand(T, 10, 256);
onehot_y = zeros(T, 10, 256);

In [16]:
onehot_y

10×256 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

## Explicit SIMD vectorization

In [14]:
using SIMD
"""
    simdwidth(::Type{T}) where {T}

Return how many values of type `T` fit in 256 bits, i.e. `(256 / 8) / sizeof(T)`
converted to `Int`.
"""
Base.@pure function simdwidth(::Type{T}) where {T}
    bytes_per_vector = 256 / 8
    return Int(bytes_per_vector / sizeof(T))
end

println("simdwitdth Float16: ", simdwidth(Float16))
println("simdwitdth Float32: ", simdwidth(Float32))
println("simdwitdth Float64: ", simdwidth(Float64))

println("\nsimdwitdth Int16: ", simdwidth(Int16))
println("simdwitdth Int32: ", simdwidth(Int32))
println("simdwitdth Int64: ", simdwidth(Int64))

simdwitdth Float16: 16
simdwitdth Float32: 8
simdwitdth Float64: 4

simdwitdth Int16: 16
simdwitdth Int32: 8
simdwitdth Int64: 4


In [19]:
v = Vec{8,Float32}((1,2,3,4,5,6,7,8))

8-element SIMD.Vec{8,Float32}:
Float32⟨1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0⟩

In [23]:
# Shouldn't this raise an error on this machine? (8 × Float64 is 512 bits, but simdwidth(Float64) above is 4.)
v = Vec{8,Float64}((1,2,3,4,5,6,7,8))

8-element SIMD.Vec{8,Float64}:
Float64⟨1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0⟩

In [35]:
v = Vec{8,Float32}((1,2,3,4,5,6,7,8))

8-element SIMD.Vec{8,Float32}:
Float32⟨1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0⟩

In [29]:
v_tuple = NTuple{8,Float32}(v)

(1.0f0, 2.0f0, 3.0f0, 4.0f0, 5.0f0, 6.0f0, 7.0f0, 8.0f0)

In [34]:
println(typeof(v_tuple))
println(typeof(v))

NTuple{8,Float32}
SIMD.Vec{8,Float32}


#### Operations on SIMD.Vec types


SIMD.Vec types can contain elements from the following collection:
```
Bool Int{8,16,32,64,128} UInt{8,16,32,64,128} Float{16,32,64}
```

The following vector operations can be used

```
+ - * / % ^ ! ~ & | $ << >> >>> == != < <= > >=
```

The following reduction operations can be used

```
all any maximum minimum sum prod
```


In [46]:
v = Vec{8,Float32}((1,2,3,4,5,6,7,8))

8-element SIMD.Vec{8,Float32}:
Float32⟨1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0⟩

In [60]:
#println("all(v): ", all(v))
println("sum(v): ", sum(v))
println("maximum(v): ", maximum(v))
println("minimum(v): ", minimum(v))
println("prod(v): ", prod(v))

sum(v): 36.0
maximum(v): 8.0
minimum(v): 1.0
prod(v): 40320.0


#### Accessing arrays: reading from and writing to Julia arrays
When using explicit SIMD vectorization, it is convenient to allocate arrays still as arrays of scalars, not as arrays of vectors. The vload and vstore functions allow reading vectors from and writing vectors into arrays, accessing several contiguous array elements.

In [74]:
arr = Vector{Float64}(100:200);

In [76]:
# The vload call reads a vector of size 4 from the array, i.e. it reads arr[i:i+3]
xs = vload(Vec{4,Float64}, arr, 1)

4-element SIMD.Vec{4,Float64}:
Float64⟨100.0, 101.0, 102.0, 103.0⟩

In [79]:
xs = 2*xs
#Similarly, the vstore call writes the vector xs to the four array elements arr[i:i+3].
vstore(xs, arr, 1)

In [101]:
arr[1:10]

10-element Array{Float64,1}:
 200.0
 202.0
 204.0
 206.0
 104.0
 105.0
 106.0
 107.0
 108.0
 109.0

#### Making some easy functions

In [99]:
x1 = rand(Float32, 512)
x2 = rand(Float32, 512)
y = similar(x1)

"""
    add!(y, x1, x2)

Store `x1[i] + x2[i]` in `y[i]` for every `i in 1:length(x1)` using a plain scalar
loop, and return `nothing`.

Throws a `DimensionMismatch` if `y` or `x2` has fewer elements than `x1`; the loop
runs under `@inbounds`, so a shorter array would otherwise be accessed out of
bounds.
"""
function add!(y, x1, x2)
    n = length(x1)
    (length(y) >= n && length(x2) >= n) || throw(DimensionMismatch(
        "y and x2 need at least $n elements, got $(length(y)) and $(length(x2))"))
    @inbounds for i in 1:n
        y[i] = x1[i] + x2[i]
    end
    return nothing
end

"""
    simd_add!(y, x1, x2)

Store `x1[i] + x2[i]` in `y[i]` for every `i in 1:length(x1)` using an `@simd`
loop with `@inbounds` element access, and return `nothing`.

Throws a `DimensionMismatch` if `y` or `x2` has fewer elements than `x1`; because
bounds checks are disabled inside the loop, a shorter array would otherwise be
accessed out of bounds.
"""
function simd_add!(y, x1, x2)
    n = length(x1)
    (length(y) >= n && length(x2) >= n) || throw(DimensionMismatch(
        "y and x2 need at least $n elements, got $(length(y)) and $(length(x2))"))
    @simd for i in 1:n
        @inbounds y[i] = x1[i] + x2[i]
    end
    return nothing
end

"""
    simd_add_no_inbounds!(y, x1, x2)

Store `x1[k] + x2[k]` in `y[k]` for every `k in 1:length(x1)` using an `@simd`
loop. Unlike `simd_add!`, element accesses are not marked `@inbounds`, so bounds
checks stay enabled.
"""
function simd_add_no_inbounds!(y, x1, x2)
    last_index = length(x1)
    @simd for k in 1:last_index
        y[k] = x1[k] + x2[k]
    end
end

"""
    vadd!(y::Vector{T}, xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}=Vec{8,T})

Store `xs[i] + ys[i]` in `y[i]` for every `i in 1:length(xs)` and return `nothing`.

Elements are processed `N` at a time with explicit `Vec{N,T}` loads and stores;
the vector type passed as the fourth argument selects `N` (8 by default). Any
trailing `length(xs) % N` elements are handled by a scalar loop, so the length
need not be a multiple of `N`.

Throws a `DimensionMismatch` if `y` or `ys` has fewer elements than `xs`.
"""
function vadd!(y::Vector{T}, xs::Vector{T}, ys::Vector{T},
               ::Type{Vec{N,T}}=Vec{8,T}) where {N,T}
    n = length(xs)
    (length(y) >= n && length(ys) >= n) || throw(DimensionMismatch(
        "y and ys need at least $n elements, got $(length(y)) and $(length(ys))"))
    # Last index that is still covered by a complete N-wide vector.
    nvec = n - n % N
    @inbounds for i in 1:N:nvec
        xv = vload(Vec{N,T}, xs, i)
        yv = vload(Vec{N,T}, ys, i)
        vstore(xv + yv, y, i)
    end
    # Scalar tail: a full-width load here would run past the end of the arrays.
    @inbounds for i in (nvec + 1):n
        y[i] = xs[i] + ys[i]
    end
    return nothing
end

"""
    euclid!(y, x1, x2)

Store `sqrt(x1[i]^2 + x2[i]^2)` in `y[i]` for every `i in 1:length(x1)` using a
plain scalar loop, and return `nothing`. This is the scalar counterpart of
`veuclid!`.

Throws a `DimensionMismatch` if `y` or `x2` has fewer elements than `x1`; the loop
runs under `@inbounds`, so a shorter array would otherwise be accessed out of
bounds.
"""
function euclid!(y, x1, x2)
    n = length(x1)
    (length(y) >= n && length(x2) >= n) || throw(DimensionMismatch(
        "y and x2 need at least $n elements, got $(length(y)) and $(length(x2))"))
    @inbounds for i in 1:n
        # Both squares go under the square root, and x1 is indexed by i (not 1).
        y[i] = sqrt(x1[i] * x1[i] + x2[i] * x2[i])
    end
    return nothing
end

"""
    veuclid!(y::Vector{T}, xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}=Vec{8,T})

Store `sqrt(xs[i]^2 + ys[i]^2)` in `y[i]` for every `i in 1:length(xs)` and return
`nothing`.

Elements are processed `N` at a time with explicit `Vec{N,T}` loads and stores;
the vector type passed as the fourth argument selects `N` (8 by default). Any
trailing `length(xs) % N` elements are handled by a scalar loop, so the length
need not be a multiple of `N`.

Throws a `DimensionMismatch` if `y` or `ys` has fewer elements than `xs`.
"""
function veuclid!(y::Vector{T}, xs::Vector{T}, ys::Vector{T},
                  ::Type{Vec{N,T}}=Vec{8,T}) where {N,T}
    n = length(xs)
    (length(y) >= n && length(ys) >= n) || throw(DimensionMismatch(
        "y and ys need at least $n elements, got $(length(y)) and $(length(ys))"))
    # Last index that is still covered by a complete N-wide vector.
    nvec = n - n % N
    @inbounds for i in 1:N:nvec
        xv = vload(Vec{N,T}, xs, i)
        yv = vload(Vec{N,T}, ys, i)
        vstore(sqrt(xv * xv + yv * yv), y, i)
    end
    # Scalar tail: a full-width load here would run past the end of the arrays.
    @inbounds for i in (nvec + 1):n
        y[i] = sqrt(xs[i] * xs[i] + ys[i] * ys[i])
    end
    return nothing
end

veuclid! (generic function with 2 methods)

In [86]:
@benchmark euclid!(y,x1,x2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     316.636 ns (0.00% GC)
  median time:      339.911 ns (0.00% GC)
  mean time:        348.751 ns (0.00% GC)
  maximum time:     13.335 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     236

In [87]:
 @benchmark veuclid!(y,x1,x2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     335.747 ns (0.00% GC)
  median time:      361.215 ns (0.00% GC)
  mean time:        393.591 ns (0.00% GC)
  maximum time:     1.545 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     221

In [88]:
 @benchmark add!(y,x1,x2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     65.874 ns (0.00% GC)
  median time:      68.821 ns (0.00% GC)
  mean time:        83.308 ns (0.00% GC)
  maximum time:     196.249 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     976

In [89]:
@benchmark vadd!(y,x1,x2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     146.237 ns (0.00% GC)
  median time:      148.293 ns (0.00% GC)
  mean time:        186.955 ns (0.00% GC)
  maximum time:     447.124 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     839

In [95]:
@benchmark simd_add!(y,x1,x2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     44.431 ns (0.00% GC)
  median time:      44.868 ns (0.00% GC)
  mean time:        53.032 ns (0.00% GC)
  maximum time:     196.486 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     990

In [102]:
@benchmark simd_add_no_inbounds!(y,x1,x2)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     539.552 ns (0.00% GC)
  median time:      545.930 ns (0.00% GC)
  mean time:        611.648 ns (0.00% GC)
  maximum time:     1.293 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     194

### median pooling

In [46]:
#https://discourse.julialang.org/t/make-this-code-fast-median-pooling/6405

In [8]:
"""
    median5_swap(a, b, c, d, e)

Return the median of the five arguments using a fixed sequence of `minmax`
compare-exchange steps (no branches on the data, no allocation).
"""
@inline function median5_swap(a, b, c, d, e)
    # https://github.com/JeffreySarnoff/SortingNetworks.jl/blob/master/src/swapsort.jl
    lo_ab, hi_ab = minmax(a, b)
    lo_cd, hi_cd = minmax(c, d)
    # The smallest and the largest of the first four values cannot be the median
    # of five, so only the two middle candidates are kept.
    dropped_min, mid_lo = minmax(lo_ab, lo_cd)
    mid_hi, dropped_max = minmax(hi_ab, hi_cd)
    # What remains is the median of the three values (mid_lo, mid_hi, e).
    lo, hi = minmax(e, mid_lo)
    return max(lo, min(hi, mid_hi))
end

# Generic entry point used by the pooling kernel: forwards every argument
# unchanged to `median5_swap`.
@inline function median5(args...)
    return median5_swap(args...)
end

"""
    medmedpool55!(out::AbstractMatrix, img::AbstractMatrix)

Fill `out` with a 5×5 "median of medians" pooling of `img` and return `out`.

For every `out[i, j]` the tile `img[5i-4:5i, 5j-4:5j]` is read; `median5` is taken
over the five values that share each first index of the tile, and `out[i, j]` is
the `median5` of those five results. This is not, in general, the exact median
of the 25 tile values.

`out` must have exactly `size(img) .÷ 5` rows and columns (checked with
`@assert`): the loops cover every index of `out` and read `img[5i, 5j]` under
`@inbounds`, so a larger `out` would read outside `img`.
NOTE(review): the size check presumes both matrices use 1-based indices.
"""
function medmedpool55!(out::AbstractMatrix, img::AbstractMatrix)
    @assert size(out, 1) == size(img, 1) ÷ 5
    @assert size(out, 2) == size(img, 2) ÷ 5
    @inbounds for j ∈ indices(out)[2]
        c = 5j  # last second-dimension index of the tile feeding out[:, j]
        @simd for i ∈ indices(out)[1]
            r = 5i  # last first-dimension index of the tile feeding out[i, j]

            x11 = img[r-4, c-4]
            x21 = img[r-3, c-4]
            x31 = img[r-2, c-4]
            x41 = img[r-1, c-4]
            x51 = img[r, c-4]

            x12 = img[r-4, c-3]
            x22 = img[r-3, c-3]
            x32 = img[r-2, c-3]
            x42 = img[r-1, c-3]
            x52 = img[r, c-3]

            x13 = img[r-4, c-2]
            x23 = img[r-3, c-2]
            x33 = img[r-2, c-2]
            x43 = img[r-1, c-2]
            x53 = img[r, c-2]

            x14 = img[r-4, c-1]
            x24 = img[r-3, c-1]
            x34 = img[r-2, c-1]
            x44 = img[r-1, c-1]
            x54 = img[r, c-1]

            x15 = img[r-4, c]
            x25 = img[r-3, c]
            x35 = img[r-2, c]
            x45 = img[r-1, c]
            x55 = img[r, c]

            # Median along the second dimension for each of the five tile rows ...
            y1 = median5(x11, x12, x13, x14, x15)
            y2 = median5(x21, x22, x23, x24, x25)
            y3 = median5(x31, x32, x33, x34, x35)
            y4 = median5(x41, x42, x43, x44, x45)
            y5 = median5(x51, x52, x53, x54, x55)

            # ... then the median of those five medians.
            out[i, j] = median5(y1, y2, y3, y4, y5)
        end
    end
    return out
end

medmedpool55! (generic function with 1 method)

In [5]:
using BenchmarkTools
imgs = randn(Float32, 1024,1024, 10)
img = view(imgs, :,:,1)
out = similar(img, size(img) .÷ 5)
@benchmark medmedpool55!(out, img)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     8.746 ms (0.00% GC)
  median time:      10.786 ms (0.00% GC)
  mean time:        11.430 ms (0.00% GC)
  maximum time:     47.553 ms (0.00% GC)
  --------------
  samples:          437
  evals/sample:     1

In [13]:
size(imgs),size([rand(T,N) for _ in 1:6])

((1024, 1024, 10), (6,))

In [None]:
"""
    simdwidth(::Type{T}) where {T}

Return how many values of type `T` fit in 256 bits, i.e. `(256 / 8) / sizeof(T)`
converted to `Int`.
"""
Base.@pure function simdwidth(::Type{T}) where {T}
    bytes_per_vector = 256 / 8
    return Int(bytes_per_vector / sizeof(T))
end

"""
    median3(a, b, c)

Return the median of three values using only `min` and `max`.
"""
@inline function median3(a, b, c)
    lo = min(a, b)
    hi = max(a, b)
    # c is clamped from above by hi, then from below by lo.
    return max(lo, min(c, hi))
end

"""
    median5(a, b, c, d, e)

Return the median of five values using only `min`/`max` and `median3`.
"""
@inline function median5(a, b, c, d, e)
    # https://stackoverflow.com/questions/480960/code-to-calculate-median-of-five-in-c-sharp
    larger_of_pair_minima = max(min(a, b), min(c, d))
    smaller_of_pair_maxima = min(max(a, b), max(c, d))
    return median3(e, larger_of_pair_minima, smaller_of_pair_maxima)
end

"""
    median5_vectors!(out, a, b, c, d, e)

Write the element-wise median of `a`, `b`, `c`, `d` and `e` into `out` and return
`out`.

The arrays are processed in chunks of `K = simdwidth(eltype(out))` elements: each
chunk is loaded as a `Vec{K,T}`, passed through `median5`, and stored back.

`length(out)` must be a multiple of `K` (checked with `@assert`). Throws a
`DimensionMismatch` if any input has fewer elements than `out`; the loads run
under `@inbounds`, so a shorter input would otherwise be read out of bounds.
"""
@noinline function median5_vectors!(out, a, b, c, d, e)
    T = eltype(out)
    K = simdwidth(T)
    N = length(out)
    V = Vec{K,T}
    @assert mod(N,K) == 0
    # Every input is read at indices 1:N below with bounds checks disabled.
    for input in (a, b, c, d, e)
        length(input) >= N || throw(DimensionMismatch(
            "inputs need at least $N elements, got $(length(input))"))
    end

    @inbounds for i in 1:K:N
        va = vload(V, a, i)
        vb = vload(V, b, i)
        vc = vload(V, c, i)
        vd = vload(V, d, i)
        ve = vload(V, e, i)
        vstore(median5(va, vb, vc, vd, ve), out, i)
    end
    return out
end

using BenchmarkTools
# Element type under test. The UInt8 assignment is immediately overridden by the
# Float32 one below, so the benchmark runs with Float32.
T = UInt8
T = Float32
N = 10^6
# Round N down to a multiple of the vector width so that the
# `mod(N,K) == 0` assertion in median5_vectors! holds.
N = N ÷ simdwidth(T) * simdwidth(T)
# Six independent random vectors: the output buffer and the five inputs.
out, a,b,c,d,e = [rand(T,N) for _ in 1:6]
@benchmark median5_vectors!(out, a,b,c,d,e)


In [47]:
simdwidth(Float16)

16

### Test views

In [304]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

10×5 Array{Float64,2}:
  2.0   2.0   2.0   2.0   2.0
  3.0   3.0   3.0   3.0   3.0
  4.0   4.0   4.0   4.0   4.0
  5.0   5.0   5.0   5.0   5.0
  6.0   6.0   6.0   6.0   6.0
  7.0   7.0   7.0   7.0   7.0
  8.0   8.0   8.0   8.0   8.0
  9.0   9.0   9.0   9.0   9.0
 10.0  10.0  10.0  10.0  10.0
 11.0  11.0  11.0  11.0  11.0

In [305]:
# Case 1: `v` is rebound with plain `=` on every iteration, so it is always a
# fresh view of row i and nothing is ever written through the row-1 view.
v = view(X2,1,:)
for i in 1:size(X2,1)
    v = view(X2,i,:)
    # `X2[i,:] += v` is `X2[i,:] = X2[i,:] + v`, i.e. row i is doubled.
    X2[i,:] += v
end
X2

10×5 Array{Float64,2}:
  4.0   4.0   4.0   4.0   4.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [302]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

10×5 Array{Float64,2}:
  2.0   2.0   2.0   2.0   2.0
  3.0   3.0   3.0   3.0   3.0
  4.0   4.0   4.0   4.0   4.0
  5.0   5.0   5.0   5.0   5.0
  6.0   6.0   6.0   6.0   6.0
  7.0   7.0   7.0   7.0   7.0
  8.0   8.0   8.0   8.0   8.0
  9.0   9.0   9.0   9.0   9.0
 10.0  10.0  10.0  10.0  10.0
 11.0  11.0  11.0  11.0  11.0

In [303]:
# Case 2: `v` stays a view of row 1 of X2 for the whole loop.
v = view(X2,1,:)
for i in 1:size(X2,1)
    # `.=` assigns in place through the view, so row i is copied INTO row 1 of X2.
    # After the last iteration row 1 therefore holds the values row 10 had before
    # it was doubled (see the 11.0 row in the output).
    v .= view(X2,i,:)
    X2[i,:] += v
end

X2

10×5 Array{Float64,2}:
 11.0  11.0  11.0  11.0  11.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [325]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

# Case 3: deepcopy detaches `v` from X2, so the in-place `.=` below only fills
# this scratch buffer and row 1 of X2 is no longer overwritten.
v = deepcopy(view(X2,1,:))
for i in 1:size(X2,1)
    v .= view(X2,i,:)
    X2[i,:] += v
end
X2

10×5 Array{Float64,2}:
  4.0   4.0   4.0   4.0   4.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [324]:
X = ones(10,5)
x = Array(1:10);
X2 = x.+X

# Case 4: `X2[1,:]` is a copy of row 1 and `zeros(...)` builds a zero-filled array
# from it, so `v` is a standalone buffer rather than a view into X2.
v = zeros(X2[1,:])
for i in 1:size(X2,1)
    v .= view(X2,i,:)
    X2[i,:] += v
end
X2

10×5 Array{Float64,2}:
  4.0   4.0   4.0   4.0   4.0
  6.0   6.0   6.0   6.0   6.0
  8.0   8.0   8.0   8.0   8.0
 10.0  10.0  10.0  10.0  10.0
 12.0  12.0  12.0  12.0  12.0
 14.0  14.0  14.0  14.0  14.0
 16.0  16.0  16.0  16.0  16.0
 18.0  18.0  18.0  18.0  18.0
 20.0  20.0  20.0  20.0  20.0
 22.0  22.0  22.0  22.0  22.0

In [315]:
@time v = deepcopy(view(X2,1,:))

  0.000056 seconds (37 allocations: 1.813 KiB)


5-element SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true}:
 4.0
 4.0
 4.0
 4.0
 4.0

In [None]:
@time v = deepcopy(view(X2,1,:))

In [313]:
@time auxiliar = view(X2,1,:)

  0.000043 seconds (21 allocations: 512 bytes)


5-element SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true}:
 4.0
 4.0
 4.0
 4.0
 4.0