# Optimization



https://github.com/JuliaArrays/TiledIteration.jl


https://julialang.org/blog/2013/09/fast-numeric


### Related work 

https://www.mreisinger.com/2016/06/01/enabling-polyhedral-optimizations-in-julia.html

In [1]:
using BenchmarkTools

In [2]:
n = 10_000_000
x = rand(Float32, n);

In [3]:
function sum(x)
    i = 0
    n = length(x)
    s = zero(eltype(x))
    
    @inbounds for i in 1:n
        s += x[i]
    end
    
    return s 
end

sum (generic function with 1 method)

In [4]:
@time sum(x)

  0.027919 seconds (16.45 k allocations: 897.229 KiB)


5.000458f6

In [5]:
@benchmark sum(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     9.863 ms (0.00% GC)
  median time:      10.716 ms (0.00% GC)
  mean time:        10.745 ms (0.00% GC)
  maximum time:     13.546 ms (0.00% GC)
  --------------
  samples:          465
  evals/sample:     1

In [6]:
function sum_unrolled_2(x)
    i = 0
    n = length(x)
    s = zero(eltype(x))
    
    @inbounds for i in 1:2:n
        s += x[i]
        s += x[i+1]
    end
    
    return s 
end

sum_unrolled_2 (generic function with 1 method)

In [7]:
sum_unrolled_2(x) ≈ sum(x)

true

In [8]:
@time sum_unrolled_2(x)

  0.012089 seconds (5 allocations: 176 bytes)


5.000458f6

In [9]:
@benchmark sum_unrolled_2(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     10.098 ms (0.00% GC)
  median time:      10.772 ms (0.00% GC)
  mean time:        10.970 ms (0.00% GC)
  maximum time:     16.974 ms (0.00% GC)
  --------------
  samples:          455
  evals/sample:     1

In [10]:
function sum_unrolled_4(x)
    i = 0
    n = length(x)
    s = zero(eltype(x))
    
    @inbounds for i in 1:4:n
        s += x[i]
        s += x[i+1]
        s += x[i+2]x
        s += x[i+3]
    end
    
    return s 
end

sum_unrolled_4 (generic function with 1 method)

In [11]:
@benchmark sum_unrolled_4(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     9.934 ms (0.00% GC)
  median time:      10.846 ms (0.00% GC)
  mean time:        11.104 ms (0.00% GC)
  maximum time:     18.782 ms (0.00% GC)
  --------------
  samples:          450
  evals/sample:     1

In [12]:
function sum_simd(x)
    i = 0
    n = length(x)
    s = zero(eltype(x))
    
    @inbounds @simd for i in 1:n
        s += x[i]
    end
    
    return s 
end

sum_simd (generic function with 1 method)

In [13]:
@benchmark sum_simd(x)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     2.646 ms (0.00% GC)
  median time:      2.888 ms (0.00% GC)
  mean time:        2.951 ms (0.00% GC)
  maximum time:     6.120 ms (0.00% GC)
  --------------
  samples:          1683
  evals/sample:     1

## Matrix multiplication


https://gist.github.com/nadavrot/5b35d44e8ba3dd718e595e40184d03f0

In [259]:
p = 100
q = 80
r = 70

A = rand(p,q)
B = rand(q,r)

# C is result of A*B
C = zeros(p,r)

100×70 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0

Write in J

```
C(m, n) = A(m, k) * B(k, n)

Translates to

for (int i = 0; i < m; i++) {
  for (int j = 0; j < n; j++) {
    for (int p = 0; p < k; p++) {
      C(i, j) += A(i, p) * B(p, j);
    }
  }
}```



In [260]:
function matmul!(C,A,B)
   
    I, J = size(A)
    P, Q = size(B)    
    for i = 1:I
        for q = 1:Q
            for j = 1:J
                C[i,q] += A[i,1]*B[1,q]
            end
        end
    end
end

function matmul(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    for i = 1:I
        for q = 1:Q
            for j = 1:J
                C[i,q] += A[i,j]*B[j,q]
            end
        end
    end
    return C
end

matmul (generic function with 1 method)

In [261]:
isapprox(A*B, matmul(A,B))

true

In [262]:
@benchmark matmul(A,B)

BenchmarkTools.Trial: 
  memory estimate:  54.77 KiB
  allocs estimate:  2
  --------------
  minimum time:     1.479 ms (0.00% GC)
  median time:      1.493 ms (0.00% GC)
  mean time:        1.581 ms (0.32% GC)
  maximum time:     5.126 ms (63.24% GC)
  --------------
  samples:          3145
  evals/sample:     1

In [263]:
@benchmark A*B

BenchmarkTools.Trial: 
  memory estimate:  54.77 KiB
  allocs estimate:  2
  --------------
  minimum time:     29.997 μs (0.00% GC)
  median time:      34.022 μs (0.00% GC)
  mean time:        52.137 μs (12.54% GC)
  maximum time:     4.311 ms (98.12% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [264]:
function matmul_inbounds(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for i = 1:I
        for q = 1:Q
            for j = 1:J
                C[i,q] += A[i,j]*B[j,q]
            end
        end
    end
    return C
end

matmul_inbounds (generic function with 1 method)

In [265]:
@benchmark matmul_inbounds(A,B)

BenchmarkTools.Trial: 
  memory estimate:  54.77 KiB
  allocs estimate:  2
  --------------
  minimum time:     529.545 μs (0.00% GC)
  median time:      664.341 μs (0.00% GC)
  mean time:        887.662 μs (0.68% GC)
  maximum time:     61.692 ms (0.00% GC)
  --------------
  samples:          5505
  evals/sample:     1

### using polly matmul

To use polly you need to compile it 

In [280]:
@polly function matmul_polly(A,B,C)
    m,n = size(A)
    n,o = size(B)
    @inbounds for i=1:m, j=1:o, k=1:n
        C[i,j] += A[i,k] * B[k,j]
    end
end

matmul_polly (generic function with 1 method)

In [284]:
@benchmark matmul_polly(A,B,C)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     532.943 μs (0.00% GC)
  median time:      534.057 μs (0.00% GC)
  mean time:        542.598 μs (0.00% GC)
  maximum time:     1.430 ms (0.00% GC)
  --------------
  samples:          9184
  evals/sample:     1

In [286]:
@benchmark matmul_inbounds_JQI(A,B)

BenchmarkTools.Trial: 
  memory estimate:  54.77 KiB
  allocs estimate:  2
  --------------
  minimum time:     436.026 μs (0.00% GC)
  median time:      440.582 μs (0.00% GC)
  mean time:        456.598 μs (1.64% GC)
  maximum time:     4.953 ms (90.42% GC)
  --------------
  samples:          10000
  evals/sample:     1

## Index permuations matrix multiply

How the data is stored and how we take it out of memory might have a huge impact in performance.

Notice there are 3 possible permuations of the indices

-  matmul_inbounds_IQJ
-  matmul_inbounds_IJQ
-  matmul_inbounds_JIQ
-  matmul_inbounds_JQI
-  matmul_inbounds_QIJ
-  matmul_inbounds_QJI
  

In [266]:
function matmul_inbounds_IQJ(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for i = 1:I
                for q = 1:Q
                    for j = 1:J
                        C[i,q] += A[i,j]*B[j,q]
                    end
                end
            end
    return C
end

matmul_inbounds_IQJ (generic function with 1 method)

In [267]:
function matmul_inbounds_IJQ(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for i = 1:I
                for j = 1:J
                   for q = 1:Q
                       C[i,q] += A[i,j]*B[j,q]
                   end
                end
            end
    return C
end

matmul_inbounds_IJQ (generic function with 1 method)

In [268]:
function matmul_inbounds_JIQ(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for j = 1:J
                for i = 1:I
                   for q = 1:Q
                       C[i,q] += A[i,j]*B[j,q]
                   end
                end
            end
    return C
end

matmul_inbounds_JIQ (generic function with 1 method)

In [269]:
function matmul_inbounds_JQI(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for j = 1:J
                for q = 1:Q
                    for i = 1:I
                       C[i,q] += A[i,j]*B[j,q]
                   end
                end
            end
    return C
end

matmul_inbounds_JQI (generic function with 1 method)

In [270]:
function matmul_inbounds_QIJ(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for q = 1:Q
                for i = 1:I
                    for j = 1:J
                       C[i,q] += A[i,j]*B[j,q]
                   end
                end
            end
    return C
end

matmul_inbounds_QIJ (generic function with 1 method)

In [271]:
function matmul_inbounds_QJI(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for q = 1:Q
                for i = 1:I
                    for j = 1:J
                       C[i,q] += A[i,j]*B[j,q]
                   end
                end
            end
    return C
end

matmul_inbounds_QJI (generic function with 1 method)

In [272]:
IQJ = @benchmark matmul_inbounds_IQJ(A,B)
IJQ = @benchmark matmul_inbounds_IJQ(A,B)
JIQ = @benchmark matmul_inbounds_JIQ(A,B)
JQI = @benchmark matmul_inbounds_JQI(A,B)
QIJ = @benchmark matmul_inbounds_QIJ(A,B)
QJI = @benchmark matmul_inbounds_QJI(A,B);

In [273]:
println(mean(IQJ.times)/1000000)
println(mean(IJQ.times)/1000000)
println(mean(JIQ.times)/1000000)
println(mean(JQI.times)/1000000)
println(mean(QIJ.times)/1000000)
println(mean(QJI.times)/1000000)

0.5705620700483092
0.4924839291
0.6216346775901185
0.4875274166
0.571378713956967
0.5866179504024621


We can see QIJ is the bes performant index

In [274]:
function matmul_inbounds_JQI_simd(A,B)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for j = 1:J
                for q = 1:Q
                   @simd for i = 1:I
                       C[i,q] += A[i,j]*B[j,q]
                   end
                end
            end
    return C
end


matmul_inbounds_JQI_simd (generic function with 1 method)

In [275]:
QIJ_simd = @benchmark matmul_inbounds_JQI_simd(A,B)

BenchmarkTools.Trial: 
  memory estimate:  54.77 KiB
  allocs estimate:  2
  --------------
  minimum time:     442.467 μs (0.00% GC)
  median time:      448.068 μs (0.00% GC)
  mean time:        495.115 μs (1.29% GC)
  maximum time:     4.170 ms (86.36% GC)
  --------------
  samples:          10000
  evals/sample:     1

### Task 1 make metaprograming code such that generates a different version of code for each for loop index

In [257]:
function matmul_inbounds_IQJ_simd(A::T,B::T)
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    @inbounds for i = 1:I
                for q = 1:Q
                    aux = zero(T)
                    @simd for j = 1:J
                        aux += A[i,j]*B[j,q]
                    end
                    C[i,q] = aux
                end
            end
    return C
end

matmul_inbounds_IQJ_simd (generic function with 2 methods)

In [258]:
@benchmark matmul_inbounds_IQJ_simd(A,B)

BenchmarkTools.Trial: 
  memory estimate:  5.34 MiB
  allocs estimate:  2
  --------------
  minimum time:     814.984 ms (0.00% GC)
  median time:      846.462 ms (0.00% GC)
  mean time:        859.195 ms (0.06% GC)
  maximum time:     940.195 ms (0.00% GC)
  --------------
  samples:          6
  evals/sample:     1

In [229]:
Threads.nthreads()

1

We can add the scalar product of `A[i,:] * B[:,q]` in an auxiliary variable

In [210]:
function matmul_inbounds_QIJ_simd_acum(A::T,B::T) where T
    
    I, J = size(A)
    P, Q = size(B)    
    if J != P
        println("ERROR")
        return
    end
    
    C = zeros(I,Q)
    # C= Array{T}(undef, I, Q) # UndefRefError: access to undefined reference
    
    @inbounds for q = 1:Q
                for j = 1:J
                    row_times_col = 0
                    @simd for i = 1:I
                       row_times_col += A[i,j]*B[j,q]
                   end
                    C[j,q] = row_times_col
                end
            end
    return C
end

matmul_inbounds_QIJ_simd_acum (generic function with 1 method)

In [211]:
matmul_inbounds_QIJ_simd_acum(A,B)

1000×100 Array{Float64,2}:
 230.661  256.574   160.358   104.757   …  381.466    137.124   142.848 
 373.259  468.36    406.947   256.743       93.2326   153.718   356.71  
 429.855  407.349   398.998   394.064      493.948    383.495   470.119 
 390.695  364.544   399.682   461.16       248.178    122.278   240.923 
 391.658  421.065   242.815    67.2285       3.72325  272.254   433.584 
 171.456   51.9242  212.474   195.287   …   72.1357   440.425   452.462 
 430.366  383.963   293.431   466.025      483.9       35.0885  304.776 
 280.461   15.2007   34.62    218.202      461.956    139.932    51.8188
 130.182  452.15    117.745   345.689      297.127    404.954    21.5072
 266.33   485.11    109.451   114.441      129.563    221.433    37.114 
 252.192  357.883   183.898   289.661   …   48.4865   318.944   373.42  
 352.434   55.5546   78.5262  301.628      281.912     88.0401  273.166 
 357.095   28.0251  318.092   464.542       78.7658   428.265    98.7745
   ⋮                    

In [201]:
QIJ_simd_acum = @benchmark matmul_inbounds_QIJ_simd_acum(A,B)

MethodError: MethodError: no method matching zero(::Type{Array{Float64,2}})
Closest candidates are:
  zero(!Matched::Type{LibGit2.GitHash}) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/LibGit2/src/oid.jl:220
  zero(!Matched::Type{Pkg.Resolve.VersionWeights.VersionWeight}) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Pkg/src/resolve/VersionWeights.jl:19
  zero(!Matched::Type{Pkg.Resolve.MaxSum.FieldValues.FieldValue}) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Pkg/src/resolve/FieldValues.jl:44
  ...

In [168]:
@benchmark A*B

BenchmarkTools.Trial: 
  memory estimate:  781.33 KiB
  allocs estimate:  2
  --------------
  minimum time:     949.771 μs (0.00% GC)
  median time:      1.481 ms (0.00% GC)
  mean time:        1.783 ms (6.44% GC)
  maximum time:     11.150 ms (0.00% GC)
  --------------
  samples:          2754
  evals/sample:     1

In [193]:
#C = zeros(I,Q)
T = Float64
C= Array{T}(undef, 400, 5)

400×5 Array{Float64,2}:
   0.0           NaN             …  NaN             NaN           
   0.0             2.25373e-314       0.0             9.06092e-312
   1.07706e-321  NaN                  0.0           NaN           
   8.48798e-314    2.25373e-314       7.63918e-313    6.95232e-310
   8.48798e-314  NaN                  6.95232e-310    1.54906e-312
   1.061e-314      2.25373e-314  …    6.95232e-310    3.23791e-319
   0.0           NaN                  0.0             6.95232e-310
   3.83945e151   NaN                  6.95232e-310    9.06092e-312
   3.70594e-317  NaN                  6.95232e-310  NaN           
   2.48333e-307    2.25373e-314       2.24473e-314    6.95232e-310
   0.0           NaN             …  NaN               1.42174e-312
   1.07706e-321    2.25373e-314       6.95232e-310    3.23791e-319
   4.10074e-322  NaN                  7.29114e-304    6.95232e-310
   ⋮                             ⋱                                
 NaN             NaN                  

In [189]:
C[1,1] += 123

NaN

In [179]:
C= Array{T}(undef, 10, 123)

10×123 Array{Float64,2}:
 NaN             NaN             …    0.0    0.0    0.0    0.0  NaN  
   6.95229e-310    6.95336e-310     NaN      0.0    0.0    0.0    0.0
   6.95336e-310    5.05923e-321       0.0  NaN      0.0    0.0    0.0
   5.05923e-321    6.95232e-310       0.0    0.0  NaN      0.0    0.0
   6.95232e-310    6.95232e-310       0.0    0.0    0.0  NaN      0.0
   6.95232e-310    6.95232e-310  …    0.0    0.0    0.0    0.0  NaN  
   6.95232e-310    6.95232e-310     NaN      0.0    0.0    0.0    0.0
   6.95232e-310    6.95232e-310     NaN    NaN      0.0    0.0    0.0
 NaN             NaN                  0.0  NaN    NaN      0.0    0.0
   6.95232e-310    0.0                0.0    0.0  NaN    NaN      0.0

231

### Refactor code to use simd efficiently

With a single `for` loop it is pretty straightforward to know when we can use `@simd`. What about when we have more than a single loop?