# Performance optimization exercise 1

Optimize the function `work!` in the following code. You may change the function name, the function signature, and the function body.

However, the types and sizes of the inputs `N`, `A`, `b`, `c` are fixed and **may not be changed.**

In [151]:
# function you should optimize
# Baseline version: for every i in 1:N it multiplies A by b[i]*c, then overwrites b[i]
# with the sum of that product divided by N^2. `b` and `c` are not parameters; they are
# looked up in the enclosing (global) scope. Returns b.
function work!(A, N)
    D = zeros(N,N)  # this initial array is never read: D is rebound inside the loop
    for i in 1:N
        D = b[i]*c*A  # rebinds D to a newly computed array (not an in-place update)
        b[i] = sum(D)/N^2
    end
    return b
end

# fixed input (do not change!)
# NOTE: N, A, b and c are plain (non-const) globals; `work!` reads `b` and `c` directly
# from this scope rather than receiving them as arguments.
N = 10
A = [float(i+j) for i in 1:N, j in 1:N] # matrix of size NxN
b = collect(Float64, 1:N) # vector of length N
c = 1.23;

# desired result (do not change!)
# For the inputs above, RESULT[i] ≈ i * c * sum(A)/N^2 = i * 1.23 * 11 = i * 13.53.
const RESULT = [13.53, 27.06, 40.59, 54.12, 67.65, 81.18, 94.71, 108.24, 121.77, 135.3];



In [152]:
using BenchmarkTools
@btime work!($A, $N);

  2.289 μs (51 allocations: 10.25 KiB)


## Optimizations

### Avoiding globals

In [153]:
@code_warntype work!(A,N)

Variables
  #self#::Core.Compiler.Const(work!, false)
  A::Array{Float64,2}
  N::Int64
  D::Any
  @_5::Union{Nothing, Tuple{Int64,Int64}}
  i::Int64

Body::Any
1 ─       (D = Main.zeros(N, N))
│   %2  = (1:N)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│         (@_5 = Base.iterate(%2))
│   %4  = (@_5 === nothing)::Bool
│   %5  = Base.not_int(%4)::Bool
└──       goto #4 if not %5
2 ┄ %7  = @_5::Tuple{Int64,Int64}::Tuple{Int64,Int64}
│         (i = Core.getfield(%7, 1))
│   %9  = Core.getfield(%7, 2)::Int64
│   %10 = Base.getindex(Main.b, i)::Any
│         (D = %10 * Main.c * A)
│   %12 = Main.sum(D)::Any
│   %13 = Core.apply_type(B … [output truncated; the `::Any` annotations were highlighted in red in the original colored output]

In [154]:
# Same loop as `work!`, except that `b` and `c` are received as arguments instead of being
# read from global scope. Overwrites each b[i] with sum(b[i]*c*A)/N^2 and returns b.
function work1!(A, N, b, c)
    D = zeros(N,N)  # this initial array is never read: D is rebound inside the loop
    for i in 1:N
        D = b[i]*c*A  # rebinds D to a newly computed array on every iteration
        b[i] = sum(D)/N^2
    end
    return b
end

work1! (generic function with 1 method)

In [155]:
@code_warntype work1!(A,N,b,c)

Variables
  #self#::Core.Compiler.Const(work1!, false)
  A::Array{Float64,2}
  N::Int64
  b::Array{Float64,1}
  c::Float64
  D::Array{Float64,2}
  @_7::Union{Nothing, Tuple{Int64,Int64}}
  i::Int64

Body::Array{Float64,1}
1 ─       (D = Main.zeros(N, N))
│   %2  = (1:N)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│         (@_7 = Base.iterate(%2))
│   %4  = (@_7 === nothing)::Bool
│   %5  = Base.not_int(%4)::Bool
└──       goto #4 if not %5
2 ┄ %7  = @_7::Tuple{Int64,Int64}::Tuple{Int64,Int64}
│         (i = Core.getfield(%7, 1))
│   %9  = Core.getfield(%7, 2)::Int64
│   %10 = Base.getindex(b, i)::Float64
│         (D = %10 * c * A)
│   %12 = Main.sum(D)::Float6 … [output truncated]

In [156]:
# Rebuild b before checking: the earlier benchmark of `work!` overwrote the global b in place.
b = collect(Float64, 1:N) # vector of length N

# Compare against the expected values; ≈ tolerates floating-point rounding.
work1!(A,N,b,c) ≈ RESULT

true

In [157]:
@btime work1!($A, $N, $b, $c);

  1.239 μs (11 allocations: 9.63 KiB)


### Avoiding globals + temporary allocations

In [158]:
"""
    work2!(A, N, b, c)

Overwrite each `b[i]` (for `i` in `1:N`) with `sum(b[i]*c*A)/N^2` and return `b`.

The scaled copy of `A` is written into one `N`×`N` buffer that is allocated once up front
and refilled by a fused in-place broadcast on every iteration.
"""
function work2!(A, N, b, c)
    scratch = zeros(N, N)
    for idx in 1:N
        # In-place fused broadcast: no new array is created for the product.
        scratch .= b[idx] .* c .* A
        b[idx] = sum(scratch) / N^2
    end
    return b
end

# Rebuild b: the work functions overwrite it in place, so earlier cells have changed it.
b = collect(Float64, 1:N) # vector of length N
@assert work2!(A,N,b,c) ≈ RESULT

# NOTE(review): b is not reset between benchmark evaluations, so presumably every
# evaluation rescales the already-scaled values — confirm this does not skew the timing.
@btime work2!($A, $N, $b, $c);

  846.838 ns (1 allocation: 896 bytes)


In [159]:
"""
    work3!(A, N, b, c)

Overwrite each `b[i]` (for `i` in `1:N`) with `sum(b[i]*c*A)/N^2` and return `b`.

The scaled copy of `A` is written element by element into a single `N`×`N` buffer that is
allocated once and reused on every iteration.

# Throws
- `DimensionMismatch` if `A` does not have the same indices as the `N`×`N` work buffer.
- `BoundsError` if `b` has fewer than `N` elements.
"""
function work3!(A, N, b, c)
    D = zeros(N, N)
    for i in 1:N
        # Read b[i] outside the @inbounds region so a too-short `b` raises a BoundsError
        # instead of being read without a bounds check.
        bi = b[i]
        # eachindex(D, A) checks that D and A share the same indices; that check is what
        # makes skipping the per-element bounds checks below safe.
        @inbounds for j in eachindex(D, A)
            D[j] = bi * c * A[j]
        end
        b[i] = sum(D) / N^2
    end
    return b
end

# Start from a fresh b, since every work function mutates the vector it is given.
b = collect(Float64, 1:N) # vector of length N
@assert work3!(A,N,b,c) ≈ RESULT

# NOTE(review): the same b is passed to every benchmark evaluation without being reset;
# presumably its values keep growing across evaluations — confirm this is acceptable.
@btime work3!($A, $N, $b, $c);

  352.118 ns (1 allocation: 896 bytes)


### Avoiding globals + temporary allocations and merging `sum` with loop

In [160]:
"""
    work4!(A, N, b, c)

Overwrite each `b[i]` (for `i` in `1:N`) with `sum(b[i]*c*A)/N^2` and return `b`.

The scaled elements of `A` are written into a single reusable `N`×`N` buffer and summed in
the same loop, so no separate pass over the buffer is needed.

# Throws
- `DimensionMismatch` if `A` does not have the same indices as the `N`×`N` work buffer.
- `BoundsError` if `b` has fewer than `N` elements.
"""
function work4!(A, N, b, c)
    D = zeros(N, N)
    for i in 1:N
        # Read b[i] outside the @inbounds region so a too-short `b` raises a BoundsError
        # instead of being read without a bounds check.
        bi = b[i]
        s = 0.0
        # eachindex(D, A) checks that D and A share the same indices; that check is what
        # makes skipping the per-element bounds checks below safe.
        @inbounds @simd for j in eachindex(D, A)
            D[j] = bi * c * A[j]
            s += D[j]
        end
        b[i] = s / N^2
    end
    return b
end

# Reset b to 1.0, 2.0, …, N before the check; previous cells overwrote it in place.
b = collect(Float64, 1:N) # vector of length N
@assert work4!(A,N,b,c) ≈ RESULT

# NOTE(review): b is reused, unreset, across benchmark evaluations, so presumably it is
# rescaled on every evaluation — confirm the timing is not affected by the growing values.
@btime work4!($A, $N, $b, $c);

  216.105 ns (1 allocation: 896 bytes)


### Realizing that one can factor out `b` and `c`

In [161]:
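# Derivation, starting from the original function:
# function work!(A, N)
#     D = zeros(N,N)
#     for i in 1:N
#         D = b[i]*c*A
#         b[i] = sum(D)/N^2
#     end
# end

# Step 1: `D` is used only once per iteration, so inline it.
# function work!(A, N)
#     for i in 1:N
#         b[i] = sum(b[i]*c*A)/N^2
#     end
# end

# Step 2: `b[i]` and `c` are scalars, so they can be pulled out of the sum.
# function work!(A, N)
#     for i in 1:N
#         b[i] = b[i]*c*sum(A)/N^2
#     end
# end

# Step 3: `c*sum(A)/N^2` does not depend on `i`, so compute it once before the loop.
# function work!(A, N)
#     D = c*sum(A)/N^2
#     for i in 1:N
#         b[i] *= D
#     end
# end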

"""
    work5!(A, N, b, c)

Multiply every element of `b` in place by the scalar `c * sum(A) / N^2` and return `b`.

The factor is the same for every element of `b`, so it is computed once and applied with
a single in-place broadcast.
"""
function work5!(A, N, b, c)
    factor = c * sum(A) / N^2
    b .*= factor
    return b
end

# Recreate b, because the calls in earlier cells modified it in place.
b = collect(Float64, 1:N) # vector of length N
@assert work5!(A, N, b, c) ≈ RESULT

# NOTE(review): every benchmark evaluation multiplies the same b again, so presumably its
# values grow without bound over many evaluations — confirm this does not affect the timing.
@btime work5!($A, $N, $b, $c);

  27.836 ns (0 allocations: 0 bytes)
