# Performance optimization exercise 2

Optimize the following function without changing the result it writes into `A` (up to floating-point rounding).

In [1]:
# Baseline (intentionally unoptimized) kernel of the exercise.
#
# For every i, j in 1:N it overwrites
#     A[i, j] = B[i, j] * (sin(val)^2 - cos(val)^2),   val = mod(v[i], 256)
# and returns the mutated `A`. The factor depends only on the first index `i`.
function work!(A, B, v, N)
    # Placeholder; always overwritten inside the loop before it is read.
    val = 0
    for i in 1:N
        # The inner loop walks the second index `j` while `i` stays fixed. Julia arrays
        # are column-major, so consecutive iterations are not adjacent in memory.
        for j in 1:N
            # Depends only on `i`, yet it is recomputed on every `j` iteration.
            val = mod(v[i],256);
            # sin and cos are each evaluated twice per element.
            A[i,j] = B[i,j]*(sin(val)*sin(val)-cos(val)*cos(val));
        end
    end
    return A
end

work! (generic function with 1 method)

The (fixed) input is given by:

In [2]:
# Fixed benchmark input: two N×N Float64 matrices and the integer vector 1, 2, …, N.
N = 4000
A = Float64[i + j for i in 1:N, j in 1:N]  # A[i, j] = i + j
B = Float64[i - j for i in 1:N, j in 1:N]  # B[i, j] = i - j
v = collect(1:N)

# Run the baseline once on this input (trailing `;` suppresses the cell output).
work!(A, B, v, N);

You can benchmark with the following code snippet. Performance is reported in mega-iterations per second (MIt/s): the **higher** the value, the better!

In [3]:
using BenchmarkTools
# Time the baseline `work!`; `$` interpolates the global inputs into the benchmarked call.
runtime = @belapsed work!($A, $B, $v, $N);
# One call performs N*N inner-loop iterations; 1e-6 scales that to mega-iterations,
# and dividing by the measured runtime yields the rate printed below.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 17.627403063710076 MIt/s


In [8]:
import Base.Threads: @threads


## Optimizations

In [15]:
# Multi-threaded, optimized counterpart of `work!`.
#
# Overwrites the leading N×N part of `A` in place with
#     A[r, c] = B[r, c] * (sin(m)^2 - cos(m)^2),   m = mod(v[r], 256)
# and returns `A`. The factor belongs to the ROW index `r`, exactly as in `work!`.
#
# Arguments:
#   A, B - matrices with at least N rows and N columns (`A` is mutated)
#   v    - vector with at least N entries
#   N    - number of rows/columns to process
#
# Throws a `BoundsError` if `1:N` is not a valid index range for `A`, `B` or `v`.
function work_opt(A, B, v, N)
    # Validate once up front so the hot loop can safely skip per-element bounds checks.
    checkbounds(A, 1:N, 1:N)
    checkbounds(B, 1:N, 1:N)
    checkbounds(v, 1:N)

    # sin(x)^2 - cos(x)^2 == -cos(2x): one trig call per row instead of four per element.
    scale = [-cos(2 * mod(v[row], 256)) for row in 1:N]

    # Columns are distributed over threads; each iteration writes a distinct column of `A`.
    @threads for col in 1:N
        # `@inbounds` sits on the inner loop because `@threads` wraps its body in a
        # closure, which an `@inbounds` placed outside of it does not reliably reach.
        # The inner loop runs over the first index, i.e. contiguous column-major memory.
        @inbounds for row in 1:N
            A[row, col] = B[row, col] * scale[row]
        end
    end
    return A
end

# Single-threaded, optimized counterpart of `work!`.
#
# Overwrites the leading N×N part of `A` in place with
#     A[r, c] = B[r, c] * (sin(m)^2 - cos(m)^2),   m = mod(v[r], 256)
# and returns `A`. The factor belongs to the ROW index `r`, exactly as in `work!`.
#
# Arguments:
#   A, B - matrices with at least N rows and N columns (`A` is mutated)
#   v    - vector with at least N entries
#   N    - number of rows/columns to process
#
# Throws a `BoundsError` if `1:N` is not a valid index range for `A`, `B` or `v`.
function work_opt2(A, B, v, N)
    # Validate once up front; this is what makes the `@inbounds` below safe.
    checkbounds(A, 1:N, 1:N)
    checkbounds(B, 1:N, 1:N)
    checkbounds(v, 1:N)

    # sin(x)^2 - cos(x)^2 == -cos(2x): one trig call per row instead of four per element.
    scale = [-cos(2 * mod(v[row], 256)) for row in 1:N]

    # Outer loop over columns, inner loop over rows: the inner loop touches
    # contiguous memory because Julia arrays are stored column-major.
    @inbounds for col in 1:N
        for row in 1:N
            A[row, col] = B[row, col] * scale[row]
        end
    end
    return A
end


work_opt2 (generic function with 1 method)

In [16]:
# Time the threaded variant `work_opt` on the same inputs (`$` interpolates the globals).
runtime = @belapsed work_opt($A, $B, $v, $N);
# N*N iterations per call, scaled by 1e-6 to mega-iterations, per measured runtime.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 694.3221846743393 MIt/s


In [17]:
# Time the single-threaded variant `work_opt2` on the same inputs (`$` interpolates the globals).
runtime = @belapsed work_opt2($A, $B, $v, $N);
# N*N iterations per call, scaled by 1e-6 to mega-iterations, per measured runtime.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 733.8261954670822 MIt/s
