In [32]:
# You should start your notebook with:  JULIA_NUM_THREADS=4 jupyter notebook
Base.Threads.nthreads()

4

In [33]:
N = 1000

A = rand(N,N)
B = rand(N,N)
C = A *B;

### Naive matrix multiply

- select row `r` in A, let's call it `row_r`

- select col `c` in B, let's call it `col_c`

- compute element `C[r,c]` as the scalar product of `row_ by` `row_c`



In [4]:
C2 = zeros(size(A));


function dot_product_row_r_col_c(A,r,B,c, n_elements)
    value = zero(eltype(A))
    for i in 1:n_elements
        value += A[r,i] * B[i,c] 
    end
    return value    
end 


dot_product_row_r_col_c (generic function with 1 method)

In [5]:

"""
Naive matrix multiply of A and B
"""
function naive_matmul!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
        
    for r in 1:n_rows_A
        for c in 1:n_cols_B
           C[r,c] = dot_product_row_r_col_c(A,r,B,c, n_elements)
        end
    end
    
end

naive_matmul!

In [6]:
@time naive_matmul!(A,B,C2)

  2.256420 seconds (33.55 k allocations: 1.741 MiB)


In [7]:
@time C = A * B;

  0.050353 seconds (6 allocations: 7.630 MiB)


In [8]:
isapprox(C,C2;rtol=0.000001)

true

### Accessing columns in the transposed of `A` instead of rows in `A`

Notice that the elements of an Array in Julia are stroed in memory by column. 

This makes the memory access of `A[r,i]` in the function `dot_product_row_r_col_c` slow (where `i` is changing).

We can improve this by by accessing `A[i,r]` in a new function `dot_product_col_r_col_c`. To have an equivalent computation we simply transpose `A` and then select a row of `A` (a column of `A_t`) and do the dot product with a column of `B`.

In [9]:
C2 = zeros(size(A));

function dot_product_col_r_col_c(A,r,B,c, n_elements)
    value = zero(eltype(A))
    
    for i in 1:n_elements
        value += A[i,r] * B[i,c] 
    end
    return value    
end 

dot_product_col_r_col_c (generic function with 1 method)

In [17]:

"""
Naive matrix multiply of A and B
"""
function naive_matmul_2!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    A_t = copy(transpose(A))
    for r in 1:n_rows_A
        for c in 1:n_cols_B
           C[r,c] = dot_product_col_r_col_c(A_t,r,B,c, n_elements)
        end
    end
    
end

naive_matmul_2!

In [18]:
@time naive_matmul_2!(A,B,C2)

  1.231909 seconds (29.08 k allocations: 9.147 MiB)


In [19]:
isapprox(C,C2;rtol=0.000001)

true

Notice that even taking into account that we make a copy of A inside the function this version is twice as fast.


#### Using SIMD instructions

The function `dot_product_col_r_col_c` can benefit from SIMD instructions

In [101]:


@inline function dot_product_col_r_col_c_simd(A,r,B,c, n_elements)
    value = zero(eltype(A))
    
    @simd for i in 1:n_elements
       @inbounds value += A[i,r] * B[i,c] 
    end
    return value    
end 



dot_product_col_r_col_c_simd (generic function with 1 method)

In [102]:

function naive_matmul_3!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    A_t = copy(transpose(A))
    for r in 1:n_rows_A
        for c in 1:n_cols_B
           C[r,c] = dot_product_col_r_col_c_simd(A_t,r,B,c, n_elements)
        end
    end
end

naive_matmul_3! (generic function with 1 method)

In [103]:
C2 = zeros(size(A));

@time naive_matmul_3!(A,B,C2)

  0.619295 seconds (78.14 k allocations: 11.712 MiB)


In [104]:
isapprox(C,C2;rtol=0.000001)

true

Notice that A*B is still 10x faster

In [15]:
@time A*B;

  0.055422 seconds (6 allocations: 7.630 MiB, 14.96% gc time)


#### Blocked version version

In [None]:
C2 = zeros(size(A));

function block_times_block(A, r, B, c, block_size)
    value = zero(eltype(A))
    
    for block_r in r:r+block_size
        for block_c in c:c+block_size
            @inbounds value += A[r,c] * B[r,c]  
        end
    end
    return value    
end 



In [172]:

function naive_matmul_4!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    bs = 10
    n_elements_block = bs*bs
    
    for row in 1:bs:n_rows_A-bs
        for col in 1:bs:n_cols_B-bs
            for block_row in row:(row+bs)
                for block_col in col:(col+bs)
                    res = 0.0
                    #C[row:row+bs,col:col+bs] = block_times_block(A, row, B, col, n_elements_block)
                    for k in 1:n_elements_block
                        res += A[k, block_row] * B[block_col, k]
                    end
                    C[row, col] = res
                end
            end
        end
    end
    
end

naive_matmul_4! (generic function with 1 method)

In [173]:
C2 = zeros(size(A));

@time naive_matmul_4!(A,B,C2)

  0.175196 seconds (62.81 k allocations: 3.371 MiB)


In [174]:
isapprox(C,C2;rtol=0.000001)

false

In [None]:
B=10

for (int r=0; i<j, i+=B)