In [172]:
# You should start your notebook with:  JULIA_NUM_THREADS=4 jupyter notebook
# Base.Threads.nthreads()

In [613]:
using BenchmarkTools
using TimerOutputs

In [615]:
N = 1000

A = rand(N,N)
B = rand(N,N)
C = A *B;

### Naive matrix multiply

- select row `r` in A, let's call it `row_r`

- select col `c` in B, let's call it `col_c`

- compute element `C[r,c]` as the scalar product of `row_ by` `row_c`




In [617]:
C2 = zeros(size(A));


function dot_product_row_r_col_c(A,r,B,c, n_elements)
    value = zero(eltype(A))
    for i in 1:n_elements
        value += A[r,i] * B[i,c] 
    end
    return value    
end 


dot_product_row_r_col_c (generic function with 1 method)

In [634]:

"""
Naive matrix multiply of A and B
"""

function naive_matmul!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    
    for r in 1:n_rows_A
        for c in 1:n_cols_B
           C[r,c] = dot_product_row_r_col_c(A,r,B,c, n_elements)
        end
    end
    
end

naive_matmul! (generic function with 1 method)

In [635]:
@time naive_matmul!(A,B,C2)

  2.574466 seconds (27.50 k allocations: 1.444 MiB)


In [636]:
@time C = A * B;

  0.063270 seconds (6 allocations: 7.630 MiB, 14.74% gc time)


In [637]:
isapprox(C,C2;rtol=0.000001)

true

### Accessing columns in the transposed of `A` instead of rows in `A`

Notice that the elements of an Array in Julia are stroed in memory by column. 

This makes the memory access of `A[r,i]` in the function `dot_product_row_r_col_c` slow (where `i` is changing).

We can improve this by by accessing `A[i,r]` in a new function `dot_product_col_r_col_c`. To have an equivalent computation we simply transpose `A` and then select a row of `A` (a column of `A_t`) and do the dot product with a column of `B`.

In [638]:
C2 = zeros(size(A));

function dot_product_col_r_col_c(A,r,B,c, n_elements)
    value = zero(eltype(A))
    
    for i in 1:n_elements
        value += A[i,r] * B[i,c] 
    end
    return value    
end 

dot_product_col_r_col_c (generic function with 1 method)

In [639]:

"""
Naive matrix multiply of A and B
"""
function naive_matmul_2!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    A_t = copy(transpose(A))
    for r in 1:n_rows_A
        for c in 1:n_cols_B
           C[r,c] = dot_product_col_r_col_c(A_t,r,B,c, n_elements)
        end
    end
    
end

naive_matmul_2!

In [640]:
@time naive_matmul_2!(A,B,C2)

  1.285091 seconds (35.13 k allocations: 9.443 MiB, 0.42% gc time)


In [641]:
isapprox(C,C2;rtol=0.000001)

true

Notice that even taking into account that we make a copy of A inside the function this version is twice as fast.

We can profile the function with `TimeroOutputs.jl` to see the execution time spend in the generation of the transposed.

In [656]:
const timer = TimerOutput()

"""
Naive matrix multiply of A and B
"""
function naive_matmul_2!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    @timeit timer "transposed" A_t = copy(transpose(A))
    @timeit timer "nrows" begin 
        for r in 1:n_rows_A
            for c in 1:n_cols_B
               @timeit timer "dot product" C[r,c] = dot_product_col_r_col_c(A_t,r,B,c, n_elements)
            end
        end
    end
    
end



naive_matmul_2!

In [657]:
@time naive_matmul_2!(A,B,C2)

  1.664015 seconds (60.03 k allocations: 10.525 MiB)


In [664]:
print(timer)

 [1m────────────────────────────────────────────────────────────────────────[22m
 [1m                        [22m        Time                   Allocations      
                         ──────────────────────   ───────────────────────
    Tot / % measured:         81.8s / 1.95%           46.2MiB / 16.5%    

 Section         ncalls     time   %tot     avg     alloc   %tot      avg
 ────────────────────────────────────────────────────────────────────────
 nrows                1    1.59s   100%   1.59s      864B  0.01%     864B
   dot product    1.00M    1.52s  95.1%  1.52μs     0.00B  0.00%    0.00B
 transposed           1   5.78ms  0.36%  5.78ms   7.63MiB  100%   7.63MiB
 [1m────────────────────────────────────────────────────────────────────────[22m

#### Using SIMD instructions

The function `dot_product_col_r_col_c` can benefit from SIMD instructions

In [666]:
@inline function dot_product_col_r_col_c_simd(A,r,B,c, n_elements)
    value = zero(eltype(A))
    
    @simd for i in 1:n_elements
       @inbounds value += A[i,r] * B[i,c] 
    end
    return value    
end

dot_product_col_r_col_c_simd (generic function with 1 method)

In [667]:
function naive_matmul_3!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    A_t = copy(transpose(A))
    for r in 1:n_rows_A
        for c in 1:n_cols_B
           C[r,c] = dot_product_col_r_col_c_simd(A_t,r,B,c, n_elements)
        end
    end
end

naive_matmul_3! (generic function with 1 method)

In [668]:
C2 = zeros(size(A));
@time naive_matmul_3!(A,B,C2)

  0.625690 seconds (78.14 k allocations: 11.712 MiB)


In [669]:
isapprox(C,C2;rtol=0.000001)

true

Notice that A*B is still 10x faster

In [663]:
@time A*B;

  0.048397 seconds (6 allocations: 7.630 MiB)


#### Blocked version version 1


The approach presented here comes from http://csapp.cs.cmu.edu/public/waside/waside-blocking.pdf

The key of this algorithm is to avoid data movenents and reuse the data in the cache. This version is known as a blocked version of matrix multiply.


Let us consider we want to do `A * B`. We can get a block of size `bs x bs` from `B` and try to reuse it as much as possible without changing the block. To do this, after a block from `B` is selected, we will iterate over all slices of `A` of size `bs` (without changing B). Therefore, first we have a double for loop that gives us the left hand side coordinates of a block of size `bs x bs`.


```julia
# assume n is the width and heigh of A, B and C (we have squared matrices)

for row in 1:bs:n
    for col in 1:bs:n
        # (row,col) are the coordinates of the top left hand block from B           
    end
end
```

Now that we know that a block we iterate over row slices from `A`.
To do this  we will add a for loop that iterates over `i in 1:n`.


```julia
# assume n is the width and heigh of A, B and C (we have squared matrices)

for kk in 1:bs:n       # iterates over cols of A (rows of B)
    for jj in 1:bs:n   # iterates over rows of B
    
       # (kk,jj) are the coordinates of the top left hand block from B, B_block=B[kk:kk+bs, jj:jj+bs]
       # [      |             |              |        ]
       # [  B_block[:,1], B_block[:,2], B_block[:,bs] ]
       # [      |             |              |        ]
                       
       for i in 1:n                       # pick slice A[i,kk:kk+bs]   
           for j in jj:jj+bs              # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j
               s = C[i,j]
               for k in kk:kk+bs
                  s += A[i,k] * B[k,j]
               end
               C[i,j] =s
           end
        end
    end
end
```


```julia
# scalar product of  A[i,kk:kk+bs] * B_block[:,j] for all j in the B_block

function scalar_product

    for j in jj:jj+bs
        # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j
  
        s = C[i,j]
        for k in kk:kk+bs
           s += A[i,k] * B[k,j]
        end
        C[i,j] =s
    end
    return s
end
 ```
 
 
 


In [670]:
n = 10
n=20
for row in 1:bs:n
    for col in 1:bs:n
        # (row,col) are the coordinates of the top left hand block from B           
        println(row, " ",col)
    end
end

1 1
1 11
11 1
11 11


In [671]:
function matmul_blocked_1!(A, B, C)
    bs = 10
    n = size(A,1)

    @inbounds for kk in 1:bs:n               # iterates over cols of A (rows of B)
        for jj in 1:bs:n                     # iterates over rows of B    

            for i in 1:n                     # pick slice A[i,kk:kk+bs]
               for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j
                   s = C[i,j]
                   for k in kk:kk+bs-1
                      s += A[i,k] * B[k,j]
                   end
                   C[i,j] =s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_1! (generic function with 1 method)

In [672]:
size(A), size(B)

((1000, 1000), (1000, 1000))

In [675]:
C2 = zeros(size(A));
@time matmul_blocked_1!(A,B,C2)

  0.943197 seconds (4 allocations: 160 bytes)


In [676]:
isapprox(C,C2;rtol=0.000001)

true

Profiling the funtion

In [710]:
reset_timer!()

 [1m──────────────────────────────────────────────────────────────────[22m
 [1m                  [22m        Time                   Allocations      
                   ──────────────────────   ───────────────────────
 Tot / % measured:      148μs / 0.00%           2.73KiB / 0.00%    

 Section   ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────
 [1m──────────────────────────────────────────────────────────────────[22m

In [711]:
function matmul_blocked_1!(A, B, C)
    bs = 10
    n = size(A,1)

    @inbounds for kk in 1:bs:n               # iterates over cols of A (rows of B)
         for jj in 1:bs:n                     # iterates over rows of B    

             @timeit "for i" for i in 1:n                     # pick slice A[i,kk:kk+bs]
               @timeit "for j" for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j
                   s = C[i,j]
                   @timeit "scalar product" for k in kk:kk+bs-1
                      s += A[i,k] * B[k,j]
                   end
                   C[i,j] = s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_1! (generic function with 1 method)

In [712]:
C2 = zeros(size(A));
@time matmul_blocked_1!(A,B,C2)

 17.351815 seconds (20.11 M allocations: 310.127 MiB, 0.14% gc time)


In [714]:
print_timer()

 [1m─────────────────────────────────────────────────────────────────────────────[22m
 [1m                             [22m        Time                   Allocations      
                              ──────────────────────   ───────────────────────
       Tot / % measured:           36.1s / 47.8%            318MiB / 95.9%    

 Section              ncalls     time   %tot     avg     alloc   %tot      avg
 ─────────────────────────────────────────────────────────────────────────────
 for i                 10.0k    17.3s   100%  1.73ms    305MiB  100%   31.3KiB
   for j               10.0M    16.1s  93.5%  1.61μs    153MiB  50.0%    16.0B
     scalar product     100M    7.91s  45.9%  79.1ns     0.00B  0.00%    0.00B
 [1m─────────────────────────────────────────────────────────────────────────────[22m

In [735]:
reset_timer!()

 [1m──────────────────────────────────────────────────────────────────[22m
 [1m                  [22m        Time                   Allocations      
                   ──────────────────────   ───────────────────────
 Tot / % measured:      931μs / 0.00%           2.73KiB / 0.00%    

 Section   ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────
 [1m──────────────────────────────────────────────────────────────────[22m

In [736]:
function matmul_blocked_1!(A, B, C)
    bs = 10
    n = size(A,1)

    @inbounds for kk in 1:bs:n               # iterates over cols of A (rows of B)
         for jj in 1:bs:n                     # iterates over rows of B    

             @timeit "for i" for i in 1:n                     # pick slice A[i,kk:kk+bs]
               @timeit "for j" for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j
                   @timeit "get C[i,j]" s = C[i,j]
                   @timeit "scalar product" for k in kk:kk+bs-1
                      s += A[i,k] * B[k,j]
                   end
                   @timeit "store s to C[i,j]" C[i,j] = s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_1! (generic function with 1 method)

In [739]:
reset_timer!()
C2 = zeros(size(A));
matmul_blocked_1!(A,B,C2)

In [742]:
print_timer()

 [1m──────────────────────────────────────────────────────────────────────────────[22m
 [1m                              [22m        Time                   Allocations      
                               ──────────────────────   ───────────────────────
       Tot / % measured:            1257s / 4.68%           4.63GiB / 100%     

 Section               ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────────────────
 for i                  10.0k    58.8s   100%  5.88ms   4.62GiB  100%    484KiB
   for j                10.0M    57.7s  98.1%  5.77μs   4.47GiB  96.8%     480B
     scalar product      100M    8.47s  14.4%  84.7ns     0.00B  0.00%    0.00B
     get C[i,j]          100M    7.43s  12.6%  74.3ns     0.00B  0.00%    0.00B
     store s to C[i,j]   100M    6.78s  11.5%  67.8ns     0.00B  0.00%    0.00B
 [1m──────────────────────────────────────────────────────────────────────────────[22m

#### Rewritting the code with dot products

Notice that this version is actually slower than our previous version of matmul naive with simd.

To ilustrate that the code inside the loop over `i in 1:n`.

```
  A[i,kk:kk+bs] * B_block[:,j] for all 
```

In [294]:
@inline function scalar_product_Arow_Bcol(A,i,B,j,kk,bs)
    s = 0.
    for k in kk:kk+bs-1
       @inbounds s += A[i,k] * B[k,j]
    end
    return s
end

scalar_product_Arow_Bcol (generic function with 1 method)

In [305]:
function matmul_blocked_2!(A, B, C)
    bs = 10
    n = size(A,1)

    @inbounds for kk in 1:bs:n            # iterates over cols of A (rows of B)
        for jj in 1:bs:n                  # iterates over rows of B    
            
            for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j, store i C[i,j]
               for i in 1:n               # pick slice A[i,kk:kk+bs]
                   s = scalar_product_Arow_Bcol(A, i, B, j, kk, bs)
                   C[i,j] +=s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_2! (generic function with 1 method)

In [314]:
C2 = zeros(size(A));
@time matmul_blocked_2!(A,B,C2)

  0.721783 seconds (4 allocations: 160 bytes)


In [307]:
isapprox(C,C2;rtol=0.000001)

true

In [316]:
@time A*B;

  0.047823 seconds (6 allocations: 7.630 MiB)


#### Blocked version version 3

Using views 

In [380]:
@inline function scalar_product(v1,v2)
    s = 0.
    for i in 1:length(v1)
       @inbounds s += v1[i] * v2[i]
    end
    return s
end

scalar_product (generic function with 1 method)

In [425]:
function matmul_blocked_3!(A, B, C)
    bs = 10
    n = size(A,1)
    @inbounds for kk in 1:bs:n            # iterates over cols of A (rows of B)
        for jj in 1:bs:n                  # iterates over rows of B    
            
            for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j, store i C[i,j]
                B_block_col = @view B[kk:kk+bs-1,j]
                for i in 1:n               # pick slice A[i,kk:kk+bs]
                   A_slice =  @view A[i, kk:kk+bs-1]
                   s = scalar_product(A_slice,B_block_col)
                   C[i,j] +=s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_3! (generic function with 1 method)

In [427]:
C2 = zeros(size(A));
@time matmul_blocked_3!(A,B,C2)

  0.874477 seconds (4 allocations: 160 bytes)


In [428]:
isapprox(C,C2;rtol=0.000001)

true

What if we transpose A before so we can access rows (cols in the transposed) fast?

In [430]:
function matmul_blocked_4!(A, B, C)
    bs = 10
    n = size(A,1)
    At = transpose(A)
    
    @inbounds for kk in 1:bs:n            # iterates over cols of A (rows of B)
        for jj in 1:bs:n                  # iterates over rows of B    
            
            for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j, store i C[i,j]
                B_block_col = @view B[kk:kk+bs-1,j]
                for i in 1:n               # pick slice A[i,kk:kk+bs] which is At[kk:kk+bs-1,i]
                   A_slice =  @view At[kk:kk+bs-1,i]
                   s = scalar_product(A_slice,B_block_col)
                   C[i,j] +=s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_4! (generic function with 1 method)

In [433]:
C2 = zeros(size(A));
@time matmul_blocked_4!(A,B,C2)

  0.888835 seconds (4 allocations: 160 bytes)


In [434]:
isapprox(C,C2;rtol=0.000001)

true

### Loading slices as SIMD vectors

In [605]:
@inline function scalar_product_vec(v1,v2)
    return sum(v1*v2)
end

scalar_product_vec (generic function with 1 method)

In [606]:
function matmul_blocked_5!(A, B, C)
    bs = 10
    n = size(A,1)
    T = eltype(A)
    #v_type = Vec{bs, T}
    vrange = VecRange{bs}(1)

    @inbounds for kk in 1:bs:n            # iterates over cols of A (rows of B)
        for jj in 1:bs:n                  # iterates over rows of B    
            
            for j in jj:jj+bs-1           # Make dot product  A[i,kk:kk+bs] * B_block[:,j] for all j, store i C[i,j]
                B_block_col = B[kk+vrange, j]  #@view B[kk:kk+bs-1,j] 
                for i in 1:n               # pick slice A[i,kk:kk+bs]
                   A_slice =   A[i, kk:kk+bs-1]
                   s = scalar_product_vec(A_slice,B_block_col)
                   C[i,j] +=s
               end
            end
        end
    end
    # nothing returned, C updated
end

matmul_blocked_5! (generic function with 1 method)

In [607]:
C2 = zeros(size(A));
@time matmul_blocked_5!(A,B,C2)

MethodError: MethodError: no method matching *(::Array{Float64,1}, ::Vec{10,Float64})
Closest candidates are:
  *(::Any, ::Any, !Matched::Any, !Matched::Any...) at operators.jl:529
  *(!Matched::Union{Bool, Float16, Float32, Float64, Int128, Int16, Int32, Int64, Int8, UInt128, UInt16, UInt32, UInt64, UInt8, Ptr}, ::Vec{N,T<:Union{Float16, Float32, Float64}}) where {N, T<:Union{Float16, Float32, Float64}} at /Users/david/.julia/packages/SIMD/Am38N/src/SIMD.jl:1165
  *(::Union{DenseArray{T,1}, DenseArray{T,2}, Base.ReinterpretArray{T,2,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, Base.ReinterpretArray{T,1,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, Base.ReshapedArray{T,1,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, Base.ReshapedArray{T,2,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, SubArray{T,1,A,I,L} where L where I<:Tuple{Vararg{Union{Int64, AbstractRange{Int64}, Base.AbstractCartesianIndex},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, Base.ReshapedArray{T,N,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, DenseArray}, SubArray{T,2,A,I,L} where L where I<:Tuple{Vararg{Union{Int64, AbstractRange{Int64}, Base.AbstractCartesianIndex},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, Base.ReshapedArray{T,N,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, DenseArray}} where T, !Matched::LinearAlgebra.Adjoint{#s617,#s616} where #s616<:LinearAlgebra.LQPackedQ where #s617) at /Users/sabae/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.2/LinearAlgebra/src/lq.jl:216
  ...

In [438]:
using SIMD

In [441]:
T = Float64
N = 4
a = rand(T, 10_000);
v_type = Vec{N, T}

Vec{4,Float64}

In [449]:
vload(v_type, A[:,1], 10)

<4 x Float64>[0.9209063615440263, 0.8689028116971553, 0.5137699213455982, 0.2980577029881102]

In [None]:
vload(v_type, A, 10)

In [475]:
xs = Array([1,2,3,4,5,6,7,8])
vrange = VecRange{4}(1)
xs[vrange]  

<4 x Int64>[1, 2, 3, 4]

In [478]:
Xs = zeros(10,10)
for i in 1:length(Xs)
    Xs[i]=i
end
vrange = VecRange{4}(1)
Xs[vrange,1]  

<4 x Float64>[1.0, 2.0, 3.0, 4.0]

In [502]:
Xs = zeros(6,6)
for i in 1:length(Xs)
    Xs[i]=i
end
vrange = VecRange{4}(1)
Xs[vrange,1], Xs[1:4,1]

(<4 x Float64>[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0])

In [586]:
Xs = zeros(6,6)
for i in 1:length(Xs)
    Xs[i]=i
end
vrange = VecRange{4}(1)
Xs[vrange,2], Xs[1:4,2]

(<4 x Float64>[7.0, 8.0, 9.0, 10.0], [7.0, 8.0, 9.0, 10.0])

In [591]:
Xs[2,1:4], Xs[2,vrange]

4-element Array{Float64,1}:
  2.0
  8.0
 14.0
 20.0

In [584]:
Xs[2,1:4], Xs[2,vrange]

ArgumentError: ArgumentError: invalid index: VecRange{4}(1) of type VecRange{4}

In [580]:
Xs = zeros(6,6)
for i in 1:length(Xs)
    Xs[i]=i
end
vrange = VecRange{4}(1)
p =1
Xs[p-1+vrange,2], Xs[p:p+N-1,2]

(<4 x Float64>[7.0, 8.0, 9.0, 10.0], [7.0, 8.0, 9.0, 10.0])

In [None]:
v = vload(v_type, a, i)

In [105]:
C2 = zeros(size(A));

function block_times_block(A, r, B, c, block_size)
    value = zero(eltype(A))
    
    for block_r in r:r+block_size
        for block_c in c:c+block_size
            @inbounds value += A[r,c] * B[r,c]  
        end
    end
    return value    
end 

block_times_block (generic function with 1 method)

In [48]:
n_rows_A=20
n_cols_B=20
bs=10
for row in 1:bs:n_rows_A
   for col in 1:bs:n_cols_B
       println(row, " ", col)
    end
end

1 1
1 11
11 1
11 11


In [55]:
a = rand(20);

In [79]:
for i in 0:10:19
    imax = i+10-1
    #print(i," ", i+10-1, " ")
    #println(length(a[i:imax]))
    println(i)
end

0
10


In [88]:
k=0
for i in 1:2
    k += 10
    imax = i+10-1
    #print(i," ", i+10-1, " ")
    #println(length(a[i:imax]))
    println(i, " ", k)
end

1 10
2 20


In [47]:
#A[11:20]

In [172]:

function naive_matmul_4!(A, B, C)
    n_rows_A = size(A,1)
    n_cols_B = size(B,2)
    n_elements = n_cols_B
    bs = 10
    n_elements_block = bs*bs
    
    for row in 1:bs:n_rows_A
        for col in 1:bs:n_cols_B
            for block_row in row:(row+bs)
                for block_col in col:(col+bs)
                    res = 0.0
                    #C[row:row+bs,col:col+bs] = block_times_block(A, row, B, col, n_elements_block)
                    for k in 1:n_elements_block
                        res += A[k, block_row] * B[block_col, k]
                    end
                    C[row, col] = res
                end
            end
        end
    end
    
end

naive_matmul_4! (generic function with 1 method)

In [173]:
C2 = zeros(size(A));

@time naive_matmul_4!(A,B,C2)

  0.175196 seconds (62.81 k allocations: 3.371 MiB)


In [174]:
isapprox(C,C2;rtol=0.000001)

false

In [None]:
B=10

for (int r=0; i<j, i+=B)