From 3a81a51178b96c1cf9c46b870240ff5a6becef92 Mon Sep 17 00:00:00 2001
From: Seyed Ali Ghasemi <info@gha3mi.com>
Date: Wed, 31 Jan 2024 11:24:57 +0100
Subject: [PATCH] Update matmul_mm

---
 benchmarks/matmul/README.md         |  35 +--
 benchmarks/matmul/matmul_mm.f90     | 322 +++++++++++++++-------------
 benchmarks/matmul/results/export.py |   6 +-
 fpm.rsp                             |  24 +--
 4 files changed, 204 insertions(+), 183 deletions(-)
diff --git a/benchmarks/matmul/README.md b/benchmarks/matmul/README.md
index e7b0b6b6..d863fa61 100644
--- a/benchmarks/matmul/README.md
+++ b/benchmarks/matmul/README.md
@@ -65,70 +65,73 @@ cd ../..
 
 ## matmul (matrix-matrix)
 
+**TODO**:
+- Regenerate the benchmark results and re-enable the result tables below (currently commented out).
+
 ### Intel Fortran Compiler (ifx)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_ifx_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_ifx_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_ifx_speedup.png" width="300"> |
 
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_ifx.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_ifx.html) -->
 
 ### Intel Fortran Compiler Classic (ifort)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_ifort_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_ifort_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_ifort_speedup.png" width="300"> |
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_ifort.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_ifort.html) -->
 
 ### GNU Fortran (gfortran)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_gfortran_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_gfortran_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_gfortran_speedup.png" width="300"> |
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_gfortran.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_gfortran.html) -->
 
 ### NVIDIA HPC (nvfortran)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_nvfortran_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_nvfortran_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mm_nvfortran_speedup.png" width="300"> |
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_nvfortran.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mm_nvfortran.html) -->
 
 ## matmul (matrix-vector)
 
 ### Intel Fortran Compiler (ifx)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_ifx_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_ifx_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_ifx_speedup.png" width="300"> |
 
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_ifx.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_ifx.html) -->
 
 ### Intel Fortran Compiler Classic (ifort)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_ifort_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_ifort_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_ifort_speedup.png" width="300"> |
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_ifort.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_ifort.html) -->
 
 ### GNU Fortran (gfortran)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_gfortran_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_gfortran_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_gfortran_speedup.png" width="300"> |
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_gfortran.html)
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_gfortran.html) -->
 
 ### NVIDIA HPC (nvfortran)
 
-| Elapsed Time | Performance | Speedup |
+<!-- | Elapsed Time | Performance | Speedup |
 |--------------|-------------|---------|
 | <img alt="matmul_elapsed_time" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_nvfortran_time.png" width="300"> | <img alt="matmul_performance" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_nvfortran_perf.png" width="300"> | <img alt="matmul_speedup" src="https://github.com/gha3mi/forbenchmark/raw/main/benchmarks/matmul/results/matmul_mv_nvfortran_speedup.png" width="300"> |
 
-[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_nvfortran.html)
\ No newline at end of file
+[View detailed table](https://raw.githack.com/gha3mi/forbenchmark/main/benchmarks/matmul/results/matmul_mv_nvfortran.html) -->
\ No newline at end of file
diff --git a/benchmarks/matmul/matmul_mm.f90 b/benchmarks/matmul/matmul_mm.f90
index 147bc60e..2769c5f5 100644
--- a/benchmarks/matmul/matmul_mm.f90
+++ b/benchmarks/matmul/matmul_mm.f90
@@ -1,159 +1,177 @@
 program benchmark_matmul_mm
 
-   use kinds
-   use formatmul
-   use forbenchmark
-
-   implicit none
-
-   type(benchmark)       :: bench
-   real(rk), allocatable :: A(:,:), B(:,:), C(:,:)
-   integer(ik)           :: m, n, o, p
-   integer               :: nl
-
-   call bench%init(12,'Benchmark matmul','benchmarks/matmul/results/matmul_mm', 10)
-
-   do p = 250_ik,1500_ik,250_ik
-
-      !===============================================================================
-      ! C(m,o) = A(m,n).B(n,o)
-      m = p
-      n = p
-      o = p
-
-      if (allocated(A)) deallocate(A)
-      if (allocated(B)) deallocate(B)
-      if (allocated(C)) deallocate(C)
-      allocate(A(m,n))
-      allocate(B(n,o))
-      allocate(C(m,o))
-      call random_number(A)
-      call random_number(B)
-      !===============================================================================
-
-
-      !===============================================================================
-      ! Reference
-      call bench%start_benchmark(1,'matmul',"C = matmul(A,B)",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B)
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(2,'m1',"C = matmul(A,B,option='m1')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m1')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(3,'m2',"C = matmul(A,B,option='m2')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m2')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(4,'m3',"C = matmul(A,B,option='m3')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m3')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(5,'m4',"C = matmul(A,B,option='m4')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m4')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(6,'m5',"C = matmul(A,B,option='m5')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m5')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(7,'m6',"C = matmul(A,B,option='m6')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m6')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(8,'m7',"C = matmul(A,B,option='m7')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m7')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(9,'m8',"C = matmul(A,B,option='m8')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m8')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(10,'m9',"C = matmul(A,B,option='m9')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m9')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(11,'m12',"C = matmul(A,B,option='m12')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m12')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-
-      !===============================================================================
-      call bench%start_benchmark(12,'m13',"C = matmul(A,B,option='m13')",[m*n*o])
-      do nl = 1,bench%nloops
-         C = matmul(A,B,option='m13')
-      end do
-      call bench%stop_benchmark(cmp_gflops)
-      !===============================================================================
-
-   end do
-
-   call bench%finalize()
+    use kinds
+    use formatmul, only: fmatmul => matmul ! renamed so plain matmul below remains the intrinsic (reference)
+    use forbenchmark
+
+    implicit none
+
+    type(benchmark)          :: bench
+    real(rk),    allocatable :: A(:,:), B(:,:), C(:,:)
+    integer(ik)              :: m, n, o, p
+    integer                  :: nl, seed_size, i
+    integer,     allocatable :: seed_array(:)
+    integer(ik), allocatable :: num_elements(:)
+
+    call random_seed(size = seed_size) ! seed length is processor-dependent
+    allocate(seed_array(seed_size))
+    seed_array = 123456789 ! fixed seed, re-applied by init_matrices before every method
+
+    call bench%init(9,'Benchmark matmul','benchmarks/matmul/results/matmul_mm', 100) ! 9 = number of methods timed below
+
+    num_elements = [500_ik, 1000_ik, 1500_ik, 2000_ik] ! square sizes p; each run uses m = n = o = p
+
+    do i = 1, size(num_elements)
+        p = num_elements(i)
+
+        ! C(m,o) = A(m,n).B(n,o)
+        m = p
+        n = p
+        o = p
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        ! Reference: intrinsic matmul
+        call bench%start_benchmark(1,'matmul',"C = matmul(A,B)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = matmul(A,B)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(2,'blas',"C = matmul(A,B,option='m2')",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m2')
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(3,'m1_b32',"C = matmul(A,B,option='m1',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m1',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(4,'m3_b32',"C = matmul(A,B,option='m3',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m3',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(5,'m4_b32',"C = matmul(A,B,option='m4',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m4',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(6,'m5_b32',"C = matmul(A,B,option='m5',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m5',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(7,'m6_b32',"C = matmul(A,B,option='m6',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m6',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(8,'m7_b32',"C = matmul(A,B,option='m7',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m7',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+        call init_matrices(m,n,o,seed_array,A,B,C)
+
+        !===============================================================================
+        call bench%start_benchmark(9,'m8_b32',"C = matmul(A,B,option='m8',nblock=32)",[m*n*o])
+        do nl = 1,bench%nloops
+            C = fmatmul(A,B,option='m8',nblock=32)
+            call prevent_optimization(C,nl) ! reads C so the product is not hoisted or elided
+        end do
+        call bench%stop_benchmark(cmp_gflops)
+        !===============================================================================
+
+    end do
+
+    call bench%finalize()
 
 contains
 
-   !===============================================================================
-   function cmp_gflops(argi,argr) result(gflops)
-      integer(ik), dimension(:), intent(in), optional :: argi
-      real(rk),    dimension(:), intent(in), optional :: argr
-      real(rk)                                        :: gflops
-
-      gflops = 2.0_rk*real(argi(1),kind=rk)*1.0e-9_rk
-   end function cmp_gflops
-   !===============================================================================
+    !===============================================================================
+    subroutine init_matrices(m,n,o,seed_array,A,B,C) ! fresh A(m,n), B(n,o), C(m,o); A and B refilled after re-seeding
+        integer(ik), intent(in) :: m,n,o ! extents used for A(m,n), B(n,o), C(m,o)
+        real(rk), allocatable, intent(inout) :: A(:,:), B(:,:), C(:,:) ! deallocated if allocated, then reallocated
+        integer, intent(in) :: seed_array(:) ! seed values handed to random_seed(put=...)
+
+        if (allocated(A)) deallocate(A)
+        if (allocated(B)) deallocate(B)
+        if (allocated(C)) deallocate(C)
+        allocate(A(m,n))
+        allocate(B(n,o))
+        allocate(C(m,o)) ! C is not filled here; only A and B receive values
+        call random_seed(put = seed_array) ! restart the generator: same seed and sizes give the same A and B
+        call random_number(A)
+        call random_number(B)
+    end subroutine init_matrices
+    !===============================================================================
+
+
+    !===============================================================================
+    function cmp_gflops(argi,argr) result(gflops) ! passed as the argument of bench%stop_benchmark
+        integer(ik), dimension(:), intent(in), optional :: argi ! NOTE(review): read below without a present() check
+        real(rk),    dimension(:), intent(in), optional :: argr ! not referenced in this function
+        real(rk)                                        :: gflops
+
+        gflops = 2.0_rk*real(argi(1),kind=rk)*1.0e-9_rk ! 2*argi(1)*1e-9; argi is presumably the [m*n*o] from start_benchmark
+    end function cmp_gflops
+    !===============================================================================
+
+
+    !===============================================================================
+    ! Intended to stop the compiler from hoisting or eliding the loop-invariant product: reads C(1,1) in a runtime test
+    subroutine prevent_optimization(C, nl)
+        real(rk), dimension(:,:), intent(in) :: C ! matrix whose first element is inspected
+        integer, intent(in)  :: nl ! loop counter; only used in the print
+        if (abs(C(1,1))<tiny(0.0_rk)) print*, nl, C(1,1) ! prints only if |C(1,1)| < smallest positive model number
+    end subroutine prevent_optimization
+    !===============================================================================
 
 end program benchmark_matmul_mm
diff --git a/benchmarks/matmul/results/export.py b/benchmarks/matmul/results/export.py
index d9cfb768..88bb6a10 100644
--- a/benchmarks/matmul/results/export.py
+++ b/benchmarks/matmul/results/export.py
@@ -18,7 +18,7 @@
 
 
 # Set plot settings
-fpd.set_plot_settings(fig_size=(12, 6), dpi=600, colormap='prism')
+fpd.set_plot_settings(fig_size=(6, 6), dpi=600, colormap='prism')
 
 # Plot the elapsed time
 fpd.plot_elapsed_time(file_path, benchmark_data, x_data,
@@ -36,12 +36,12 @@
                                    title='MatMul Benchmark - Speedup',
                                    xlabel='Number of Elements',
                                    ylabel='Speedup [-]',
-                                   bar_width=0.05)
+                                   bar_width=0.10)
 
 
 fpd.plot_speedup_avg(file_path, benchmark_data, x_data, 
                                    title='MatMul Benchmark - Average Speedup',
-                                   xlabel='Number of Elements',
+                                   xlabel='Methods',
                                    ylabel='Average Speedup [-]')
 
 fh.generate_html(file_path, benchmark_data,
diff --git a/fpm.rsp b/fpm.rsp
index 2c57110f..ed024535 100644
--- a/fpm.rsp
+++ b/fpm.rsp
@@ -79,42 +79,42 @@ options run --example
 options --target matmul_mm
 options --profile release
 options --compiler ifx
-options --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -DINT64"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -DUSE_DO_CONCURRENT -DINT64"
 
 @benchmark-matmul-mm-ifort
 options run --example
 options --target matmul_mm
 options --profile release
 options --compiler ifort
-options --flag --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -ipo -DINT64"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -ipo -DINT64"
 
 @benchmark-matmul-mm-gfortran
 options run --example
 options --target matmul_mm
 options --profile release
 options --compiler gfortran
-options --flag "-O3 -march=native -llapack -lblas -fopenmp -flto -DINT64"
+options --flag "-O3 -march=native -lblas -fopenmp -flto -DINT64"
 
 @benchmark-matmul-mm-nvfortran
 options run --example
 options --target matmul_mm
 options --profile release
 options --compiler nvfortran
-options --flag "-O3 -fast -march=native -mtune=native -stdpar=gpu,multicore -llapack -lblas -openmp -DINT64"
+options --flag "-O3 -fast -march=native -mtune=native -stdpar=gpu,multicore -lblas -openmp -DINT64"
 
 @benchmark-matmul-mm-ifx-coarray
 options run --example
 options --target matmul_mm_co
 options --profile release
 options --compiler ifx
-options --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -DINT64 -coarray -coarray-num-images=4 -DUSE_COARRAY"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -DUSE_DO_CONCURRENT -DINT64 -coarray -coarray-num-images=4 -DUSE_COARRAY"
 
 @benchmark-matmul-mm-ifort-coarray
 options run --example
 options --target matmul_mm_co
 options --profile release
 options --compiler ifort
-options --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -DINT64 -ipo -coarray -coarray-num-images=4 -DUSE_COARRAY"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -DINT64 -ipo -coarray -coarray-num-images=4 -DUSE_COARRAY"
 
 
 
@@ -124,39 +124,39 @@ options run --example
 options --target matmul_mv
 options --profile release
 options --compiler ifx
-options --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -DINT64"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -DUSE_DO_CONCURRENT -DINT64"
 
 @benchmark-matmul-mv-ifort
 options run --example
 options --target matmul_mv
 options --profile release
 options --compiler ifort
-options --flag --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -ipo -DINT64"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -ipo -DINT64"
 
 @benchmark-matmul-mv-gfortran
 options run --example
 options --target matmul_mv
 options --profile release
 options --compiler gfortran
-options --flag "-O3 -march=native -llapack -lblas -fopenmp -flto -DINT64"
+options --flag "-O3 -march=native -lblas -fopenmp -flto -DINT64"
 
 @benchmark-matmul-mv-nvfortran
 options run --example
 options --target matmul_mv
 options --profile release
 options --compiler nvfortran
-options --flag "-O3 -fast -march=native -mtune=native -stdpar=gpu,multicore -llapack -lblas -openmp -DINT64"
+options --flag "-O3 -fast -march=native -mtune=native -stdpar=gpu,multicore -lblas -openmp -DINT64"
 
 @benchmark-matmul-mv-ifx-coarray
 options run --example
 options --target matmul_mv_co
 options --profile release
 options --compiler ifx
-options --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -DINT64 -coarray -coarray-num-images=4 -DUSE_COARRAY"
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -DUSE_DO_CONCURRENT -DINT64 -coarray -coarray-num-images=4 -DUSE_COARRAY"
 
 @benchmark-matmul-mv-ifort-coarray
 options run --example
 options --target matmul_mv_co
 options --profile release
 options --compiler ifort
-options --flag "-O3 -mtune=native -xHost -llapack -lblas -qopenmp -DINT64 -ipo -coarray -coarray-num-images=4 -DUSE_COARRAY"
\ No newline at end of file
+options --flag "-O3 -mtune=native -xHost -lblas -qopenmp -DINT64 -ipo -coarray -coarray-num-images=4 -DUSE_COARRAY"
\ No newline at end of file