From 3a81a51178b96c1cf9c46b870240ff5a6becef92 Mon Sep 17 00:00:00 2001 From: Seyed Ali Ghasemi Date: Wed, 31 Jan 2024 11:24:57 +0100 Subject: [PATCH] Update matmul_mm --- benchmarks/matmul/README.md | 35 +-- benchmarks/matmul/matmul_mm.f90 | 322 +++++++++++++++------------- benchmarks/matmul/results/export.py | 6 +- fpm.rsp | 24 +-- 4 files changed, 204 insertions(+), 183 deletions(-) diff --git a/benchmarks/matmul/README.md b/benchmarks/matmul/README.md index e7b0b6b6..d863fa61 100644 --- a/benchmarks/matmul/README.md +++ b/benchmarks/matmul/README.md @@ -65,70 +65,73 @@ cd ../.. ## matmul (matrix-matrix) +**TODO**: +- Generate results. + ### Intel Fortran Compiler (ifx) -| Elapsed Time | Performance | Speedup | + ### Intel Fortran Compiler Classic (ifort) -| Elapsed Time | Performance | Speedup | + ### GNU Fortran (gfortran) -| Elapsed Time | Performance | Speedup | + ### NVIDIA HPC (nvfortran) -| Elapsed Time | Performance | Speedup | + ## matmul (matrix-vector) ### Intel Fortran Compiler (ifx) -| Elapsed Time | Performance | Speedup | + ### Intel Fortran Compiler Classic (ifort) -| Elapsed Time | Performance | Speedup | + ### GNU Fortran (gfortran) -| Elapsed Time | Performance | Speedup | + ### NVIDIA HPC (nvfortran) -| Elapsed Time | Performance | Speedup | + \ No newline at end of file diff --git a/benchmarks/matmul/matmul_mm.f90 b/benchmarks/matmul/matmul_mm.f90 index 147bc60e..2769c5f5 100644 --- a/benchmarks/matmul/matmul_mm.f90 +++ b/benchmarks/matmul/matmul_mm.f90 @@ -1,159 +1,177 @@ program benchmark_matmul_mm - use kinds - use formatmul - use forbenchmark - - implicit none - - type(benchmark) :: bench - real(rk), allocatable :: A(:,:), B(:,:), C(:,:) - integer(ik) :: m, n, o, p - integer :: nl - - call bench%init(12,'Benchmark matmul','benchmarks/matmul/results/matmul_mm', 10) - - do p = 250_ik,1500_ik,250_ik - - !=============================================================================== - ! C(m,o) = A(m,n).B(n,o) - m = p - n = p - o = p - - if (allocated(A)) deallocate(A) - if (allocated(B)) deallocate(B) - if (allocated(C)) deallocate(C) - allocate(A(m,n)) - allocate(B(n,o)) - allocate(C(m,o)) - call random_number(A) - call random_number(B) - !=============================================================================== - - - !=============================================================================== - ! Reference - call bench%start_benchmark(1,'matmul',"C = matmul(A,B)",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B) - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(2,'m1',"C = matmul(A,B,option='m1')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m1') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(3,'m2',"C = matmul(A,B,option='m2')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m2') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(4,'m3',"C = matmul(A,B,option='m3')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m3') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(5,'m4',"C = matmul(A,B,option='m4')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m4') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(6,'m5',"C = matmul(A,B,option='m5')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m5') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(7,'m6',"C = matmul(A,B,option='m6')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m6') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(8,'m7',"C = matmul(A,B,option='m7')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m7') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(9,'m8',"C = matmul(A,B,option='m8')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m8') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(10,'m9',"C = matmul(A,B,option='m9')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m9') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(11,'m12',"C = matmul(A,B,option='m12')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m12') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - - !=============================================================================== - call bench%start_benchmark(12,'m13',"C = matmul(A,B,option='m13')",[m*n*o]) - do nl = 1,bench%nloops - C = matmul(A,B,option='m13') - end do - call bench%stop_benchmark(cmp_gflops) - !=============================================================================== - - end do - - call bench%finalize() + use kinds + use formatmul, only: fmatmul => matmul ! use fmatmul instead of matmul to avoid overloading for reference implementation + use forbenchmark + + implicit none + + type(benchmark) :: bench + real(rk), allocatable :: A(:,:), B(:,:), C(:,:) + integer(ik) :: m, n, o, p + integer :: nl, seed_size, i, imark + integer, allocatable :: seed_array(:) + integer(ik), allocatable :: num_elements(:) + + call random_seed(size = seed_size) + allocate(seed_array(seed_size)) + seed_array = 123456789 + + call bench%init(9,'Benchmark matmul','benchmarks/matmul/results/matmul_mm', 100) + + num_elements = [500_ik, 1000_ik, 1500_ik, 2000_ik] + + do i = 1, size(num_elements) + p = num_elements(i) + + ! C(m,o) = A(m,n).B(n,o) + m = p + n = p + o = p + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + ! Reference + call bench%start_benchmark(1,'matmul',"C = matmul(A,B)",[m*n*o]) + do nl = 1,bench%nloops + C = matmul(A,B) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(2,'blas',"C = matmul(A,B,option='m2')",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m2') + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(3,'m1_b32',"C = matmul(A,B,option='m1',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m1',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(4,'m3_b32',"C = matmul(A,B,option='m3',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m3',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(5,'m4_b32',"C = matmul(A,B,option='m4',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m4',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(6,'m5_b32',"C = matmul(A,B,option='m5',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m5',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(7,'m6_b32',"C = matmul(A,B,option='m6',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m6',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(8,'m7_b32',"C = matmul(A,B,option='m7',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m7',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + call init_matrices(m,n,o,seed_array,A,B,C) + + !=============================================================================== + call bench%start_benchmark(9,'m8_b32',"C = matmul(A,B,option='m8',nblock=32)",[m*n*o]) + do nl = 1,bench%nloops + C = fmatmul(A,B,option='m8',nblock=32) + call prevent_optimization(C,nl) ! loop-invariant + end do + call bench%stop_benchmark(cmp_gflops) + !=============================================================================== + + end do + + call bench%finalize() contains - !=============================================================================== - function cmp_gflops(argi,argr) result(gflops) - integer(ik), dimension(:), intent(in), optional :: argi - real(rk), dimension(:), intent(in), optional :: argr - real(rk) :: gflops - - gflops = 2.0_rk*real(argi(1),kind=rk)*1.0e-9_rk - end function cmp_gflops - !=============================================================================== + !=============================================================================== + subroutine init_matrices(m,n,o,seed_array,A,B,C) + integer(ik), intent(in) :: m,n,o + real(rk), allocatable, intent(inout) :: A(:,:), B(:,:), C(:,:) + integer, intent(in) :: seed_array(:) + + if (allocated(A)) deallocate(A) + if (allocated(B)) deallocate(B) + if (allocated(C)) deallocate(C) + allocate(A(m,n)) + allocate(B(n,o)) + allocate(C(m,o)) + call random_seed(put = seed_array) + call random_number(A) + call random_number(B) + end subroutine init_matrices + !=============================================================================== + + + !=============================================================================== + function cmp_gflops(argi,argr) result(gflops) + integer(ik), dimension(:), intent(in), optional :: argi + real(rk), dimension(:), intent(in), optional :: argr + real(rk) :: gflops + + gflops = 2.0_rk*real(argi(1),kind=rk)*1.0e-9_rk + end function cmp_gflops + !=============================================================================== + + + !=============================================================================== + ! to prevent compiler from optimizing (loop-invariant) + subroutine prevent_optimization(C, nl) + real(rk), dimension(:,:), intent(in) :: C + integer, intent(in) :: nl + if (abs(C(1,1))