From 1d923e0e071bf7bb0e2593cc0d12dcf5a22e87a9 Mon Sep 17 00:00:00 2001 From: Alfio Lazzaro Date: Fri, 5 Oct 2018 17:47:39 +0200 Subject: [PATCH] Exclude transpose for B matrix for the opt rect algo --- Makefile | 6 ++++++ src/mm/dbcsr_mm.F | 55 ++++++++++++++++++++++++----------------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 9b78b196a69..7cdac6c99ce 100644 --- a/Makefile +++ b/Makefile @@ -246,10 +246,16 @@ OTHER_HELP += "install : Install the library and modules under PREFIX=>{M, N}, and it is a variant - ! of the CARMA algorithm. - ! - ! -------- ----- - ! | | x | | - ! -------- | | - ! | | - ! ----- - ! - use_rect_algo = .FALSE. - IF (nprows .EQ. numnodes .AND. npcols .EQ. 1 .AND. & - transa .EQ. dbcsr_transpose .AND. & - (.NOT. product_reindex) .AND. & - (.NOT. keep_sparsity) .AND. & - (.NOT. dbcsr_cfg%use_mpi_rma) .AND. & - dbcsr_nfullrows_total(matrix_a) .GT. dbcsr_nfullrows_total(matrix_c) .AND. & - dbcsr_nfullrows_total(matrix_a) .GT. dbcsr_nfullcols_total(matrix_c)) THEN - use_rect_algo = .TRUE. - ENDIF ! check parameters --------------------------------------------------------- transa_l = transa @@ -554,6 +529,32 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & CALL dbcsr_print(matrix_c, nodata=.TRUE.) ENDIF ENDIF + ! + ! Introducing an optimized algorithm for a 1D grid, + ! where all ranks are organized over row processors. + ! The left matrix must be distributed over row-processors, + ! i.e. it is transposed. + ! This algorithm is particularly beneficial for rectangular + ! matrix multiplications, where K>>{M, N}, and it is a variant + ! of the CARMA algorithm. + ! + ! -------- ----- + ! | | x | | + ! -------- | | + ! | | + ! ----- + ! + use_rect_algo = .FALSE. + IF (nprows .EQ. numnodes .AND. npcols .EQ. 1 .AND. & + transa_l .EQ. dbcsr_transpose .AND. & + transb_l .EQ. dbcsr_no_transpose .AND. & + (.NOT. product_reindex) .AND. & + (.NOT. keep_sparsity) .AND. & + (.NOT. 
dbcsr_cfg%use_mpi_rma) .AND. & + dbcsr_nfullrows_total(matrix_a) .GT. dbcsr_nfullrows_total(matrix_c) .AND. & + dbcsr_nfullrows_total(matrix_a) .GT. dbcsr_nfullcols_total(matrix_c)) THEN + use_rect_algo = .TRUE. + ENDIF ! transpose/conjg left and/or right matrices if needed SELECT CASE (transa_l) @@ -1009,8 +1010,8 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & IF (use_rect_algo) THEN matrix_c_local = dbcsr_type() IF (use_dense_mult) THEN - ALLOCATE (dist_rows(matrix_a%nblkcols_total), & - dist_cols(matrix_b%nblkcols_total)) + ALLOCATE (dist_rows(matrix_left%nblkrows_total), & + dist_cols(matrix_right%nblkcols_total)) dist_rows(:) = mynode dist_cols(:) = 0 CALL dbcsr_distribution_new(local_distribution, mp_obj, &