
fm: factor out redist info and print for later retrieval

dev-zero committed Nov 27, 2019
1 parent 32e6c13 commit cfa49cadead9fb1df4185a45b675ede839afcff1
Showing with 83 additions and 50 deletions.
+83 −50 src/fm/cp_fm_diag_utils.F
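
This commit turns the redistribution bookkeeping into a public derived type (cp_fm_redistribute_info) with a type-bound write procedure, and exposes it through a new optional INTENT(OUT) argument of cp_fm_redistribute_start, so a caller can retrieve the summary and print it later instead of relying on the immediate printout. A minimal usage sketch (my_matrix, my_evecs and out_unit are placeholder names, not part of this commit):

TYPE(cp_fm_redistribute_info) :: rdinfo
TYPE(cp_fm_type), POINTER     :: matrix_new, eigenvectors_new

! Capture the redistribution summary while setting up the (possibly) redistributed matrices
CALL cp_fm_redistribute_start(my_matrix, my_evecs, matrix_new, eigenvectors_new, &
                              caller_is_elpa=.TRUE., redist_info=rdinfo)
! ... diagonalization of matrix_new goes here ...
IF (out_unit > 0) CALL rdinfo%write(out_unit)  ! prints the CP_FM_DIAG| summary lines shown in the diff below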
@@ -42,6 +42,18 @@ MODULE cp_fm_diag_utils

CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_diag_utils'

+ ! Information on redistribution
+ TYPE, PUBLIC :: cp_fm_redistribute_info
+ INTEGER :: matrix_order
+ INTEGER :: num_pe_old ! number of processes before a potential redistribute
+ INTEGER :: num_pe_new ! number of processes after a potential redistribute
+ INTEGER :: num_pe_opt ! optimal number of processes for the given matrix
+ INTEGER :: num_pe_max_nz_col ! the maximal number of processes s.t. no column has zero width, may be < 0 if ignored
+ LOGICAL :: redistribute ! whether or not the matrix was actually redistributed
+ CONTAINS
+ PROCEDURE, PASS(self) :: write => cp_fm_redistribute_info_write
+ END TYPE
+
! Container for redistribution settings and temporary work structs
TYPE cp_fm_redistribute_type
! Settings
@@ -68,6 +80,31 @@ MODULE cp_fm_diag_utils

CONTAINS

+ ! **************************************************************************************************
+ !> \brief Write the redistribute info nicely formatted to the given I/O unit
+ !> \param self reference to the cp_fm_redistribute_info instance
+ !> \param io_unit I/O unit to use for writing
+ ! **************************************************************************************************
+ SUBROUTINE cp_fm_redistribute_info_write(self, io_unit)
+ CLASS(cp_fm_redistribute_info), INTENT(IN) :: self
+ INTEGER, INTENT(IN) :: io_unit
+
+ WRITE (io_unit, '(/,A,I10)') "CP_FM_DIAG| Matrix order: ", self%matrix_order
+ WRITE (io_unit, '(A,I6,A)') "CP_FM_DIAG| Matrix distributed on ", self%num_pe_old, " processes"
+ WRITE (io_unit, '(A,I5)') "CP_FM_DIAG| Optimal number of CPUs: ", self%num_pe_opt
+ IF (self%num_pe_max_nz_col < 0) THEN
+ WRITE (io_unit, '(A,A)') "CP_FM_DIAG| Max number of CPUs (with non-zero columns): ", "<N/A>"
+ ELSE
+ WRITE (io_unit, '(A,I5)') "CP_FM_DIAG| Max number of CPUs (with non-zero columns): ", self%num_pe_max_nz_col
+ END IF
+ IF (self%redistribute) THEN
+ WRITE (io_unit, '(A,I5,A)') "CP_FM_DIAG| The matrix will be redistributed onto ", self%num_pe_new, " processes"
+ ELSE
+ WRITE (io_unit, '(A)') "CP_FM_DIAG| The matrix will NOT be redistributed"
+ END IF
+ WRITE (io_unit, '(A)') " "
+ END SUBROUTINE
+
! **************************************************************************************************
!> \brief Initializes temporary storage needed when redistributing arrays
!> \author Nico Holmberg [01.2018]
@@ -141,8 +178,9 @@ END SUBROUTINE cp_fm_redistribute_init
!> \return the optimal number of CPUs
!> \author Nico Holmberg [01.2018]
! **************************************************************************************************
- FUNCTION cp_fm_diag_get_optimal_ncpu(size) RESULT(ncpu)
- INTEGER :: size, ncpu
+ PURE FUNCTION cp_fm_diag_get_optimal_ncpu(size) RESULT(ncpu)
+ INTEGER, INTENT(IN) :: size
+ INTEGER :: ncpu

CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_diag_get_optimal_ncpu', &
routineP = moduleN//':'//routineN
@@ -153,17 +191,17 @@ FUNCTION cp_fm_diag_get_optimal_ncpu(size) RESULT(ncpu)
END FUNCTION cp_fm_diag_get_optimal_ncpu

! **************************************************************************************************
- !> \brief Determines the largest number of CPUs a matrix can be distributed on when the matrix is
- !> diagonalized with the ELPA library.
+ !> \brief Determines the largest number of CPUs a matrix can be distributed on without any of the
+ !> processors getting a zero-width column (currently only needed for ELPA).
!> \param matrix the matrix that will be diagonalized
!> \return the maximum number of CPUs for ELPA
!> \author Nico Holmberg [01.2018]
! **************************************************************************************************
- FUNCTION cp_fm_elpa_get_max_ncpu(matrix) RESULT(ncpu)
+ FUNCTION cp_fm_max_ncpu_non_zero_column(matrix) RESULT(ncpu)
TYPE(cp_fm_type), POINTER :: matrix
INTEGER :: ncpu

- CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_elpa_get_max_ncpu', &
+ CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_max_ncpu_non_zero_column', &
routineP = moduleN//':'//routineN

#if defined(__SCALAPACK)
@@ -228,7 +266,7 @@ FUNCTION cp_fm_elpa_get_max_ncpu(matrix) RESULT(ncpu)
CPABORT("Routine called in non-parallel case.")
#endif

- END FUNCTION cp_fm_elpa_get_max_ncpu
+ END FUNCTION cp_fm_max_ncpu_non_zero_column

! **************************************************************************************************
!> \brief Determines the optimal number of CPUs for matrix diagonalization and redistributes
@@ -240,12 +278,13 @@ END FUNCTION cp_fm_elpa_get_max_ncpu
!> \param eigenvectors_new the redistributed eigenvectors matrix, or a pointer to the original
!> matrix if no redistribution is required
!> \param caller_is_elpa flag that determines if ELPA is used for diagonalization
+ !> \param redist_info get info about the redistribution
!> \par History
!> - [01.2018] created by moving redistribution related code from cp_fm_syevd here
!> \author Nico Holmberg [01.2018]
! **************************************************************************************************
SUBROUTINE cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, &
- caller_is_elpa)
+ caller_is_elpa, redist_info)

TYPE(cp_fm_type), POINTER :: matrix, eigenvectors
TYPE(cp_fm_type), POINTER, INTENT(OUT) :: matrix_new, eigenvectors_new
@@ -256,16 +295,16 @@ SUBROUTINE cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvecto

INTEGER :: handle
LOGICAL :: is_elpa
+ TYPE(cp_fm_redistribute_info), OPTIONAL, INTENT(OUT) :: redist_info

#if defined(__SCALAPACK)
REAL(KIND=dp) :: fake_local_data(1, 1)
INTEGER :: fake_descriptor(9), mepos_old, &
- io_unit, n, ngroups, num_pe_new, &
- num_pe_old, ncpu_max_elpa, num_opt
- LOGICAL :: do_redistribute
+ io_unit, ngroups
TYPE(cp_fm_struct_type), POINTER :: fm_struct_new
TYPE(cp_para_env_type), POINTER :: para_env
TYPE(cp_logger_type), POINTER :: logger
+ TYPE(cp_fm_redistribute_info) :: rdinfo
#endif

CALL timeset(routineN, handle)
@@ -284,59 +323,49 @@ SUBROUTINE cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvecto
NULLIFY (eigenvectors_new)
logger => cp_get_default_logger()
io_unit = cp_logger_get_default_io_unit(logger)
- n = matrix%matrix_struct%nrow_global
- do_redistribute = .FALSE.

! first figure out the optimal number of cpus
! this is pure heuristics, the defaults are based on rosa timings
! that demonstrate that timings go up sharply if too many tasks are used
! we take a multiple of 4, and approximately n/60
para_env => matrix%matrix_struct%para_env
- num_pe_old = para_env%num_pe
mepos_old = para_env%mepos
- num_opt = cp_fm_diag_get_optimal_ncpu(n)
- num_pe_new = num_opt

+ rdinfo%matrix_order = matrix%matrix_struct%nrow_global
+ rdinfo%num_pe_old = para_env%num_pe
+ rdinfo%num_pe_opt = cp_fm_diag_get_optimal_ncpu(rdinfo%matrix_order)
+ rdinfo%num_pe_new = rdinfo%num_pe_opt
+ rdinfo%num_pe_max_nz_col = -1
+ rdinfo%redistribute = .FALSE.

IF (.NOT. is_elpa) THEN
- do_redistribute = (num_pe_new < num_pe_old)
+ rdinfo%redistribute = (rdinfo%num_pe_new < rdinfo%num_pe_old)
ELSE
! Diagonalization with ELPA fails when a processor column has zero width
- ! Determine the maximum number of CPUs the matrix can be distributed on for ELPA
- ncpu_max_elpa = cp_fm_elpa_get_max_ncpu(matrix)
- IF (ncpu_max_elpa .LT. num_pe_old) THEN
- ! Must redistribute to avoid crash
- num_pe_new = ncpu_max_elpa
- do_redistribute = .TRUE.
- END IF
- IF (work_redistribute%elpa_force_redistribute .AND. num_opt .LT. ncpu_max_elpa) THEN
- ! Use heuristics to determine the need for redistribution (when num_opt is smaller than the safe maximum)
- num_pe_new = num_opt
- do_redistribute = (num_pe_new < num_pe_old)
+ ! Determine the maximum number of CPUs the matrix can be distributed without zero-width columns
+ rdinfo%num_pe_max_nz_col = cp_fm_max_ncpu_non_zero_column(matrix)
+ IF (rdinfo%num_pe_old > rdinfo%num_pe_max_nz_col) THEN
+ ! Must redistribute to avoid crash if we exceed the max number of processors
+ rdinfo%num_pe_new = rdinfo%num_pe_max_nz_col
+ rdinfo%redistribute = .TRUE.
+ END IF
END IF

- IF (work_redistribute%should_print .AND. io_unit > 0) THEN
- WRITE (io_unit, '(/,A,I10)') "CP_FM_DIAG| Matrix order: ", n
- WRITE (io_unit, '(A,I6,A)') "CP_FM_DIAG| Matrix distributed on ", num_pe_old, " processes"
- WRITE (io_unit, '(A,I5)') "CP_FM_DIAG| Optimal number of CPUs: ", num_opt
- IF (is_elpa) THEN
- WRITE (io_unit, '(A,I5)') "CP_FM_DIAG| Max number of CPUs (ELPA):", ncpu_max_elpa
- WRITE (io_unit, '(A,L5)') "CP_FM_DIAG| Force redistribute (ELPA):", work_redistribute%elpa_force_redistribute
- END IF
- IF (do_redistribute) THEN
- WRITE (io_unit, '(A,I5,A)') "CP_FM_DIAG| The matrix will be redistributed onto ", num_pe_new, " processes"
- ELSE
- WRITE (io_unit, '(A)') "CP_FM_DIAG| The matrix will NOT be redistributed"
+ IF (work_redistribute%elpa_force_redistribute .AND. rdinfo%num_pe_opt < rdinfo%num_pe_max_nz_col) THEN
+ ! Use heuristics to determine the need for redistribution (when num_pe_opt is smaller than the safe maximum)
+ rdinfo%num_pe_new = rdinfo%num_pe_opt
+ rdinfo%redistribute = (rdinfo%num_pe_old > rdinfo%num_pe_new)
END IF
- WRITE (io_unit, '(A)') " "
END IF

- ! if the optimal is smaller than num_pe, we will redistribute the input matrix
- IF (do_redistribute) THEN
+ IF (work_redistribute%should_print .AND. io_unit > 0) &
+ CALL rdinfo%write(io_unit)
+
+ ! if the optimal is smaller than num_pe, we will redistribute the input matrix
+ IF (rdinfo%redistribute) THEN
! split comm, the first num_pe_new tasks will do the work
- ALLOCATE (work_redistribute%group_distribution(0:num_pe_old - 1))
+ ALLOCATE (work_redistribute%group_distribution(0:rdinfo%num_pe_old - 1))
ALLOCATE (work_redistribute%group_partition(0:1))
- work_redistribute%group_partition = (/num_pe_new, num_pe_old - num_pe_new/)
+ work_redistribute%group_partition = (/rdinfo%num_pe_new, rdinfo%num_pe_old - rdinfo%num_pe_new/)
CALL mp_comm_split(comm=para_env%group, sub_comm=work_redistribute%subgroup, &
ngroups=ngroups, group_distribution=work_redistribute%group_distribution, &
n_subgroups=2, group_partition=work_redistribute%group_partition)
@@ -358,37 +387,41 @@ SUBROUTINE cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvecto
CALL cp_fm_struct_create(fmstruct=fm_struct_new, &
para_env=work_redistribute%para_env_new, &
context=work_redistribute%blacs_env_new, &
- nrow_global=n, ncol_global=n)
+ nrow_global=rdinfo%matrix_order, ncol_global=rdinfo%matrix_order)
CALL cp_fm_create(matrix_new, matrix_struct=fm_struct_new, name="yevd_new_mat")
CALL cp_fm_create(eigenvectors_new, matrix_struct=fm_struct_new, name="yevd_new_vec")
CALL cp_fm_struct_release(fm_struct_new)

! redistribute old
- CALL pdgemr2d(n, n, matrix%local_data(1, 1), 1, 1, matrix%matrix_struct%descriptor, &
+ CALL pdgemr2d(rdinfo%matrix_order, rdinfo%matrix_order, matrix%local_data(1, 1), 1, 1, &
+ matrix%matrix_struct%descriptor, &
matrix_new%local_data(1, 1), 1, 1, matrix_new%matrix_struct%descriptor, &
matrix%matrix_struct%context%group)
ELSE
! these tasks must help redistribute (they own part of the data),
! but need fake 'new' data, and their descriptor must indicate this with -1
! see also scalapack comments on pdgemr2d
fake_descriptor = -1
- CALL pdgemr2d(n, n, matrix%local_data(1, 1), 1, 1, matrix%matrix_struct%descriptor, &
+ CALL pdgemr2d(rdinfo%matrix_order, rdinfo%matrix_order, matrix%local_data(1, 1), 1, 1, &
+ matrix%matrix_struct%descriptor, &
fake_local_data(1, 1), 1, 1, fake_descriptor, &
matrix%matrix_struct%context%group)
ENDIF
ELSE
! No need to redistribute, just return pointers to the original arrays
matrix_new => matrix
eigenvectors_new => eigenvectors

ENDIF

+ IF (PRESENT(redist_info)) &
+ redist_info = rdinfo
#else

MARK_USED(matrix)
MARK_USED(eigenvectors)
MARK_USED(matrix_new)
MARK_USED(eigenvectors_new)
+ MARK_USED(redist_info)
CPABORT("Routine called in non-parallel case.")
#endif
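
For context on the rename to cp_fm_max_ncpu_non_zero_column (and the num_pe_max_nz_col field above): in a ScaLAPACK block-cyclic layout a process column owns no matrix columns at all when the grid has more process columns than the matrix has column blocks, which is what makes ELPA fail. The sketch below only restates that condition; the actual routine (its body is not part of this hunk) works from the matrix's BLACS grid and block sizes, and n, nb and max_npcol_no_empty_col are illustrative names only.

! Illustration only: the block-cyclic condition behind "non-zero column width".
! Column block b (0-based) is owned by process column MOD(b, npcol); with
! CEILING(n/nb) column blocks in total, every process column owns at least
! one block only if npcol <= CEILING(n/nb).
PURE FUNCTION max_npcol_no_empty_col(n, nb) RESULT(npcol_max)
   INTEGER, INTENT(IN) :: n, nb   ! matrix order and column block size
   INTEGER             :: npcol_max

   npcol_max = (n + nb - 1)/nb    ! integer form of CEILING(n/nb)
END FUNCTION max_npcol_no_empty_col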
