Skip to content

Commit

Permalink
Offload: Add timing ranges and mem_info
Browse files Browse the repository at this point in the history
  • Loading branch information
oschuett committed Jul 28, 2021
1 parent bec3d8f commit e62d1b4
Show file tree
Hide file tree
Showing 12 changed files with 204 additions and 253 deletions.
4 changes: 3 additions & 1 deletion INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ the FFTW3 threading library libfftw3_threads (or libfftw3_omp) is required.
- CUFFT 7.0 has a known bug and is therefore disabled by default.
NVIDIA's webpage lists a patch (an upgraded version of cufft, i.e. >= 7.0.35) -
use this together with `-D__HAS_PATCHED_CUFFT_70`.
- Use `-D__CUDA_PROFILING` to turn on Nvidia Tools Extensions.
- Use `-D__OFFLOAD_PROFILING` to turn on Nvidia Tools Extensions.
It requires linking with `-lnvToolsExt`.
- Link to a blas/scalapack library that accelerates large DGEMMs (e.g. libsci_acc)
- Use the `-D__GRID_CUDA` to compile the GPU and HYBRID backends for the grid library.
Expand Down Expand Up @@ -340,6 +340,8 @@ out of the box on Nvidia hardware as well.
and set the `OFFLOAD_FLAGS` with right `nvcc` parameters (see the cuda section
of this document). The environment variable `HIP_PLATFORM` should be set to
`HIP_PLATFORM=nvidia` to indicate to hipcc to use the nvcc compiler instead.
- Use `-D__OFFLOAD_PROFILING` to turn on the AMD ROC TX and Tracer library.
It requires linking with `-lroctx64 -lroctracer64`.

<!---
### 2u. LibMaxwell (External Maxwell Solver)
Expand Down
2 changes: 1 addition & 1 deletion src/common/PACKAGE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"description": "Basic routines which are conceptually independent from CP2K and its input",
"requires": ["../mpiwrap", "../base"],
"requires": ["../mpiwrap", "../base", "../offload"],
}
66 changes: 0 additions & 66 deletions src/common/cuda_nvtx_cu.cu

This file was deleted.

141 changes: 0 additions & 141 deletions src/common/cuda_profiling.F

This file was deleted.

18 changes: 7 additions & 11 deletions src/common/timings.F
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ MODULE timings
callgraph_item_type,&
callgraph_items,&
callgraph_set
USE cuda_profiling, ONLY: cuda_mem_info,&
cuda_nvtx_range_pop,&
cuda_nvtx_range_push
USE kinds, ONLY: default_string_length,&
dp,&
int_8
Expand All @@ -36,6 +33,9 @@ MODULE timings
m_flush,&
m_memory,&
m_walltime
USE offload_api, ONLY: offload_mem_info,&
offload_timeset,&
offload_timestop
USE routine_map, ONLY: routine_map_destroy,&
routine_map_get,&
routine_map_init,&
Expand Down Expand Up @@ -264,7 +264,7 @@ SUBROUTINE timeset_handler(routineN, handle)
WRITE (sformat, *) "(A,A,", MAX(1, 3*stack_size - 4), "X,I4,1X,I6,1X,A,A)"
WRITE (mystring, sformat) timer_env%trace_str, ">>", stack_size + 1, &
r_stat%total_calls, TRIM(r_stat%routineN), " start"
CALL cuda_mem_info(gpumem_free, gpumem_total)
CALL offload_mem_info(gpumem_free, gpumem_total)
CALL m_memory(cpumem)
WRITE (line, '(A,A,I0,A,A,I0,A)') TRIM(mystring), &
" Hostmem: ", (cpumem + 1024*1024 - 1)/(1024*1024), " MB", &
Expand All @@ -275,9 +275,7 @@ SUBROUTINE timeset_handler(routineN, handle)
handle = routine_id
#if defined( __CUDA_PROFILING )
CALL cuda_nvtx_range_push(routineN)
#endif
CALL offload_timeset(routineN)
!$OMP END CRITICAL(time_setstop)
!$OMP END MASTER
Expand Down Expand Up @@ -310,9 +308,7 @@ SUBROUTINE timestop_handler(handle)
!$OMP MASTER
!$OMP CRITICAL(time_setstop)
#if defined( __CUDA_PROFILING )
CALL cuda_nvtx_range_pop()
#endif
CALL offload_timestop()
timer_env => list_peek(timers_stack)
cs_entry = list_pop(timer_env%callstack)
Expand Down Expand Up @@ -375,7 +371,7 @@ SUBROUTINE timestop_handler(handle)
WRITE (sformat, *) "(A,A,", MAX(1, 3*stack_size - 4), "X,I4,1X,I6,1X,A,F12.3)"
WRITE (mystring, sformat) timer_env%trace_str, "<<", stack_size + 1, &
r_stat%total_calls, TRIM(r_stat%routineN), wt_elapsed
CALL cuda_mem_info(gpumem_free, gpumem_total)
CALL offload_mem_info(gpumem_free, gpumem_total)
CALL m_memory(cpumem)
WRITE (line, '(A,A,I0,A,A,I0,A)') TRIM(mystring), &
" Hostmem: ", (cpumem + 1024*1024 - 1)/(1024*1024), " MB", &
Expand Down
6 changes: 3 additions & 3 deletions src/cp2k_info.F
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,6 @@ FUNCTION cp2k_flags() RESULT(flags)
#if defined(__CRAY_PM_FAKE_ENERGY)
flags = TRIM(flags)//" cray_pm_fake_energy"
#endif
#if defined(__CUDA_PROFILING)
flags = TRIM(flags)//" cuda_profiling"
#endif
#if defined(__DBCSR_ACC)
flags = TRIM(flags)//" dbcsr_acc"
#endif
Expand Down Expand Up @@ -292,6 +289,9 @@ FUNCTION cp2k_flags() RESULT(flags)
#if defined(__GRID_HIP)
flags = TRIM(flags)//" grid_hip"
#endif
#if defined(__OFFLOAD_PROFILING)
flags = TRIM(flags)//" offload_profiling"
#endif

END FUNCTION cp2k_flags

Expand Down
63 changes: 61 additions & 2 deletions src/offload/offload_api.F
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@
! **************************************************************************************************
MODULE offload_api
USE ISO_C_BINDING, ONLY: C_ASSOCIATED,&
C_CHAR,&
C_F_POINTER,&
C_INT,&
C_NULL_CHAR,&
C_NULL_PTR,&
C_PTR
USE kinds, ONLY: dp
C_PTR,&
C_SIZE_T
USE kinds, ONLY: dp,&
int_8
#include "../base/base_uses.f90"

IMPLICIT NONE
Expand All @@ -26,6 +30,7 @@ MODULE offload_api

PUBLIC :: offload_get_device_count
PUBLIC :: offload_set_device_id, offload_get_device_id, offload_set_device
PUBLIC :: offload_timeset, offload_timestop, offload_mem_info
PUBLIC :: offload_buffer_type, offload_create_buffer, offload_free_buffer

TYPE offload_buffer_type
Expand Down Expand Up @@ -114,6 +119,60 @@ END SUBROUTINE offload_set_device_c

END SUBROUTINE offload_set_device

! **************************************************************************************************
!> \brief Opens a named timing range by handing the name to the C routine "offload_timeset".
!> \param routineN label of the range; trailing blanks are stripped and a C null terminator
!>                 is appended before the label is passed on
!> \author Ole Schuett
! **************************************************************************************************
SUBROUTINE offload_timeset(routineN)
   CHARACTER(LEN=*), INTENT(IN) :: routineN

   ! Holds the blank-trimmed label plus one extra character for the terminating NUL.
   CHARACTER(KIND=C_CHAR, LEN=LEN_TRIM(routineN) + 1) :: c_label

   INTERFACE
      SUBROUTINE offload_timeset_c(message) BIND(C, name="offload_timeset")
         IMPORT :: C_CHAR
         CHARACTER(kind=C_CHAR), DIMENSION(*), INTENT(IN) :: message
      END SUBROUTINE offload_timeset_c
   END INTERFACE

   ! C expects a null-terminated string, Fortran strings are blank padded.
   c_label = routineN(1:LEN_TRIM(routineN))//C_NULL_CHAR

   CALL offload_timeset_c(c_label)

END SUBROUTINE offload_timeset

! **************************************************************************************************
!> \brief Closes a timing range by calling the C routine "offload_timestop".
!>        Takes no arguments; the call is forwarded unchanged.
!> \author Ole Schuett
! **************************************************************************************************
SUBROUTINE offload_timestop()

   INTERFACE
      ! Binding to the C symbol "offload_timestop"; it takes no arguments.
      SUBROUTINE c_offload_timestop() BIND(C, name="offload_timestop")
      END SUBROUTINE c_offload_timestop
   END INTERFACE

   CALL c_offload_timestop()

END SUBROUTINE offload_timestop

! **************************************************************************************************
!> \brief Gets free and total device memory by querying the C routine "offload_mem_info".
!> \param free amount of free device memory as reported by the C routine
!> \param total total amount of device memory as reported by the C routine
!> \note The C side works with size_t. The values are received in C_SIZE_T variables and
!>       converted explicitly, so the routine does not rely on C_SIZE_T and int_8 being the
!>       same integer kind (they differ where size_t is 32 bit wide).
!> \author Ole Schuett
! **************************************************************************************************
SUBROUTINE offload_mem_info(free, total)
   INTEGER(KIND=int_8), INTENT(OUT) :: free, total

   ! Receive buffers with exactly the kind the C interface expects.
   INTEGER(KIND=C_SIZE_T) :: c_free, c_total

   INTERFACE
      SUBROUTINE offload_mem_info_c(free, total) BIND(C, name="offload_mem_info")
         IMPORT :: C_SIZE_T
         ! NOTE(review): INTENT(OUT) assumes the C routine always sets both values,
         ! mirroring the contract of the Fortran wrapper - confirm against the C source.
         INTEGER(KIND=C_SIZE_T), INTENT(OUT) :: free, total
      END SUBROUTINE offload_mem_info_c
   END INTERFACE

   CALL offload_mem_info_c(c_free, c_total)

   ! Explicit kind conversion; a no-op where C_SIZE_T == int_8, a widening otherwise.
   free = INT(c_free, KIND=int_8)
   total = INT(c_total, KIND=int_8)

END SUBROUTINE offload_mem_info

! **************************************************************************************************
!> \brief Allocates a buffer of given length, ie. number of elements.
!> \param length ...
Expand Down

0 comments on commit e62d1b4

Please sign in to comment.