Skip to content

Commit

Permalink
offload: Unify active device selection
Browse files Browse the repository at this point in the history
  • Loading branch information
oschuett committed Apr 24, 2021
1 parent 20acf74 commit 0507888
Show file tree
Hide file tree
Showing 15 changed files with 201 additions and 85 deletions.
29 changes: 20 additions & 9 deletions src/f77_interface.F
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ MODULE f77_interface
unpack_subsys_particles
USE dbcsr_api, ONLY: dbcsr_finalize_lib,&
dbcsr_init_lib
USE dbcsr_config, ONLY: get_accdrv_active_device_id
USE eip_environment, ONLY: eip_init
USE eip_environment_types, ONLY: eip_env_create,&
eip_env_release,&
Expand Down Expand Up @@ -100,7 +99,7 @@ MODULE f77_interface
m_getcwd,&
m_memory
USE message_passing, ONLY: &
add_mp_perf_env, get_mp_perf_env, mp_comm_world, mp_max, mp_perf_env_release, &
add_mp_perf_env, get_mp_perf_env, mp_comm_world, mp_environ, mp_max, mp_perf_env_release, &
mp_perf_env_retain, mp_perf_env_type, mp_world_finalize, mp_world_init, rm_mp_perf_env
USE metadynamics_types, ONLY: meta_env_release,&
meta_env_type
Expand All @@ -112,7 +111,9 @@ MODULE f77_interface
USE nnp_environment_types, ONLY: nnp_env_create,&
nnp_env_release,&
nnp_type
USE offload_api, ONLY: offload_set_device_id
USE offload_api, ONLY: offload_get_device_count,&
offload_get_device_id,&
offload_set_device_id
USE periodic_table, ONLY: init_periodic_table
USE pw_cuda, ONLY: pw_cuda_finalize,&
pw_cuda_init
Expand Down Expand Up @@ -213,7 +214,8 @@ SUBROUTINE init_cp2k(init_mpi, ierr)
LOGICAL, INTENT(in) :: init_mpi
INTEGER, INTENT(out) :: ierr

INTEGER :: mpi_comm_default, unit_nr
INTEGER :: mpi_comm_default, numtask, taskid, &
unit_nr
TYPE(cp_logger_type), POINTER :: logger

IF (.NOT. module_initialized) THEN
Expand Down Expand Up @@ -267,13 +269,22 @@ SUBROUTINE init_cp2k(init_mpi, ierr)
! *** init the bibliography ***
CALL add_all_references()

! Select active offload device when available.
IF (offload_get_device_count() > 0) THEN
CALL mp_environ(numtask=numtask, taskid=taskid, groupid=mpi_comm_default)
CALL offload_set_device_id(MOD(taskid, offload_get_device_count()))
ENDIF

! Initialize the DBCSR configuration
! Attach the time handler hooks to DBCSR
! DBCSR sets the device for multi-gpu, make sure it is the first GPU call
CALL dbcsr_init_lib(default_para_env%group, timeset_hook, timestop_hook, &
cp_abort_hook, cp_warn_hook, io_unit=unit_nr)

CALL offload_set_device_id(device_id=get_accdrv_active_device_id())
IF (offload_get_device_count() > 0) THEN
CALL dbcsr_init_lib(default_para_env%group, timeset_hook, timestop_hook, &
cp_abort_hook, cp_warn_hook, io_unit=unit_nr, &
accdrv_active_device_id=offload_get_device_id())
ELSE
CALL dbcsr_init_lib(default_para_env%group, timeset_hook, timestop_hook, &
cp_abort_hook, cp_warn_hook, io_unit=unit_nr)
ENDIF

CALL pw_cuda_init()

Expand Down
5 changes: 3 additions & 2 deletions src/grid/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
all: grid_miniapp.x grid_unittest.x

clean:
rm -fv *.o */*.o *.x ../offload/offload_buffer.o
rm -fv *.o */*.o *.x ../offload/*.o

CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra
NVFLAGS := -g -O3 -lineinfo -arch sm_70 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__GRID_CUDA
LIBS := -lm -lblas

ALL_HEADERS := $(shell find . -name "*.h") ../offload/offload_buffer.h
ALL_HEADERS := $(shell find . -name "*.h") $(shell find ../offload/ -name "*.h")
ALL_OBJECTS := ../offload/offload_buffer.o \
../offload/offload_library.o \
grid_replay.o \
grid_task_list.o \
common/grid_library.o \
Expand Down
10 changes: 3 additions & 7 deletions src/grid/common/grid_library.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,8 @@ typedef struct {
static grid_library_globals **per_thread_globals = NULL;
static bool library_initialized = false;
static int max_threads = 0;
static grid_library_config config = {.backend = GRID_BACKEND_AUTO,
.device_id = 0,
.validate = false,
.apply_cutoff = false};
static grid_library_config config = {
.backend = GRID_BACKEND_AUTO, .validate = false, .apply_cutoff = false};

#if !defined(_OPENMP)
#error "OpenMP is required. Please add -fopenmp to your C compiler flags."
Expand Down Expand Up @@ -97,10 +95,8 @@ grid_sphere_cache *grid_library_get_sphere_cache(void) {
* \author Ole Schuett
******************************************************************************/
void grid_library_set_config(const enum grid_backend backend,
const int device_id, const bool validate,
const bool apply_cutoff) {
const bool validate, const bool apply_cutoff) {
config.backend = backend;
config.device_id = device_id;
config.validate = validate;
config.apply_cutoff = apply_cutoff;
}
Expand Down
4 changes: 1 addition & 3 deletions src/grid/common/grid_library.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ void grid_library_finalize(void);
typedef struct {
enum grid_backend
backend; // Selectes the backend to be used by the grid library.
int device_id; // gpu id
bool validate; // When true the reference backend runs in shadow mode.
bool apply_cutoff; // only important for the dgemm and gpu backends
} grid_library_config;
Expand All @@ -44,8 +43,7 @@ typedef struct {
* \author Ole Schuett
******************************************************************************/
void grid_library_set_config(const enum grid_backend backend,
const int device_id, const bool validate,
const bool apply_cutoff);
const bool validate, const bool apply_cutoff);

/*******************************************************************************
* \brief Returns the library config.
Expand Down
10 changes: 4 additions & 6 deletions src/grid/gpu/grid_gpu_task_list.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <stdlib.h>
#include <string.h>

#include "../../offload/offload_library.h"
#include "../common/grid_common.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"
Expand Down Expand Up @@ -250,7 +251,7 @@ void grid_gpu_create_task_list(
const double dh_inv[][3][3], grid_gpu_task_list **task_list_out) {

// Select GPU device.
CHECK(cudaSetDevice(grid_library_get_config().device_id));
CHECK(cudaSetDevice(offload_get_device_id()));

if (*task_list_out != NULL) {
// This is actually an opportunity to reuse some buffers.
Expand Down Expand Up @@ -359,9 +360,6 @@ void grid_gpu_create_task_list(
******************************************************************************/
void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {

// Select GPU device.
CHECK(cudaSetDevice(grid_library_get_config().device_id));

CHECK(cudaFree(task_list->tasks_dev));

CHECK(cudaStreamDestroy(task_list->main_stream));
Expand Down Expand Up @@ -392,7 +390,7 @@ void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
offload_buffer *grids[]) {

// Select GPU device.
CHECK(cudaSetDevice(grid_library_get_config().device_id));
CHECK(cudaSetDevice(offload_get_device_id()));

// Upload blocks buffer using the main stream
CHECK(cudaMemcpyAsync(pab_blocks->device_buffer, pab_blocks->host_buffer,
Expand Down Expand Up @@ -468,7 +466,7 @@ void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
double forces[][3], double virial[3][3]) {

// Select GPU device.
CHECK(cudaSetDevice(grid_library_get_config().device_id));
CHECK(cudaSetDevice(offload_get_device_id()));

// Prepare shared buffers using the main stream
double *forces_dev = NULL;
Expand Down
9 changes: 3 additions & 6 deletions src/grid/grid_api.F
Original file line number Diff line number Diff line change
Expand Up @@ -1154,28 +1154,25 @@ END SUBROUTINE grid_library_finalize
! **************************************************************************************************
!> \brief Configures the grid library
!> \param backend : backend to be used for collocate/integrate, possible values are REF, CPU, GPU
!> \param device_id : GPU id returned by dbcsr
!> \param validate : if set to true, compare the results of all backend to the reference backend
!> \param apply_cutoff : apply a spherical cutoff before collocating or integrating. Only relevant for CPU backend
!> \author Ole Schuett
! **************************************************************************************************
SUBROUTINE grid_library_set_config(backend, device_id, validate, apply_cutoff)
INTEGER, INTENT(IN) :: backend, device_id
SUBROUTINE grid_library_set_config(backend, validate, apply_cutoff)
INTEGER, INTENT(IN) :: backend
LOGICAL, INTENT(IN) :: validate, apply_cutoff

INTERFACE
SUBROUTINE grid_library_set_config_c(backend, device_id, validate, apply_cutoff) &
SUBROUTINE grid_library_set_config_c(backend, validate, apply_cutoff) &
BIND(C, name="grid_library_set_config")
IMPORT :: C_INT, C_BOOL
INTEGER(KIND=C_INT), VALUE :: backend
INTEGER(KIND=C_INT), VALUE :: device_id
LOGICAL(KIND=C_BOOL), VALUE :: validate
LOGICAL(KIND=C_BOOL), VALUE :: apply_cutoff
END SUBROUTINE grid_library_set_config_c
END INTERFACE

CALL grid_library_set_config_c(backend=backend, &
device_id=device_id, &
validate=LOGICAL(validate, C_BOOL), &
apply_cutoff=LOGICAL(apply_cutoff, C_BOOL))

Expand Down
2 changes: 1 addition & 1 deletion src/grid/grid_miniapp.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>

#include "../offload/offload_buffer.h"
#include "../offload/offload_library.h"
#include "common/grid_library.h"
#include "grid_replay.h"

Expand Down
2 changes: 1 addition & 1 deletion src/grid/grid_unittest.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>

#include "../offload/offload_buffer.h"
#include "../offload/offload_library.h"
#include "common/grid_library.h"
#include "grid_replay.h"

Expand Down
2 changes: 1 addition & 1 deletion src/offload/PACKAGE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"description": "Common infrastructure for offloading to accelerator devices",
"requires": ["../base",],
"public": ["offload_api.F", "offload_buffer.h"],
"public": ["offload_api.F", "offload_buffer.h", "offload_library.h"],
}
46 changes: 45 additions & 1 deletion src/offload/offload_api.F
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ MODULE offload_api

CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'offload_api'

PUBLIC :: offload_set_device_id
PUBLIC :: offload_get_device_count
PUBLIC :: offload_set_device_id, offload_get_device_id
PUBLIC :: offload_buffer_type, offload_create_buffer, offload_free_buffer

TYPE offload_buffer_type
Expand All @@ -34,6 +35,26 @@ MODULE offload_api

CONTAINS

! **************************************************************************************************
!> \brief Returns the number of available devices.
!> \return ...
!> \author Ole Schuett
! **************************************************************************************************
FUNCTION offload_get_device_count() RESULT(count)
INTEGER :: count

INTERFACE
FUNCTION offload_get_device_count_c() &
BIND(C, name="offload_get_device_count")
IMPORT :: C_INT
INTEGER(KIND=C_INT) :: offload_get_device_count_c
END FUNCTION offload_get_device_count_c
END INTERFACE

count = offload_get_device_count_c()

END FUNCTION offload_get_device_count

! **************************************************************************************************
!> \brief Selects the device to be used.
!> \param device_id ...
Expand All @@ -54,6 +75,29 @@ END SUBROUTINE offload_set_device_id_c

END SUBROUTINE offload_set_device_id

! **************************************************************************************************
!> \brief Returns the device to be used.
!> \return ...
!> \author Ole Schuett
! **************************************************************************************************
FUNCTION offload_get_device_id() RESULT(device_id)
INTEGER :: device_id

INTERFACE
FUNCTION offload_get_device_id_c() &
BIND(C, name="offload_get_device_id")
IMPORT :: C_INT
INTEGER(KIND=C_INT) :: offload_get_device_id_c
END FUNCTION offload_get_device_id_c
END INTERFACE

device_id = offload_get_device_id_c()

IF (device_id < 0) &
CPABORT("Offload device not selected.")

END FUNCTION offload_get_device_id

! **************************************************************************************************
!> \brief Allocates a buffer of given length, ie. number of elements.
!> \param length ...
Expand Down
39 changes: 7 additions & 32 deletions src/offload/offload_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,7 @@
#include <stdlib.h>

#include "offload_buffer.h"

#ifdef __GRID_CUDA
#define __OFFLOAD_CUDA
#endif

#ifdef __OFFLOAD_CUDA
#include <cuda_runtime.h>
#endif

static int current_device_id = -1;

/*******************************************************************************
* \brief Checks given Cuda status and upon failure abort with a nice message.
* \author Ole Schuett
******************************************************************************/
#define CHECK(status) \
if (status != cudaSuccess) { \
fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), __FILE__, \
__LINE__); \
abort(); \
}

/*******************************************************************************
* \brief Selects the device to be used.
* \author Ole Schuett
******************************************************************************/
void offload_set_device_id(int device_id) { current_device_id = device_id; }
#include "offload_library.h"

/*******************************************************************************
* \brief Allocates a buffer of given length, ie. number of elements.
Expand All @@ -60,9 +34,10 @@ void offload_create_buffer(const int length, offload_buffer **buffer) {
#ifdef __OFFLOAD_CUDA
// With size 0 cudaMallocHost doesn't null the pointer and cudaFreeHost fails.
(*buffer)->host_buffer = NULL;
CHECK(cudaSetDevice(current_device_id));
CHECK(cudaMallocHost((void **)&(*buffer)->host_buffer, requested_size));
CHECK(cudaMalloc((void **)&(*buffer)->device_buffer, requested_size));
OFFLOAD_CHECK(cudaSetDevice(offload_get_device_id()));
OFFLOAD_CHECK(
cudaMallocHost((void **)&(*buffer)->host_buffer, requested_size));
OFFLOAD_CHECK(cudaMalloc((void **)&(*buffer)->device_buffer, requested_size));
#else
(*buffer)->host_buffer = malloc(requested_size);
(*buffer)->device_buffer = NULL;
Expand All @@ -79,8 +54,8 @@ void offload_free_buffer(offload_buffer *buffer) {
return;

#ifdef __OFFLOAD_CUDA
CHECK(cudaFreeHost(buffer->host_buffer));
CHECK(cudaFree(buffer->device_buffer));
OFFLOAD_CHECK(cudaFreeHost(buffer->host_buffer));
OFFLOAD_CHECK(cudaFree(buffer->device_buffer));
#else
free(buffer->host_buffer);
#endif
Expand Down
6 changes: 0 additions & 6 deletions src/offload/offload_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@ typedef struct {
double *device_buffer;
} offload_buffer;

/*******************************************************************************
* \brief Selects the device to be used.
* \author Ole Schuett
******************************************************************************/
void offload_set_device_id(int device_id);

/*******************************************************************************
* \brief Allocates a buffer of given length, ie. number of elements.
* \author Ole Schuett
Expand Down

0 comments on commit 0507888

Please sign in to comment.