Skip to content

Commit

Permalink
pw: Set offload device
Browse files Browse the repository at this point in the history
  • Loading branch information
oschuett committed Apr 24, 2021
1 parent 0507888 commit 2da74a0
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/offload/offload_library.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#ifndef OFFLOAD_LIBRARY_H
#define OFFLOAD_LIBRARY_H

#ifdef __GRID_CUDA
#if defined(__GRID_CUDA) || defined(__PW_CUDA)
#define __OFFLOAD_CUDA
#endif

Expand Down
2 changes: 1 addition & 1 deletion src/pw/cuda/PACKAGE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"description": "CUDA FFT acceleration",
"requires": [],
"requires": ["../../offload"],
"archive": "libcp2kpwcuda",
}
10 changes: 10 additions & 0 deletions src/pw/cuda/pw_cuda_utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <sys/types.h>
#include <unistd.h>

#include "../../offload/offload_library.h"

// local dependencies
#include "fft_cuda.h"
#include "fft_cuda_utils.h"
Expand Down Expand Up @@ -55,6 +57,12 @@ extern void pw_cuda_error_check(cudaError_t cudaError, int line) {
}
}

// Makes the device chosen by the offload library the current CUDA device
// for the calling host thread. The status is stored in the file-level
// cErr and, when CHECK is enabled, handed to pw_cuda_error_check.
extern void pw_cuda_set_device() {
  const int device_id = offload_get_device_id();
  cErr = cudaSetDevice(device_id);
  if (CHECK) {
    pw_cuda_error_check(cErr, __LINE__);
  }
}

// STREAMS INIT/GET/RELEASE
void pw_cuda_device_streams_alloc(cudaStream_t **streams) {
cudaStream_t *cuda_streams_ptr;
Expand Down Expand Up @@ -159,6 +167,7 @@ extern "C" int pw_cuda_init() {
if (is_configured == 0) {
int version;
cufftResult_t cufftErr;
pw_cuda_set_device();
pw_cuda_device_streams_alloc(&cuda_streams);
pw_cuda_device_events_alloc(&cuda_events);
is_configured = 1;
Expand All @@ -185,6 +194,7 @@ extern "C" int pw_cuda_init() {

extern "C" void pw_cuda_finalize() {
if (is_configured == 1) {
pw_cuda_set_device();
fftcu_release_();
pw_cuda_device_streams_release(&cuda_streams);
pw_cuda_device_events_release(&cuda_events);
Expand Down
2 changes: 2 additions & 0 deletions src/pw/cuda/pw_cuda_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

extern void pw_cuda_error_check(cudaError_t cudaError, int line);

extern void pw_cuda_set_device();

// STREAMS INIT/GET/RELEASE
extern void pw_cuda_device_streams_alloc(cudaStream_t **streams);
extern void pw_cuda_get_streams(cudaStream_t **streams);
Expand Down
25 changes: 17 additions & 8 deletions src/pw/cuda/pw_cuda_z.cu
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ extern "C" void pw_cuda_cfffg_z_(const double *din, cuDoubleComplex *zout,
if (nrpts == 0 || ngpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -284,7 +285,8 @@ extern "C" void pw_cuda_sfffc_z_(const cuDoubleComplex *zin, double *dout,
if (nrpts == 0 || ngpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -384,7 +386,8 @@ extern "C" void pw_cuda_cff_z_(const double *din, cuDoubleComplex *zout,
if (nrpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -473,7 +476,8 @@ extern "C" void pw_cuda_ffc_z_(const cuDoubleComplex *zin, double *dout,
if (nrpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -561,7 +565,8 @@ extern "C" void pw_cuda_cf_z_(const double *din, cuDoubleComplex *zout,
if (nrpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -641,7 +646,8 @@ extern "C" void pw_cuda_fc_z_(const cuDoubleComplex *zin, double *dout,
if (nrpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -718,7 +724,8 @@ extern "C" void pw_cuda_f_z_(const cuDoubleComplex *zin, cuDoubleComplex *zout,
if (nrpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -789,6 +796,7 @@ extern "C" void pw_cuda_fg_z_(const cuDoubleComplex *zin, cuDoubleComplex *zout,
return;

// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down Expand Up @@ -876,7 +884,8 @@ extern "C" void pw_cuda_sf_z_(const cuDoubleComplex *zin, cuDoubleComplex *zout,
if (nrpts == 0 || ngpts == 0)
return;

// get streams
// get streams and events
pw_cuda_set_device();
pw_cuda_get_streams(&cuda_streams);
pw_cuda_get_events(&cuda_events);

Expand Down
12 changes: 12 additions & 0 deletions src/pw/pw_grids.F
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ MODULE pw_grids
USE message_passing, ONLY: &
mp_allgather, mp_cart_coords, mp_cart_create, mp_cart_rank, mp_comm_compare, mp_comm_dup, &
mp_comm_free, mp_comm_self, mp_dims_create, mp_environ, mp_max, mp_min, mp_sum
USE offload_api, ONLY: offload_get_device_id
USE pw_grid_info, ONLY: pw_find_cutoff,&
pw_grid_bounds_from_n,&
pw_grid_init_setup
Expand All @@ -73,6 +74,15 @@ MODULE pw_grids
CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_grids'

#if defined ( __PW_CUDA ) && !defined ( __PW_CUDA_NO_HOSTALLOC )
! Explicit interface to the C symbol "cudaSetDevice" so it can be called
! directly from Fortran. The function result is a C int status code; the
! caller in this module asserts that it equals 0.
INTERFACE
INTEGER(C_INT) FUNCTION cudaSetDevice(device_id) &
BIND(C, name="cudaSetDevice")
IMPORT
IMPLICIT NONE
! Passed by value (not by reference) to match the C calling convention.
INTEGER(C_INT), VALUE :: device_id
END FUNCTION cudaSetDevice
END INTERFACE

INTERFACE
INTEGER(C_INT) FUNCTION cudaHostAlloc(buffer, length, flag) &
BIND(C, name="cudaHostAlloc")
Expand Down Expand Up @@ -1733,6 +1743,8 @@ SUBROUTINE pw_grid_allocate(pw_grid, ng, bounds)
nmaps = 1
IF (pw_grid%grid_span == HALFSPACE) nmaps = 2
#if defined ( __PW_CUDA ) && !defined ( __PW_CUDA_NO_HOSTALLOC )
stat = cudaSetDevice(offload_get_device_id())
CPASSERT(stat == 0)
length = INT(int_size*MAX(ng, 1)*MAX(nmaps, 1), KIND=C_SIZE_T)
stat = cudaHostAlloc(cptr_g_hatmap, length, cudaHostAllocDefault)
CPASSERT(stat == 0)
Expand Down

0 comments on commit 2da74a0

Please sign in to comment.