grid: Migrate GPU backend to offload_buffer

oschuett committed Apr 23, 2021
1 parent 1f2f64c commit 6d3a0ae
Showing 15 changed files with 96 additions and 145 deletions.
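The diff never shows the definition of offload_buffer, but the fields it dereferences (host_buffer, device_buffer, size) suggest roughly the shape sketched below. This is an inferred sketch for orientation only; the authoritative declaration lives in src/offload/offload_buffer.h and may differ.

/* Inferred from usage in this commit: a paired host/device allocation whose
 * lifetime is managed by the offload layer rather than by the grid backend.
 * Field names are taken from the diff; everything else is an assumption. */
typedef struct {
  double *host_buffer;   /* host-side grid data, read/written by the CPU backend */
  double *device_buffer; /* matching device allocation, used by the GPU backend  */
  size_t size;           /* size of each allocation in bytes                     */
} offload_buffer;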
2 changes: 1 addition & 1 deletion src/grid/cpu/cpu_private_header.h
@@ -160,7 +160,7 @@ extern void set_grid_parameters(
const double
dh[3][3], /* displacement vectors of the grid (cartesian) -> (ijk) */
const double dh_inv[3][3], /* (ijk) -> (x,y,z) */
- double *grid_);
+ offload_buffer *grid_);

extern void collocate_one_grid_level_dgemm(grid_context *const ctx,
const int *const, const int *const,
4 changes: 2 additions & 2 deletions src/grid/cpu/grid_collocate_dgemm.c
@@ -1235,7 +1235,7 @@ void collocate_one_grid_level_dgemm(grid_context *const ctx,
void grid_cpu_collocate_task_list(grid_cpu_task_list *const ptr,
const enum grid_func func, const int nlevels,
const offload_buffer *pab_blocks,
- double *grid[nlevels]) {
+ offload_buffer *grids[nlevels]) {

grid_context *const ctx = (grid_context *)ptr;

@@ -1251,7 +1251,7 @@ void grid_cpu_collocate_task_list(grid_cpu_task_list *const ptr,
set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
layout->npts_global, layout->npts_local,
layout->shift_local, layout->border_width, layout->dh,
- layout->dh_inv, grid[level]);
+ layout->dh_inv, grids[level]);
memset(ctx->grid[level].data, 0,
sizeof(double) * ctx->grid[level].alloc_size_);
}
4 changes: 2 additions & 2 deletions src/grid/cpu/grid_context_cpu.c
@@ -498,12 +498,12 @@ void set_grid_parameters(
const double
dh[3][3], /* displacement vectors of the grid (cartesian) -> (ijk) */
const double dh_inv[3][3], /* (ijk) -> (x,y,z) */
- double *grid_) {
+ offload_buffer *grid_) {
memset(grid, 0, sizeof(tensor));
initialize_tensor_3(grid, grid_local_size[2], grid_local_size[1],
grid_local_size[0]);

- grid->data = grid_;
+ grid->data = grid_->host_buffer;
grid->ld_ = grid_local_size[0];

setup_global_grid_size(grid, &grid_full_size[0]);
4 changes: 2 additions & 2 deletions src/grid/cpu/grid_cpu_task_list.h
@@ -50,12 +50,12 @@ void grid_cpu_free_task_list(grid_cpu_task_list *task_list);
void grid_cpu_collocate_task_list(grid_cpu_task_list *const task_list,
const enum grid_func func, const int nlevels,
const offload_buffer *pab_blocks,
- double *grid[nlevels]);
+ offload_buffer *grids[nlevels]);

void grid_cpu_integrate_task_list(void *const ptr, const bool compute_tau,
const int natoms, const int nlevels,
const offload_buffer *const pab_blocks,
- const double *grid[nlevels],
+ const offload_buffer *grids[nlevels],
offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]);
6 changes: 3 additions & 3 deletions src/grid/cpu/grid_integrate_dgemm.c
@@ -1060,7 +1060,7 @@ void integrate_one_grid_level_dgemm(
******************************************************************************/
void grid_cpu_integrate_task_list(
void *ptr, const bool compute_tau, const int natoms, const int nlevels,
- const offload_buffer *const pab_blocks, double *grid[nlevels],
+ const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {

grid_context *const ctx = (grid_context *)ptr;
@@ -1083,8 +1083,8 @@ void grid_cpu_integrate_task_list(
set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
layout->npts_global, layout->npts_local,
layout->shift_local, layout->border_width, layout->dh,
- layout->dh_inv, grid[level]);
- ctx->grid[level].data = grid[level];
+ layout->dh_inv, grids[level]);
+ ctx->grid[level].data = grids[level]->host_buffer;
}

bool calculate_virial = (virial != NULL);
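Taken together, the CPU-side changes mean the backends no longer receive raw double* arrays: callers pass one offload_buffer per grid level and the CPU code simply aliases host_buffer. A minimal calling sketch follows. It assumes the offload layer provides offload_create_buffer(length, &buf) and offload_free_buffer(buf) with roughly those prototypes; the function collocate_with_buffers and its arguments are hypothetical and not part of this commit.

#include "../../offload/offload_buffer.h"
#include "grid_cpu_task_list.h"

/* Hypothetical driver: allocate one buffer per level, collocate, read results. */
static void collocate_with_buffers(grid_cpu_task_list *task_list,
                                   const enum grid_func func, const int nlevels,
                                   const int npts_local[][3],
                                   const offload_buffer *pab_blocks) {
  offload_buffer *grids[nlevels];
  for (int level = 0; level < nlevels; level++) {
    const int npts =
        npts_local[level][0] * npts_local[level][1] * npts_local[level][2];
    offload_create_buffer(npts, &grids[level]); /* assumed offload API */
  }

  grid_cpu_collocate_task_list(task_list, func, nlevels, pab_blocks, grids);

  /* The backend wrote each level directly into grids[level]->host_buffer. */
  for (int level = 0; level < nlevels; level++) {
    offload_free_buffer(grids[level]); /* assumed offload API */
  }
}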
65 changes: 11 additions & 54 deletions src/grid/gpu/grid_gpu_task_list.cu
@@ -349,14 +349,6 @@ void grid_gpu_create_task_list(
CHECK(cudaStreamCreate(&task_list->level_streams[i]));
}

- size = nlevels * sizeof(double *);
- task_list->grid_dev = (double **)malloc(size);
- memset(task_list->grid_dev, 0, size);
-
- size = nlevels * sizeof(size_t);
- task_list->grid_dev_size = (size_t *)malloc(size);
- memset(task_list->grid_dev_size, 0, size);

// return newly created task list
*task_list_out = task_list;
}
@@ -379,19 +371,11 @@ void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {
}
free(task_list->level_streams);

- for (int i = 0; i < task_list->nlevels; i++) {
-   if (task_list->grid_dev[i] != NULL) {
-     CHECK(cudaFree(task_list->grid_dev[i]));
-   }
- }
- free(task_list->grid_dev);

for (int i = 0; i < task_list->nkinds; i++) {
CHECK(cudaFree(task_list->sphis_dev[i]));
}
free(task_list->sphis_dev);

- free(task_list->grid_dev_size);
free(task_list->tasks_per_level);
free(task_list->layouts);
free(task_list);
@@ -405,7 +389,7 @@ void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {
void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
const enum grid_func func, const int nlevels,
const offload_buffer *pab_blocks,
- double *grid[]) {
+ offload_buffer *grids[]) {

// Select GPU device.
CHECK(cudaSetDevice(grid_library_get_config().device_id));
@@ -427,27 +411,16 @@ void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
const int last_task = first_task + task_list->tasks_per_level[level] - 1;
const cudaStream_t level_stream = task_list->level_streams[level];
const grid_gpu_layout *layout = &task_list->layouts[level];
- const size_t grid_size = layout->npts_local[0] * layout->npts_local[1] *
-                          layout->npts_local[2] * sizeof(double);
+ offload_buffer *grid = grids[level];

- // reallocate device grid buffers if needed
- if (task_list->grid_dev_size[level] < grid_size) {
-   if (task_list->grid_dev[level] != NULL) {
-     CHECK(cudaFree(task_list->grid_dev[level]));
-   }
-   CHECK(cudaMalloc(&task_list->grid_dev[level], grid_size));
-   task_list->grid_dev_size[level] = grid_size;
- }

- // zero device grid buffers
- CHECK(cudaMemsetAsync(task_list->grid_dev[level], 0, grid_size,
-                       level_stream));
+ // zero grid device buffer
+ CHECK(cudaMemsetAsync(grid->device_buffer, 0, grid->size, level_stream));

// launch kernel, but only after blocks have arrived
CHECK(cudaStreamWaitEvent(level_stream, input_ready_event, 0));
grid_gpu_collocate_one_grid_level(
task_list, first_task, last_task, func, layout, level_stream,
- pab_blocks->device_buffer, task_list->grid_dev[level], &lp_diff);
+ pab_blocks->device_buffer, grid->device_buffer, &lp_diff);

first_task = last_task + 1;
}
@@ -467,13 +440,9 @@ void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
}

// download result from device to host.
- // TODO: Make these mem copies actually async by page locking the grid buffers
- // This now takes 10% of the time!!!
for (int level = 0; level < task_list->nlevels; level++) {
- const grid_gpu_layout *layout = &task_list->layouts[level];
- const size_t grid_size = layout->npts_local[0] * layout->npts_local[1] *
-                          layout->npts_local[2] * sizeof(double);
- CHECK(cudaMemcpyAsync(grid[level], task_list->grid_dev[level], grid_size,
+ offload_buffer *grid = grids[level];
+ CHECK(cudaMemcpyAsync(grid->host_buffer, grid->device_buffer, grid->size,
cudaMemcpyDeviceToHost,
task_list->level_streams[level]));
}
Expand All @@ -494,7 +463,7 @@ void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
const bool compute_tau, const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
- const double *grid[],
+ const offload_buffer *grids[],
offload_buffer *hab_blocks,
double forces[][3], double virial[3][3]) {

@@ -538,29 +507,17 @@ void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
const int last_task = first_task + task_list->tasks_per_level[level] - 1;
const cudaStream_t level_stream = task_list->level_streams[level];
const grid_gpu_layout *layout = &task_list->layouts[level];
- const size_t grid_size = layout->npts_local[0] * layout->npts_local[1] *
-                          layout->npts_local[2] * sizeof(double);
-
- // reallocate device grid buffer if needed
- if (task_list->grid_dev_size[level] < grid_size) {
-   if (task_list->grid_dev[level] != NULL) {
-     CHECK(cudaFree(task_list->grid_dev[level]));
-   }
-   CHECK(cudaMalloc(&task_list->grid_dev[level], grid_size));
-   task_list->grid_dev_size[level] = grid_size;
- }
+ const offload_buffer *grid = grids[level];

// upload grid
- // TODO: Make these copies actually async by page locking the grid buffers.
- // This now takes 30% of the time!!!
- CHECK(cudaMemcpyAsync(task_list->grid_dev[level], grid[level], grid_size,
+ CHECK(cudaMemcpyAsync(grid->device_buffer, grid->host_buffer, grid->size,
cudaMemcpyHostToDevice, level_stream));

// launch kernel, but only after hab, pab, virial, etc are ready
CHECK(cudaStreamWaitEvent(level_stream, input_ready_event, 0));
grid_gpu_integrate_one_grid_level(
task_list, first_task, last_task, compute_tau, layout, level_stream,
- pab_blocks_dev, task_list->grid_dev[level], hab_blocks->device_buffer,
+ pab_blocks_dev, grid->device_buffer, hab_blocks->device_buffer,
forces_dev, virial_dev, &lp_diff);

// Have main stream wait for level to complete before downloading results.
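The net effect in the CUDA backend: all per-level cudaMalloc/cudaFree bookkeeping (grid_dev, grid_dev_size) is gone, and each level's offload_buffer arrives with its device_buffer and host_buffer already allocated. Below is a condensed sketch of the per-level collocate flow after this change; it paraphrases the code above with error checking, events and the kernel launch elided, and is not a drop-in replacement.

#include <cuda_runtime.h>
#include "../../offload/offload_buffer.h"

/* Condensed per-level flow: zero the device grid, compute into it, download it.
 * The offload layer owns both allocations, so the backend never allocates or
 * frees device memory itself anymore. */
static void collocate_levels_sketch(offload_buffer *grids[],
                                    const cudaStream_t level_streams[],
                                    const int nlevels) {
  for (int level = 0; level < nlevels; level++) {
    offload_buffer *grid = grids[level];
    cudaMemsetAsync(grid->device_buffer, 0, grid->size, level_streams[level]);
    /* ... collocation kernel writes into grid->device_buffer here ... */
    cudaMemcpyAsync(grid->host_buffer, grid->device_buffer, grid->size,
                    cudaMemcpyDeviceToHost, level_streams[level]);
  }
}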
6 changes: 2 additions & 4 deletions src/grid/gpu/grid_gpu_task_list.h
@@ -118,8 +118,6 @@ typedef struct {
// device pointers
double **sphis_dev;
grid_gpu_task *tasks_dev;
- double **grid_dev;
- size_t *grid_dev_size;
} grid_gpu_task_list;

/*******************************************************************************
@@ -155,7 +153,7 @@ void grid_gpu_free_task_list(grid_gpu_task_list *task_list);
void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
const enum grid_func func, const int nlevels,
const offload_buffer *pab_blocks,
- double *grid[]);
+ offload_buffer *grids[]);

/*******************************************************************************
* \brief Integrate all tasks of in given list onto given grids.
@@ -166,7 +164,7 @@ void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
const bool compute_tau, const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
- const double *grid[],
+ const offload_buffer *grids[],
offload_buffer *hab_blocks,
double forces[][3], double virial[3][3]);

40 changes: 14 additions & 26 deletions src/grid/grid_api.F
@@ -972,20 +972,19 @@ SUBROUTINE grid_collocate_task_list(task_list, ga_gb_function, pab_blocks, rs_gr

INTEGER :: handle, ilevel, nlevels
INTEGER, ALLOCATABLE, DIMENSION(:, :), TARGET :: npts_local
- REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: grid
- TYPE(C_PTR), ALLOCATABLE, DIMENSION(:), TARGET :: grid_pointers
+ TYPE(C_PTR), ALLOCATABLE, DIMENSION(:), TARGET :: grids_c
TYPE(realspace_grid_type), POINTER :: rsgrid
INTERFACE
SUBROUTINE grid_collocate_task_list_c(task_list, func, nlevels, &
- npts_local, pab_blocks, grid) &
+ npts_local, pab_blocks, grids) &
BIND(C, name="grid_collocate_task_list")
IMPORT :: C_PTR, C_INT, C_BOOL
TYPE(C_PTR), VALUE :: task_list
INTEGER(KIND=C_INT), VALUE :: func
INTEGER(KIND=C_INT), VALUE :: nlevels
TYPE(C_PTR), VALUE :: npts_local
TYPE(C_PTR), VALUE :: pab_blocks
- TYPE(C_PTR), VALUE :: grid
+ TYPE(C_PTR), VALUE :: grids
END SUBROUTINE grid_collocate_task_list_c
END INTERFACE

@@ -994,22 +993,17 @@ END SUBROUTINE grid_collocate_task_list_c
nlevels = SIZE(rs_grids)
CPASSERT(nlevels > 0)

- ALLOCATE (grid_pointers(nlevels))
+ ALLOCATE (grids_c(nlevels))
ALLOCATE (npts_local(3, nlevels))
DO ilevel = 1, nlevels
rsgrid => rs_grids(ilevel)%rs_grid
npts_local(:, ilevel) = rsgrid%ub_local - rsgrid%lb_local + 1
- grid(1:, 1:, 1:) => rsgrid%r(:, :, :) ! ensure lower bounds are (1,1,1)
- grid_pointers(ilevel) = C_LOC(grid(1, 1, 1))
+ grids_c(ilevel) = rsgrid%buffer%c_ptr
END DO

#if __GNUC__ >= 9
CPASSERT(IS_CONTIGUOUS(npts_local))
- CPASSERT(IS_CONTIGUOUS(grid_pointers))
- DO ilevel = 1, nlevels
-    grid(1:, 1:, 1:) => rs_grids(ilevel)%rs_grid%r(:, :, :)
-    CPASSERT(IS_CONTIGUOUS(grid))
- END DO
+ CPASSERT(IS_CONTIGUOUS(grids_c))
#endif

CPASSERT(C_ASSOCIATED(task_list%c_ptr))
@@ -1020,7 +1014,7 @@ END SUBROUTINE grid_collocate_task_list_c
nlevels=nlevels, &
npts_local=C_LOC(npts_local(1, 1)), &
pab_blocks=pab_blocks%c_ptr, &
- grid=C_LOC(grid_pointers(1)))
+ grids=grids_c)

CALL timestop(handle)
END SUBROUTINE grid_collocate_task_list
@@ -1055,14 +1049,13 @@ SUBROUTINE grid_integrate_task_list(task_list, compute_tau, calculate_forces, ca

INTEGER :: handle, ilevel, nlevels
INTEGER, ALLOCATABLE, DIMENSION(:, :), TARGET :: npts_local
- REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: grid
TYPE(C_PTR) :: forces_c, virial_c
- TYPE(C_PTR), ALLOCATABLE, DIMENSION(:), TARGET :: grid_pointers
+ TYPE(C_PTR), ALLOCATABLE, DIMENSION(:), TARGET :: grids_c
TYPE(realspace_grid_type), POINTER :: rsgrid
INTERFACE
SUBROUTINE grid_integrate_task_list_c(task_list, compute_tau, natoms, &
nlevels, npts_local, &
- pab_blocks, grid, hab_blocks, forces, virial) &
+ pab_blocks, grids, hab_blocks, forces, virial) &
BIND(C, name="grid_integrate_task_list")
IMPORT :: C_PTR, C_INT, C_BOOL
TYPE(C_PTR), VALUE :: task_list
@@ -1071,7 +1064,7 @@ SUBROUTINE grid_integrate_task_list_c(task_list, compute_tau, natoms, &
INTEGER(KIND=C_INT), VALUE :: nlevels
TYPE(C_PTR), VALUE :: npts_local
TYPE(C_PTR), VALUE :: pab_blocks
- TYPE(C_PTR), VALUE :: grid
+ TYPE(C_PTR), VALUE :: grids
TYPE(C_PTR), VALUE :: hab_blocks
TYPE(C_PTR), VALUE :: forces
TYPE(C_PTR), VALUE :: virial
@@ -1083,13 +1076,12 @@ END SUBROUTINE grid_integrate_task_list_c
nlevels = SIZE(rs_grids)
CPASSERT(nlevels > 0)

- ALLOCATE (grid_pointers(nlevels))
+ ALLOCATE (grids_c(nlevels))
ALLOCATE (npts_local(3, nlevels))
DO ilevel = 1, nlevels
rsgrid => rs_grids(ilevel)%rs_grid
npts_local(:, ilevel) = rsgrid%ub_local - rsgrid%lb_local + 1
- grid(1:, 1:, 1:) => rsgrid%r(:, :, :) ! ensure lower bounds are (1,1,1)
- grid_pointers(ilevel) = C_LOC(grid(1, 1, 1))
+ grids_c(ilevel) = rsgrid%buffer%c_ptr
END DO

IF (calculate_forces) THEN
@@ -1106,13 +1098,9 @@ END SUBROUTINE grid_integrate_task_list_c

#if __GNUC__ >= 9
CPASSERT(IS_CONTIGUOUS(npts_local))
- CPASSERT(IS_CONTIGUOUS(grid_pointers))
+ CPASSERT(IS_CONTIGUOUS(grids_c))
CPASSERT(IS_CONTIGUOUS(forces))
CPASSERT(IS_CONTIGUOUS(virial))
- DO ilevel = 1, nlevels
-    grid(1:, 1:, 1:) => rs_grids(ilevel)%rs_grid%r(:, :, :)
-    CPASSERT(IS_CONTIGUOUS(grid))
- END DO
#endif

CPASSERT(SIZE(forces, 1) == 3)
@@ -1127,7 +1115,7 @@ END SUBROUTINE grid_integrate_task_list_c
nlevels=nlevels, &
npts_local=C_LOC(npts_local(1, 1)), &
pab_blocks=pab_blocks%c_ptr, &
- grid=C_LOC(grid_pointers(1)), &
+ grids=grids_c, &
hab_blocks=hab_blocks%c_ptr, &
forces=forces_c, &
virial=virial_c)
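On the Fortran side the wrapper no longer builds C_LOC pointers into rsgrid%r; every realspace grid already owns an offload buffer, and the wrapper simply forwards the rsgrid%buffer%c_ptr handles. Seen from C, the entry points therefore receive one offload_buffer handle per level. The small check below illustrates that mapping; check_grid_buffers is a hypothetical helper written for this commentary, not part of the commit, and the byte-size relation is an assumption based on how the GPU code above uses grid->size.

#include <assert.h>
#include <stddef.h>
#include "../offload/offload_buffer.h"

/* Hypothetical sanity check: each level's buffer must be able to hold the
 * full local grid, and its host_buffer is what the CPU backend will use. */
static void check_grid_buffers(const int nlevels, const int npts_local[][3],
                               offload_buffer *grids[]) {
  for (int level = 0; level < nlevels; level++) {
    const size_t needed = (size_t)npts_local[level][0] * npts_local[level][1] *
                          npts_local[level][2] * sizeof(double);
    assert(grids[level]->size >= needed);
    assert(grids[level]->host_buffer != NULL);
  }
}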
2 changes: 2 additions & 0 deletions src/grid/grid_miniapp.c
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>

#include "../offload/offload_buffer.h"
#include "common/grid_library.h"
#include "grid_replay.h"

@@ -63,6 +64,7 @@ int main(int argc, char *argv[]) {
return 1;
}

+ offload_set_device_id(0);
grid_library_init();

const double max_diff =