precommit: Add file extension .cc
oschuett committed Sep 15, 2021
1 parent 524554b commit 1b9c42d
Showing 5 changed files with 158 additions and 145 deletions.
62 changes: 33 additions & 29 deletions src/grid/hip/grid_hip_collocate.cc
@@ -135,8 +135,9 @@ __global__ void calculate_coefficients(const kernel_params dev_) {
calculate_coefficients. We only keep the non-zero elements to save memory.
*/

template <typename T, typename T3, bool distributed__, bool orthorhombic_>
__global__ __launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
template <typename T, typename T3, bool distributed__, bool orthorhombic_>
__global__
__launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
// Copy task from global to shared memory and precompute some stuff.
__shared__ smem_task_reduced<T, T3> task;

@@ -180,11 +181,11 @@ __global__ __launch_bounds__(64) void collocate_kernel(const kernel_params dev_)

if (distributed__) {
if (task.apply_border_mask) {
compute_window_size(dev_.grid_local_size_, dev_.grid_lower_corner_,
dev_.grid_full_size_, // also full size of the grid
dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
dev_.grid_border_width_, &task.window_size,
&task.window_shift);
compute_window_size(
dev_.grid_local_size_, dev_.grid_lower_corner_,
dev_.grid_full_size_, // also full size of the grid
dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
dev_.grid_border_width_, &task.window_size, &task.window_shift);
}
}
}
@@ -200,10 +201,10 @@ __global__ __launch_bounds__(64) void collocate_kernel(const kernel_params dev_)
// check if the point is within the window
if (task.apply_border_mask) {
// this test is only relevant when the grid is split over several mpi
// ranks. in that case we take only the points contributing to local part
// of the grid.
// ranks. in that case we take only the points contributing to local
// part of the grid.
if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
continue;
continue;
}
}
}
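The border-mask test above only matters when the grid is distributed over several MPI ranks: each rank keeps only the grid points that fall into its locally owned window. A minimal sketch of that per-axis test, with an illustrative helper name that is not part of the CP2K sources:

// Sketch: keep a grid index z2 only if it lies inside the locally owned
// window [window_shift, window_size] (inclusive), as the kernel test implies.
inline bool in_local_window(int z2, int window_shift, int window_size) {
  return (z2 >= window_shift) && (z2 <= window_size);
}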
@@ -274,16 +275,15 @@ __global__ __launch_bounds__(64) void collocate_kernel(const kernel_params dev_)
r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
} else {
r3 =
compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
(y + task.lb_cube.y + task.roffset.y),
(z + task.lb_cube.z + task.roffset.z));
r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
(y + task.lb_cube.y + task.roffset.y),
(z + task.lb_cube.z + task.roffset.z));
}

if (distributed__) {
// check if the point is inside the sphere or not. Note that it does not
// apply for the orthorhombic case when the full sphere is inside the
// region of interest.
// check if the point is inside the sphere or not. Note that it does
// not apply for the orthorhombic case when the full sphere is inside
// the region of interest.

if (((task.radius * task.radius) <=
(r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
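The comment above describes the truncation-sphere test for distributed grids: points whose distance from the Gaussian center reaches the task radius contribute nothing and are skipped, except in the orthorhombic case where the whole cube is known to lie inside the region of interest. A hedged sketch of that check (the helper name is illustrative only):

// Sketch: true when the point at relative position r3 lies on or outside the
// truncation sphere and can therefore be skipped.
__device__ inline bool outside_sphere(const double3 &r3, double radius) {
  return radius * radius <= r3.x * r3.x + r3.y * r3.y + r3.z * r3.z;
}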
@@ -408,32 +408,36 @@ void context_info::collocate_one_grid_level(const int level,

if (func == GRID_FUNC_AB) {
calculate_coefficients<double, true>
<<<number_of_tasks_per_level_[level], threads_per_block, smem_params.smem_per_block(), level_streams[level]>>>(
params);
<<<number_of_tasks_per_level_[level], threads_per_block,
smem_params.smem_per_block(), level_streams[level]>>>(params);
} else {
calculate_coefficients<double, false>
<<<number_of_tasks_per_level_[level], threads_per_block, smem_params.smem_per_block(), level_streams[level]>>>(
params);
<<<number_of_tasks_per_level_[level], threads_per_block,
smem_params.smem_per_block(), level_streams[level]>>>(params);
}

if (grid_[level].is_distributed()) {
if (grid_[level].is_orthorhombic())
collocate_kernel<double, double3, true, true>
<<<number_of_tasks_per_level_[level], threads_per_block, smem_params.cxyz_len() * sizeof(double),
level_streams[level]>>>(params);
<<<number_of_tasks_per_level_[level], threads_per_block,
smem_params.cxyz_len() * sizeof(double), level_streams[level]>>>(
params);
else
collocate_kernel<double, double3, true, false>
<<<number_of_tasks_per_level_[level], threads_per_block, smem_params.cxyz_len() * sizeof(double),
level_streams[level]>>>(params);
<<<number_of_tasks_per_level_[level], threads_per_block,
smem_params.cxyz_len() * sizeof(double), level_streams[level]>>>(
params);
} else {
if (grid_[level].is_orthorhombic())
collocate_kernel<double, double3, false, true>
<<<number_of_tasks_per_level_[level], threads_per_block, smem_params.cxyz_len() * sizeof(double),
level_streams[level]>>>(params);
<<<number_of_tasks_per_level_[level], threads_per_block,
smem_params.cxyz_len() * sizeof(double), level_streams[level]>>>(
params);
else
collocate_kernel<double, double3, false, false>
<<<number_of_tasks_per_level_[level], threads_per_block, smem_params.cxyz_len() * sizeof(double),
level_streams[level]>>>(params);
<<<number_of_tasks_per_level_[level], threads_per_block,
smem_params.cxyz_len() * sizeof(double), level_streams[level]>>>(
params);
}
}
} // namespace rocm_backend
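In collocate_one_grid_level above, the distributed and orthorhombic flags select one of four collocate_kernel instantiations, all launched with the same block count, thread shape, shared-memory size, and stream. A minimal sketch of that dispatch pattern, assuming collocate_kernel and kernel_params from grid_hip_collocate.cc are in scope; launch_collocate and dispatch_collocate are hypothetical helpers, not part of the commit:

// Hypothetical helpers: map the two runtime flags onto the four compile-time
// kernel variants and reuse one launch configuration for all of them.
template <bool distributed, bool orthorhombic>
static void launch_collocate(const kernel_params &params, int blocks,
                             const dim3 &threads, size_t smem,
                             hipStream_t stream) {
  collocate_kernel<double, double3, distributed, orthorhombic>
      <<<blocks, threads, smem, stream>>>(params);
}

static void dispatch_collocate(bool distributed, bool orthorhombic,
                               const kernel_params &params, int blocks,
                               const dim3 &threads, size_t smem,
                               hipStream_t stream) {
  if (distributed) {
    if (orthorhombic)
      launch_collocate<true, true>(params, blocks, threads, smem, stream);
    else
      launch_collocate<true, false>(params, blocks, threads, smem, stream);
  } else {
    if (orthorhombic)
      launch_collocate<false, true>(params, blocks, threads, smem, stream);
    else
      launch_collocate<false, false>(params, blocks, threads, smem, stream);
  }
}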
75 changes: 38 additions & 37 deletions src/grid/hip/grid_hip_context.cc
@@ -188,7 +188,7 @@ extern "C" void grid_hip_create_task_list(

tasks_host[i].block_transposed = (iatom > jatom);
tasks_host[i].subblock_offset =
(tasks_host[i].block_transposed)
(tasks_host[i].block_transposed)
? (tasks_host[i].sgfa * tasks_host[i].nsgfb + tasks_host[i].sgfb)
: (tasks_host[i].sgfb * tasks_host[i].nsgfa + tasks_host[i].sgfa);

@@ -199,13 +199,15 @@
/* this block is only as temporary scratch for calculating the coefficients.
* Doing this avoids a lot of atomic operations that are costly on hardware
* that only have partial support of them. For better performance we should
* most probably align the offsets as well. it is 256 bytes on Mi100 and above */
* most probably align the offsets as well. it is 256 bytes on Mi100 and
* above */
tasks_host[i].lp_max = tasks_host[i].lb_max + tasks_host[i].la_max + 6;
if (i == 0) {
tasks_host[i].coef_offset = 0;
} else {
tasks_host[i].coef_offset = tasks_host[i - 1].coef_offset +
rocm_backend::ncoset(tasks_host[i - 1].lp_max);
tasks_host[i].coef_offset =
tasks_host[i - 1].coef_offset +
rocm_backend::ncoset(tasks_host[i - 1].lp_max);
}
coef_size += rocm_backend::ncoset(tasks_host[i].lp_max);
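The coef_offset assignments above form an exclusive prefix sum over the per-task coefficient counts, and the comment notes that aligning each offset (256 bytes on MI100 and newer) might further help performance. A minimal sketch under those assumptions; align_up is a hypothetical helper, not part of the commit:

// Sketch: exclusive scan of ncoset(lp_max) over the tasks; the commented-out
// line marks where 256-byte alignment of each offset could be inserted.
size_t offset = 0;
for (size_t i = 0; i < tasks_host.size(); i++) {
  tasks_host[i].coef_offset = offset;
  offset += rocm_backend::ncoset(tasks_host[i].lp_max);
  // offset = align_up(offset, 256 / sizeof(double)); // hypothetical alignment
}
const size_t coef_size = offset; // total scratch size in units of double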

@@ -215,44 +217,43 @@
tasks_host[i].apply_border_mask = (tasks_host[i].border_mask != 0);

if (grid.is_orthorhombic() && (tasks_host[i].border_mask == 0)) {
tasks_host[i]
.discrete_radius = rocm_backend::compute_cube_properties<double, double3, true>(
tasks_host[i].radius, grid.dh(), grid.dh_inv(),
(double3 *)tasks_host[i].rp, // center of the gaussian
&tasks_host[i].roffset, // offset compared to the closest grid point
&tasks_host[i].cube_center, // center coordinates in grid space
&tasks_host[i].lb_cube, // lower boundary
&tasks_host[i].cube_size);
tasks_host[i].discrete_radius =
rocm_backend::compute_cube_properties<double, double3, true>(
tasks_host[i].radius, grid.dh(), grid.dh_inv(),
(double3 *)tasks_host[i].rp, // center of the gaussian
&tasks_host[i]
.roffset, // offset compared to the closest grid point
&tasks_host[i].cube_center, // center coordinates in grid space
&tasks_host[i].lb_cube, // lower boundary
&tasks_host[i].cube_size);
} else {
tasks_host[i]
.discrete_radius = rocm_backend::compute_cube_properties<double, double3, false>(
tasks_host[i].radius, grid.dh(), grid.dh_inv(),
(double3 *)tasks_host[i].rp, // center of the gaussian
&tasks_host[i].roffset, // offset compared to the closest grid point
&tasks_host[i].cube_center, // center coordinates in grid space
&tasks_host[i].lb_cube, // lower boundary
&tasks_host[i].cube_size);
tasks_host[i].discrete_radius =
rocm_backend::compute_cube_properties<double, double3, false>(
tasks_host[i].radius, grid.dh(), grid.dh_inv(),
(double3 *)tasks_host[i].rp, // center of the gaussian
&tasks_host[i]
.roffset, // offset compared to the closest grid point
&tasks_host[i].cube_center, // center coordinates in grid space
&tasks_host[i].lb_cube, // lower boundary
&tasks_host[i].cube_size);
}
}

// we need to sort the task list although I expect it to be sorted already
/*
* sorting with this lambda does not work
std::sort(tasks_host.begin(), tasks_host.end(), [](rocm_backend::task_info a, rocm_backend::task_info b) {
if (a.level == b.level) {
if (a.block_num <= b.block_num)
return true;
else
return false;
} else {
return (a.level < b.level);
}
});
*/
/*
* sorting with this lambda does not work
std::sort(tasks_host.begin(), tasks_host.end(), [](rocm_backend::task_info a,
rocm_backend::task_info b) { if (a.level == b.level) { if (a.block_num <=
b.block_num) return true; else return false; } else { return (a.level <
b.level);
}
});
*/
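The commented-out lambda above returns true when two tasks share the same level and block_num, so it is not a strict weak ordering, which is enough to make std::sort misbehave. A hedged alternative, offered only as a sketch, compares (level, block_num) lexicographically via std::tie:

// Sketch: std::tie yields a valid strict weak ordering over (level, block_num).
// Requires <algorithm> and <tuple>.
std::sort(tasks_host.begin(), tasks_host.end(),
          [](const rocm_backend::task_info &a,
             const rocm_backend::task_info &b) {
            return std::tie(a.level, a.block_num) <
                   std::tie(b.level, b.block_num);
          });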
// it is an exclusive scan actually
for (int level = 1; level < ctx->number_of_tasks_per_level_.size(); level++) {
ctx->first_task_per_level_[level] = ctx->first_task_per_level_[level - 1] +
ctx->number_of_tasks_per_level_[level - 1];
ctx->first_task_per_level_[level] =
ctx->first_task_per_level_[level - 1] +
ctx->number_of_tasks_per_level_[level - 1];
}

ctx->tasks_dev.clear();
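As the comment above says, the first_task_per_level_ loop is an exclusive scan; since C++17 the same computation is available as std::exclusive_scan in <numeric>. A minimal sketch, assuming both members are std::vector<int> of equal length:

// Sketch: exclusive prefix sum over the per-level task counts (C++17, <numeric>).
std::exclusive_scan(ctx->number_of_tasks_per_level_.begin(),
                    ctx->number_of_tasks_per_level_.end(),
                    ctx->first_task_per_level_.begin(), 0);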
@@ -323,7 +324,7 @@ std::sort(tasks_host.begin(), tasks_host.end(), [](rocm_backend::task_info a, ro
ctx->num_tasks_per_block_dev_.resize(num_tasks_per_block.size());
ctx->num_tasks_per_block_dev_.copy_to_gpu(num_tasks_per_block);

// collect stats
// collect stats
memset(ctx->stats, 0, 2 * 20 * sizeof(int));
for (int itask = 0; itask < ntasks; itask++) {
const int iatom = iatom_list[itask] - 1;
@@ -440,7 +441,7 @@ extern "C" void grid_hip_integrate_task_list(

rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;

if(ptr == nullptr)
if (ptr == nullptr)
return;
assert(ctx->nlevels == nlevels);

