
Merge #2878

2878: Lbgpu node vel r=KaiSzuttor a=fweik

Description of changes:
- Factored out a velocity getter and made both interpolation schemes
  obey the boundary velocity
- Removed some globals
- Leak less


Co-authored-by: Florian Weik <fweik@icp.uni-stuttgart.de>
Co-authored-by: RudolfWeeber <weeber@icp.uni-stuttgart.de>
Co-authored-by: Kai Szuttor <2150555+kaiszuttor@users.noreply.github.com>
Co-authored-by: Kai Szuttor <kai@icp.uni-stuttgart.de>
4 people committed Jul 11, 2019
2 parents a1f1af8 + e416e7e commit a99a8a6d36820eb0ca9a00acec67596e6f486291
Showing with 75 additions and 104 deletions.
  1. +75 −104 src/core/grid_based_algorithms/lbgpu_cuda.cu
@@ -100,16 +100,6 @@ static LB_extern_nodeforcedensity_gpu *extern_node_force_densities = nullptr;
/** @brief Force on the boundary nodes */
static float *lb_boundary_force = nullptr;

/** @brief Velocity at the boundary */
static float *lb_boundary_velocity = nullptr;

/** @name pointers for bound index array */
/*@{*/
static int *boundary_node_list = nullptr;
static int *boundary_index_list = nullptr;
static size_t size_of_boundindex = 0;
/*@}*/

/** @name pointers for additional cuda check flag */
/*@{*/
static int *gpu_check = nullptr;
@@ -1371,9 +1361,27 @@ __device__ __inline__ float three_point_polynomial_larger_than_half(float u) {
(5.f + -3 * fabsf(u) - sqrtf(-2.f + 6.f * fabsf(u) - 3.f * u * u));
}
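For reference, the expression above is the outer branch of the standard three-point (quadratic) interpolation kernel; the leading prefactor (presumably 1/6) sits on the line cut off above this hunk. In its usual form:

\[
\phi(u) =
\begin{cases}
\frac{1}{3}\left(1 + \sqrt{1 - 3u^2}\right), & |u| \le \frac{1}{2},\\
\frac{1}{6}\left(5 - 3|u| - \sqrt{-2 + 6|u| - 3u^2}\right), & \frac{1}{2} < |u| \le \frac{3}{2},\\
0, & \text{otherwise.}
\end{cases}
\]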

/**
 * @brief Get the node velocity at a given index.
 */
__device__ __inline__ float3 node_velocity(float rho_eq, LB_nodes_gpu n_a,
int index) {
auto const boundary_index = n_a.boundary[index];

if (boundary_index) {
auto const &u = n_a.boundary_velocity[index];
return make_float3(u[0], u[1], u[2]);
}

auto const rho = rho_eq + calc_mode_x_from_n(n_a, index, 0);
return make_float3(calc_mode_x_from_n(n_a, index, 1) / rho,
calc_mode_x_from_n(n_a, index, 2) / rho,
calc_mode_x_from_n(n_a, index, 3) / rho);
}
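The new helper returns the prescribed boundary velocity on boundary nodes; on fluid nodes it reconstructs the velocity from the first four modes, since the density is the equilibrium density plus the zeroth mode and modes 1-3 carry the momentum density:

\[
\rho = \rho_{\mathrm{eq}} + m_0, \qquad \mathbf{u} = \frac{1}{\rho}\,(m_1, m_2, m_3),
\]

with \(m_k\) given by calc_mode_x_from_n(n_a, index, k).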

__device__ __inline__ float3
velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
float *lb_boundary_velocity,
Utils::Array<unsigned int, 27> &node_indices,
Utils::Array<float, 27> &delta) {
Utils::Array<int, 3> center_node_index{};
@@ -1431,31 +1439,14 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
auto const z =
fold_if_necessary(center_node_index[2] - 1 + k, para->dim_z);
delta[cnt] = temp_delta[i].x * temp_delta[j].y * temp_delta[k].z;
node_indices[cnt] = xyz_to_index(x, y, z);
auto const boundary_index = n_a.boundary[node_indices[cnt]];
if (not boundary_index) {
float totmass = 0.0f;
auto const mass_mode = calc_mode_x_from_n(n_a, node_indices[cnt], 0);

totmass += mass_mode + para->rho;

auto const j_x = calc_mode_x_from_n(n_a, node_indices[cnt], 1);
auto const j_y = calc_mode_x_from_n(n_a, node_indices[cnt], 2);
auto const j_z = calc_mode_x_from_n(n_a, node_indices[cnt], 3);
interpolated_u.x += (j_x / totmass) * delta[cnt];
interpolated_u.y += (j_y / totmass) * delta[cnt];
interpolated_u.z += (j_z / totmass) * delta[cnt];
} else {
interpolated_u.x +=
lb_boundary_velocity[3 * (boundary_index - 1) + 0] * para->tau /
para->agrid * delta[cnt];
interpolated_u.y +=
lb_boundary_velocity[3 * (boundary_index - 1) + 1] * para->tau /
para->agrid * delta[cnt];
interpolated_u.z +=
lb_boundary_velocity[3 * (boundary_index - 1) + 2] * para->tau /
para->agrid * delta[cnt];
}
auto const index = xyz_to_index(x, y, z);
node_indices[cnt] = index;

auto const node_u = node_velocity(para->rho, n_a, index);
interpolated_u.x += delta[cnt] * node_u.x;
interpolated_u.y += delta[cnt] * node_u.y;
interpolated_u.z += delta[cnt] * node_u.z;

++cnt;
}
}
@@ -1469,12 +1460,12 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
* @param[in] particle_position Particle position
* @param[out] node_index Node index around (8) particle
* @param[out] delta Weighting of particle position
* @param[in] lb_boundary_velocity Velocity at the boundary
* @retval Interpolated velocity
*/
__device__ __inline__ float3 velocity_interpolation(
LB_nodes_gpu n_a, float *particle_position, float *lb_boundary_velocity,
Utils::Array<unsigned int, 8> &node_index, Utils::Array<float, 8> &delta) {
__device__ __inline__ float3
velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
Utils::Array<unsigned int, 8> &node_index,
Utils::Array<float, 8> &delta) {
Utils::Array<int, 3> left_node_index;
Utils::Array<float, 6> temp_delta;
// see ahlrichs + duenweg page 8227 equ (10) and (11)
@@ -1497,9 +1488,9 @@ __device__ __inline__ float3 velocity_interpolation(

// modulo for negative numbers is strange at best, shift to make sure we are
// positive
int x = (left_node_index[0] + para->dim_x) % para->dim_x;
int y = (left_node_index[1] + para->dim_y) % para->dim_y;
int z = (left_node_index[2] + para->dim_z) % para->dim_z;
int const x = (left_node_index[0] + para->dim_x) % para->dim_x;
int const y = (left_node_index[1] + para->dim_y) % para->dim_y;
int const z = (left_node_index[2] + para->dim_z) % para->dim_z;
auto xp1 = x + 1;
auto yp1 = y + 1;
auto zp1 = z + 1;
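The shift before the % is needed because C++ integer modulo truncates toward zero, so a negative index stays negative. A minimal sketch of the pitfall and the fix (dim is a hypothetical lattice extent); it relies on the offset being at most one lattice constant, so the shifted value is non-negative:

#include <cassert>

int main() {
  int const dim = 10;   // hypothetical lattice extent
  int const left = -1;  // node index one cell below the lower edge
  assert(left % dim == -1);        // C++ '%' keeps the sign of the dividend
  assert((left + dim) % dim == 9); // shifting first wraps periodically
}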
@@ -1521,30 +1512,10 @@ __device__ __inline__ float3 velocity_interpolation(
float3 interpolated_u{0.0f, 0.0f, 0.0f};
#pragma unroll
for (int i = 0; i < 8; ++i) {
float totmass = 0.0f;
Utils::Array<float, 19> mode;

calc_m_from_n(n_a, node_index[i], mode);
auto const mass_mode = calc_mode_x_from_n(n_a, node_index[i], 0);

totmass += mass_mode + para->rho;

/* The boolean expression (n_a.boundary[node_index[i]] == 0) causes boundary
nodes to couple with velocity 0 to particles. This is necessary, since
boundary nodes undergo the same LB dynamics as fluid nodes do. The flow
within the boundaries does not interact with the physical fluid, since
these populations are overwritten by the bounce back kernel. Particles
close to walls can couple to this unphysical flow, though.
*/
auto const j_x = calc_mode_x_from_n(n_a, node_index[i], 1);
auto const j_y = calc_mode_x_from_n(n_a, node_index[i], 2);
auto const j_z = calc_mode_x_from_n(n_a, node_index[i], 3);
interpolated_u.x +=
(j_x / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0);
interpolated_u.y +=
(j_y / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0);
interpolated_u.z +=
(j_z / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0);
auto const node_u = node_velocity(para->rho, n_a, node_index[i]);
interpolated_u.x += delta[i] * node_u.x;
interpolated_u.y += delta[i] * node_u.y;
interpolated_u.z += delta[i] * node_u.z;
}
return interpolated_u;
}
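In this eight-point scheme the temp_delta factors are the standard trilinear weights of Ahlrichs and Duenweg: with \(s_d \in [0,1)\) the fractional particle position within its cell along axis \(d\) (in lattice units), the weight of the corner selected by \(e_d \in \{0,1\}\) is

\[
\delta(e_x, e_y, e_z) = \prod_{d \in \{x,y,z\}} \left[ (1 - e_d)(1 - s_d) + e_d\, s_d \right],
\]

and the eight weights sum to one.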
@@ -1562,19 +1533,16 @@ __device__ __inline__ float3 velocity_interpolation(
* @param[in] flag_cs Determine if we are at the centre (0,
* typical) or at the source (1, swimmer only)
* @param[in] friction Friction constant for the particle coupling
* @param[in] lb_boundary_velocity Velocity at the boundary
* @tparam no_of_neighbours The number of neighbours to consider for
* interpolation
*/
template <std::size_t no_of_neighbours>
__device__ void
calc_viscous_force(LB_nodes_gpu n_a,
Utils::Array<float, no_of_neighbours> &delta,
CUDA_particle_data *particle_data, float *particle_force,
unsigned int part_index, float *delta_j,
Utils::Array<unsigned int, no_of_neighbours> &node_index,
LB_rho_v_gpu *d_v, int flag_cs, uint64_t philox_counter,
float friction, float *lb_boundary_velocity) {
__device__ void calc_viscous_force(
LB_nodes_gpu n_a, Utils::Array<float, no_of_neighbours> &delta,
CUDA_particle_data *particle_data, float *particle_force,
unsigned int part_index, float *delta_j,
Utils::Array<unsigned int, no_of_neighbours> &node_index, LB_rho_v_gpu *d_v,
int flag_cs, uint64_t philox_counter, float friction) {
// Zero out workspace
#pragma unroll
for (int jj = 0; jj < 3; ++jj) {
@@ -1614,8 +1582,8 @@ calc_viscous_force(LB_nodes_gpu n_a,
flag_cs * direction * particle_data[part_index].swim.director[2];
#endif

float3 const interpolated_u = velocity_interpolation(
n_a, position, lb_boundary_velocity, node_index, delta);
float3 const interpolated_u =
velocity_interpolation(n_a, position, node_index, delta);

#ifdef ENGINE
velocity[0] -= particle_data[part_index].swim.v_swim *
@@ -2230,15 +2198,15 @@ __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_rho_v_gpu *d_v,
* @param[out] node_f Local node force
* @param[in] d_v Local device values
* @param[in] friction Friction constant for the particle coupling
* @param[in] lb_boundary_velocity Velocity at the boundary
* @tparam no_of_neighbours The number of neighbours to consider for
* interpolation
*/
template <std::size_t no_of_neighbours>
__global__ void calc_fluid_particle_ia(
LB_nodes_gpu n_a, CUDA_particle_data *particle_data, float *particle_force,
LB_node_force_density_gpu node_f, LB_rho_v_gpu *d_v, bool couple_virtual,
uint64_t philox_counter, float friction, float *lb_boundary_velocity) {
__global__ void
calc_fluid_particle_ia(LB_nodes_gpu n_a, CUDA_particle_data *particle_data,
float *particle_force, LB_node_force_density_gpu node_f,
LB_rho_v_gpu *d_v, bool couple_virtual,
uint64_t philox_counter, float friction) {

unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x +
blockDim.x * blockIdx.x + threadIdx.x;
@@ -2254,14 +2222,14 @@ __global__ void calc_fluid_particle_ia(
* force that acts back onto the fluid. */
calc_viscous_force<no_of_neighbours>(
n_a, delta, particle_data, particle_force, part_index, delta_j,
node_index, d_v, 0, philox_counter, friction, lb_boundary_velocity);
node_index, d_v, 0, philox_counter, friction);
calc_node_force<no_of_neighbours>(delta, delta_j, node_index, node_f);

#ifdef ENGINE
if (particle_data[part_index].swim.swimming) {
calc_viscous_force<no_of_neighbours>(
n_a, delta, particle_data, particle_force, part_index, delta_j,
node_index, d_v, 1, philox_counter, friction, lb_boundary_velocity);
node_index, d_v, 1, philox_counter, friction);
calc_node_force<no_of_neighbours>(delta, delta_j, node_index, node_f);
}
#endif
@@ -2540,7 +2508,11 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
if (this_node != 0)
return;

size_of_boundindex = number_of_boundnodes * sizeof(int);
float *boundary_velocity = nullptr;
int *boundary_node_list = nullptr;
int *boundary_index_list = nullptr;

auto const size_of_boundindex = number_of_boundnodes * sizeof(int);
cuda_safe_mem(cudaMalloc((void **)&boundary_node_list, size_of_boundindex));
cuda_safe_mem(cudaMalloc((void **)&boundary_index_list, size_of_boundindex));
cuda_safe_mem(cudaMemcpy(boundary_index_list, host_boundary_index_list,
@@ -2549,10 +2521,10 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
size_of_boundindex, cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMalloc((void **)&lb_boundary_force,
3 * host_n_lb_boundaries * sizeof(float)));
cuda_safe_mem(cudaMalloc((void **)&lb_boundary_velocity,
cuda_safe_mem(cudaMalloc((void **)&boundary_velocity,
3 * host_n_lb_boundaries * sizeof(float)));
cuda_safe_mem(
cudaMemcpy(lb_boundary_velocity, host_lb_boundary_velocity,
cudaMemcpy(boundary_velocity, host_lb_boundary_velocity,
3 * LBBoundaries::lbboundaries.size() * sizeof(float),
cudaMemcpyHostToDevice));

@@ -2585,10 +2557,14 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1);

KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound,
boundary_node_list, boundary_index_list, lb_boundary_velocity,
boundary_node_list, boundary_index_list, boundary_velocity,
number_of_boundnodes, boundaries);
}

cudaFree(boundary_velocity);
cudaFree(boundary_node_list);
cudaFree(boundary_index_list);

cudaDeviceSynchronize();
}
#endif
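Allocating the boundary arrays locally and freeing them at the end is the "leak less" part of this change. A hypothetical alternative sketch (not what the commit does) would tie the lifetime to scope with a cudaFree deleter, so cleanup also happens on early exits:

#include <cuda_runtime.h>
#include <cstddef>
#include <memory>
#include <new>

/* Hypothetical helper, for illustration only. */
struct CudaFreeDeleter {
  void operator()(void *p) const { cudaFree(p); }
};

template <typename T>
using device_buffer = std::unique_ptr<T[], CudaFreeDeleter>;

template <typename T> device_buffer<T> device_alloc(std::size_t n) {
  void *raw = nullptr;
  if (cudaMalloc(&raw, n * sizeof(T)) != cudaSuccess)
    throw std::bad_alloc();
  return device_buffer<T>(static_cast<T *>(raw));
}

/* Usage sketch: the buffer is released automatically when it
 * goes out of scope, e.g.
 *   auto boundary_velocity = device_alloc<float>(3 * host_n_lb_boundaries);
 */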
@@ -2669,15 +2645,13 @@ void lb_calc_particle_lattice_ia_gpu(bool couple_virtual, double friction) {
threads_per_block_particles, *current_nodes,
gpu_get_particle_pointer(), gpu_get_particle_force_pointer(),
node_f, device_rho_v, couple_virtual,
rng_counter_coupling_gpu->value(), friction,
lb_boundary_velocity);
rng_counter_coupling_gpu->value(), friction);
} else {
// We use a dummy value for the RNG counter if no temperature is set.
KERNELCALL(calc_fluid_particle_ia<no_of_neighbours>, dim_grid_particles,
threads_per_block_particles, *current_nodes,
gpu_get_particle_pointer(), gpu_get_particle_force_pointer(),
node_f, device_rho_v, couple_virtual, 0, friction,
lb_boundary_velocity);
node_f, device_rho_v, couple_virtual, 0, friction);
}
}
}
@@ -3052,17 +3026,14 @@ void lb_lbfluid_get_population(const Utils::Vector3i &xyz,
template <std::size_t no_of_neighbours> struct interpolation {
LB_nodes_gpu current_nodes_gpu;
LB_rho_v_gpu *d_v_gpu;
float *lb_boundary_velocity;
interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu,
float *lb_boundary_velocity)
: current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu),
lb_boundary_velocity(lb_boundary_velocity){};
interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu)
: current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu) {}
__device__ float3 operator()(const float3 &position) const {
float _position[3] = {position.x, position.y, position.z};
Utils::Array<unsigned int, no_of_neighbours> node_indices;
Utils::Array<float, no_of_neighbours> delta;
return velocity_interpolation(current_nodes_gpu, _position,
lb_boundary_velocity, node_indices, delta);
return velocity_interpolation(current_nodes_gpu, _position, node_indices,
delta);
}
};

@@ -3078,10 +3049,10 @@ void lb_get_interpolated_velocity_gpu(double const *positions,
}
thrust::device_vector<float3> positions_device = positions_host;
thrust::device_vector<float3> velocities_device(length);
thrust::transform(positions_device.begin(), positions_device.end(),
velocities_device.begin(),
interpolation<no_of_neighbours>(
*current_nodes, device_rho_v, lb_boundary_velocity));
thrust::transform(
positions_device.begin(), positions_device.end(),
velocities_device.begin(),
interpolation<no_of_neighbours>(*current_nodes, device_rho_v));
thrust::host_vector<float3> velocities_host = velocities_device;
int index = 0;
for (auto v : velocities_host) {

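The interpolation functor above follows the usual Thrust device-functor pattern. A self-contained sketch of the same call shape, with a stand-in functor (scale3 is hypothetical, purely illustrative):

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>

/* Stand-in for interpolation<no_of_neighbours>. */
struct scale3 {
  float factor;
  __device__ float3 operator()(float3 const &p) const {
    return make_float3(factor * p.x, factor * p.y, factor * p.z);
  }
};

int main() {
  thrust::host_vector<float3> positions_host(4, make_float3(1.f, 2.f, 3.f));
  thrust::device_vector<float3> positions_device = positions_host;
  thrust::device_vector<float3> velocities_device(positions_device.size());
  // Same call shape as in lb_get_interpolated_velocity_gpu above.
  thrust::transform(positions_device.begin(), positions_device.end(),
                    velocities_device.begin(), scale3{2.f});
  thrust::host_vector<float3> velocities_host = velocities_device;
  return 0;
}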