Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce overhead in calls to tensor product value function #15182

Merged
merged 2 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
49 changes: 27 additions & 22 deletions include/deal.II/matrix_free/fe_point_evaluation.h
Original file line number Diff line number Diff line change
Expand Up @@ -1494,6 +1494,7 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::do_reinit()
mapping_info->get_n_q_points_unvectorized(current_cell_index,
current_face_number);

// round up n_points_scalar / n_lanes_user_interface
const_cast<unsigned int &>(n_q_points) =
(n_q_points_scalar + n_lanes_user_interface - 1) / n_lanes_user_interface;

Expand Down Expand Up @@ -1533,15 +1534,13 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::do_reinit()

if (fast_path && !polynomials_are_hat_functions)
{
// round up n_q_points_scalar / n_lanes_internal
const std::size_t n_batches =
n_q_points_scalar / n_lanes_internal +
(n_q_points_scalar % n_lanes_internal > 0 ? 1 : 0);
(n_q_points_scalar + n_lanes_internal - 1) / n_lanes_internal;
Comment on lines 1538 to +1539
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What you're doing here is rounding up n_q_points_scalar/n_lanes_internal. Can you put this into a comment?

const std::size_t n_shapes = poly.size();
shapes.resize_fast(n_batches * n_shapes);
for (unsigned int qb = 0; qb < n_batches; ++qb)
internal::compute_values_of_array(make_array_view(shapes,
qb * n_shapes,
n_shapes),
internal::compute_values_of_array(shapes.data() + qb * n_shapes,
poly,
unit_point_ptr[qb]);
}
Expand All @@ -1561,13 +1560,15 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
if (solution_renumbered.size() != dofs_per_component)
solution_renumbered.resize(dofs_per_component);
for (unsigned int comp = 0; comp < n_components; ++comp)
for (unsigned int i = 0; i < dofs_per_component; ++i)
ETT::read_value(
solution_values[renumber[(component_in_base_element + comp) *
dofs_per_component +
i]],
comp,
solution_renumbered[i]);
{
const unsigned int *renumber_ptr =
renumber.data() +
(component_in_base_element + comp) * dofs_per_component;
for (unsigned int i = 0; i < dofs_per_component; ++i)
ETT::read_value(solution_values[renumber_ptr[i]],
comp,
solution_renumbered[i]);
}

// unit gradients are currently only implemented with the fast tensor
// path
Expand All @@ -1586,15 +1587,15 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
internal::evaluate_tensor_product_value_and_gradient_shapes<
dim,
scalar_value_type,
VectorizedArrayType>(make_array_view(shapes,
qb * n_shapes,
n_shapes),
VectorizedArrayType>(shapes.data() + qb * n_shapes,
n_shapes,
solution_renumbered);

if (evaluation_flags & EvaluationFlags::values)
{
for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride == 1 || q + v < n_q_points_scalar);
++v)
ETT::set_value(val_and_grad.first, v, values[qb * stride + v]);
}
if (evaluation_flags & EvaluationFlags::gradients)
Expand All @@ -1603,7 +1604,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
update_flags & update_inverse_jacobians,
ExcNotInitialized());

for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride == 1 || q + v < n_q_points_scalar);
++v)
{
const unsigned int offset = qb * stride + v;
ETT::set_gradient(val_and_grad.second, v, unit_gradients[offset]);
Expand Down Expand Up @@ -1762,7 +1765,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
ETT::set_zero_value(values[qb], v);
}

for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride == 1 || q + v < n_q_points_scalar);
++v)
ETT::get_value(value, v, values[qb * stride + v]);
}
if (integration_flags & EvaluationFlags::gradients)
Expand All @@ -1778,7 +1783,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
ETT::set_zero_gradient(gradients[qb], v);
}

for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride == 1 || q + v < n_q_points_scalar);
++v)
{
const unsigned int offset = qb * stride + v;
ETT::get_gradient(
Expand All @@ -1801,9 +1808,7 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
internal::integrate_add_tensor_product_value_and_gradient_shapes<
dim,
VectorizedArrayType,
vectorized_value_type>(make_array_view(shapes,
qb * n_shapes,
n_shapes),
vectorized_value_type>(shapes.data() + qb * n_shapes,
n_shapes,
value,
gradient,
Expand Down
62 changes: 34 additions & 28 deletions include/deal.II/matrix_free/tensor_product_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -2989,7 +2989,7 @@ namespace internal
template <int dim, typename Number>
inline void
compute_values_of_array(
ArrayView<dealii::ndarray<Number, 2, dim>> shapes,
dealii::ndarray<Number, 2, dim> * shapes,
const std::vector<Polynomials::Polynomial<double>> &poly,
const Point<dim, Number> & p)
{
Expand All @@ -3000,7 +3000,7 @@ namespace internal
for (unsigned int d = 0; d < dim; ++d)
point[d] = p[d];
for (int i = 0; i < n_shapes; ++i)
poly[i].values_of_array(point, 1, &shapes[i][0]);
poly[i].values_of_array(point, 1, shapes[i].data());
}


Expand All @@ -3009,12 +3009,16 @@ namespace internal
* Interpolate inner dimensions of tensor product shape functions.
*/
template <int dim, int length, typename Number2, typename Number>
inline std::array<typename ProductTypeNoPoint<Number, Number2>::type, 3>
do_interpolate_xy(const std::vector<Number> & values,
const std::vector<unsigned int> & renumber,
const ArrayView<dealii::ndarray<Number2, 2, dim>> &shapes,
const int n_shapes_runtime,
int & i)
inline
#ifndef DEBUG
DEAL_II_ALWAYS_INLINE
#endif
std::array<typename ProductTypeNoPoint<Number, Number2>::type, 3>
do_interpolate_xy(const std::vector<Number> & values,
const std::vector<unsigned int> & renumber,
const dealii::ndarray<Number2, 2, dim> *shapes,
const int n_shapes_runtime,
int & i)
{
const int n_shapes = length > 0 ? length : n_shapes_runtime;
using Number3 = typename ProductTypeNoPoint<Number, Number2>::type;
Expand Down Expand Up @@ -3066,10 +3070,10 @@ namespace internal
typename ProductTypeNoPoint<Number, Number2>::type,
Tensor<1, dim, typename ProductTypeNoPoint<Number, Number2>::type>>
evaluate_tensor_product_value_and_gradient_shapes(
const ArrayView<dealii::ndarray<Number2, 2, dim>> &shapes,
const int n_shapes,
const std::vector<Number> & values,
const std::vector<unsigned int> & renumber = {})
const dealii::ndarray<Number2, 2, dim> *shapes,
const int n_shapes,
const std::vector<Number> & values,
const std::vector<unsigned int> & renumber = {})
{
static_assert(dim >= 1 && dim <= 3, "Only dim=1,2,3 implemented");

Expand Down Expand Up @@ -3253,16 +3257,15 @@ namespace internal
}
else
{
AssertIndexRange(poly.size(), 200);
std::array<dealii::ndarray<Number2, 2, dim>, 200> shapes;

auto view = make_array_view(shapes);

compute_values_of_array(view, poly, p);
compute_values_of_array(shapes.data(), poly, p);

return evaluate_tensor_product_value_and_gradient_shapes<dim,
Number,
Number2>(
view, poly.size(), values, renumber);
shapes.data(), poly.size(), values, renumber);
}
}

Expand Down Expand Up @@ -3469,13 +3472,16 @@ namespace internal
* Test inner dimensions of tensor product shape functions and accumulate.
*/
template <int dim, int length, typename Number2, typename Number>
inline void
do_apply_test_functions_xy(
AlignedVector<Number2> & values,
const ArrayView<dealii::ndarray<Number, 2, dim>> &shapes,
const std::array<Number2, 3> & test_grads_value,
const int n_shapes_runtime,
int & i)
inline
#ifndef DEBUG
DEAL_II_ALWAYS_INLINE
#endif
void
do_apply_test_functions_xy(AlignedVector<Number2> & values,
const dealii::ndarray<Number, 2, dim> *shapes,
const std::array<Number2, 3> &test_grads_value,
const int n_shapes_runtime,
int & i)
{
if (length > 0)
{
Expand Down Expand Up @@ -3537,11 +3543,11 @@ namespace internal
template <int dim, typename Number, typename Number2>
inline void
integrate_add_tensor_product_value_and_gradient_shapes(
const ArrayView<dealii::ndarray<Number, 2, dim>> &shapes,
const int n_shapes,
const Number2 & value,
const Tensor<1, dim, Number2> & gradient,
AlignedVector<Number2> & values)
const dealii::ndarray<Number, 2, dim> *shapes,
const int n_shapes,
const Number2 & value,
const Tensor<1, dim, Number2> & gradient,
AlignedVector<Number2> & values)
{
static_assert(dim >= 1 && dim <= 3, "Only dim=1,2,3 implemented");

Expand Down