Skip to content

Commit

Permalink
Reduce overhead in calls to tensor product value function
Browse files Browse the repository at this point in the history
  • Loading branch information
kronbichler committed May 5, 2023
1 parent d62c94c commit 8cfb2f2
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 42 deletions.
43 changes: 24 additions & 19 deletions include/deal.II/matrix_free/fe_point_evaluation.h
Original file line number Diff line number Diff line change
Expand Up @@ -1534,14 +1534,11 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::do_reinit()
if (fast_path && !polynomials_are_hat_functions)
{
const std::size_t n_batches =
n_q_points_scalar / n_lanes_internal +
(n_q_points_scalar % n_lanes_internal > 0 ? 1 : 0);
(n_q_points_scalar + n_lanes_internal - 1) / n_lanes_internal;
const std::size_t n_shapes = poly.size();
shapes.resize_fast(n_batches * n_shapes);
for (unsigned int qb = 0; qb < n_batches; ++qb)
internal::compute_values_of_array(make_array_view(shapes,
qb * n_shapes,
n_shapes),
internal::compute_values_of_array(shapes.data() + qb * n_shapes,
poly,
unit_point_ptr[qb]);
}
Expand All @@ -1561,13 +1558,15 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
if (solution_renumbered.size() != dofs_per_component)
solution_renumbered.resize(dofs_per_component);
for (unsigned int comp = 0; comp < n_components; ++comp)
for (unsigned int i = 0; i < dofs_per_component; ++i)
ETT::read_value(
solution_values[renumber[(component_in_base_element + comp) *
dofs_per_component +
i]],
comp,
solution_renumbered[i]);
{
const unsigned int *renumber_ptr =
renumber.data() +
(component_in_base_element + comp) * dofs_per_component;
for (unsigned int i = 0; i < dofs_per_component; ++i)
ETT::read_value(solution_values[renumber_ptr[i]],
comp,
solution_renumbered[i]);
}

// unit gradients are currently only implemented with the fast tensor
// path
Expand All @@ -1586,15 +1585,15 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
internal::evaluate_tensor_product_value_and_gradient_shapes<
dim,
scalar_value_type,
VectorizedArrayType>(make_array_view(shapes,
qb * n_shapes,
n_shapes),
VectorizedArrayType>(shapes.data() + qb * n_shapes,
n_shapes,
solution_renumbered);

if (evaluation_flags & EvaluationFlags::values)
{
for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
++v)
ETT::set_value(val_and_grad.first, v, values[qb * stride + v]);
}
if (evaluation_flags & EvaluationFlags::gradients)
Expand All @@ -1603,7 +1602,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
update_flags & update_inverse_jacobians,
ExcNotInitialized());

for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
++v)
{
const unsigned int offset = qb * stride + v;
ETT::set_gradient(val_and_grad.second, v, unit_gradients[offset]);
Expand Down Expand Up @@ -1762,7 +1763,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
ETT::set_zero_value(values[qb], v);
}

for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
++v)
ETT::get_value(value, v, values[qb * stride + v]);
}
if (integration_flags & EvaluationFlags::gradients)
Expand All @@ -1778,7 +1781,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
ETT::set_zero_gradient(gradients[qb], v);
}

for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
for (unsigned int v = 0;
v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
++v)
{
const unsigned int offset = qb * stride + v;
ETT::get_gradient(
Expand Down
53 changes: 30 additions & 23 deletions include/deal.II/matrix_free/tensor_product_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -2989,7 +2989,7 @@ namespace internal
template <int dim, typename Number>
inline void
compute_values_of_array(
ArrayView<dealii::ndarray<Number, 2, dim>> shapes,
dealii::ndarray<Number, 2, dim> * shapes,
const std::vector<Polynomials::Polynomial<double>> &poly,
const Point<dim, Number> & p)
{
Expand All @@ -3000,7 +3000,7 @@ namespace internal
for (unsigned int d = 0; d < dim; ++d)
point[d] = p[d];
for (int i = 0; i < n_shapes; ++i)
poly[i].values_of_array(point, 1, &shapes[i][0]);
poly[i].values_of_array(point, 1, shapes[i].data());
}


Expand All @@ -3009,12 +3009,16 @@ namespace internal
* Interpolate inner dimensions of tensor product shape functions.
*/
template <int dim, int length, typename Number2, typename Number>
inline std::array<typename ProductTypeNoPoint<Number, Number2>::type, 3>
do_interpolate_xy(const std::vector<Number> & values,
const std::vector<unsigned int> & renumber,
const ArrayView<dealii::ndarray<Number2, 2, dim>> &shapes,
const int n_shapes_runtime,
int & i)
inline
#ifndef DEBUG
DEAL_II_ALWAYS_INLINE
#endif
std::array<typename ProductTypeNoPoint<Number, Number2>::type, 3>
do_interpolate_xy(const std::vector<Number> & values,
const std::vector<unsigned int> & renumber,
const dealii::ndarray<Number2, 2, dim> *shapes,
const int n_shapes_runtime,
int & i)
{
const int n_shapes = length > 0 ? length : n_shapes_runtime;
using Number3 = typename ProductTypeNoPoint<Number, Number2>::type;
Expand Down Expand Up @@ -3066,10 +3070,10 @@ namespace internal
typename ProductTypeNoPoint<Number, Number2>::type,
Tensor<1, dim, typename ProductTypeNoPoint<Number, Number2>::type>>
evaluate_tensor_product_value_and_gradient_shapes(
const ArrayView<dealii::ndarray<Number2, 2, dim>> &shapes,
const int n_shapes,
const std::vector<Number> & values,
const std::vector<unsigned int> & renumber = {})
const dealii::ndarray<Number2, 2, dim> *shapes,
const int n_shapes,
const std::vector<Number> & values,
const std::vector<unsigned int> & renumber = {})
{
static_assert(dim >= 1 && dim <= 3, "Only dim=1,2,3 implemented");

Expand Down Expand Up @@ -3253,16 +3257,15 @@ namespace internal
}
else
{
AssertIndexRange(poly.size(), 200);
std::array<dealii::ndarray<Number2, 2, dim>, 200> shapes;

auto view = make_array_view(shapes);

compute_values_of_array(view, poly, p);
compute_values_of_array(shapes.data(), poly, p);

return evaluate_tensor_product_value_and_gradient_shapes<dim,
Number,
Number2>(
view, poly.size(), values, renumber);
shapes.data(), poly.size(), values, renumber);
}
}

Expand Down Expand Up @@ -3469,13 +3472,17 @@ namespace internal
* Test inner dimensions of tensor product shape functions and accumulate.
*/
template <int dim, int length, typename Number2, typename Number>
inline void
do_apply_test_functions_xy(
AlignedVector<Number2> & values,
const ArrayView<dealii::ndarray<Number, 2, dim>> &shapes,
const std::array<Number2, 3> & test_grads_value,
const int n_shapes_runtime,
int & i)
inline
#ifndef DEBUG
DEAL_II_ALWAYS_INLINE
#endif
void
do_apply_test_functions_xy(
AlignedVector<Number2> & values,
const ArrayView<dealii::ndarray<Number, 2, dim>> &shapes,
const std::array<Number2, 3> & test_grads_value,
const int n_shapes_runtime,
int & i)
{
if (length > 0)
{
Expand Down

0 comments on commit 8cfb2f2

Please sign in to comment.