Reduce overhead in calls to tensor product value function

dealii · May 5, 2023 · 8cfb2f2 · 8cfb2f2
1 parent d62c94c
commit 8cfb2f2
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 42 deletions.
diff --git a/include/deal.II/matrix_free/fe_point_evaluation.h b/include/deal.II/matrix_free/fe_point_evaluation.h
@@ -1534,14 +1534,11 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::do_reinit()
   if (fast_path && !polynomials_are_hat_functions)
     {
       const std::size_t n_batches =
-        n_q_points_scalar / n_lanes_internal +
-        (n_q_points_scalar % n_lanes_internal > 0 ? 1 : 0);
+        (n_q_points_scalar + n_lanes_internal - 1) / n_lanes_internal;
       const std::size_t n_shapes = poly.size();
       shapes.resize_fast(n_batches * n_shapes);
       for (unsigned int qb = 0; qb < n_batches; ++qb)
-        internal::compute_values_of_array(make_array_view(shapes,
-                                                          qb * n_shapes,
-                                                          n_shapes),
+        internal::compute_values_of_array(shapes.data() + qb * n_shapes,
                                           poly,
                                           unit_point_ptr[qb]);
     }
@@ -1561,13 +1558,15 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
   if (solution_renumbered.size() != dofs_per_component)
     solution_renumbered.resize(dofs_per_component);
   for (unsigned int comp = 0; comp < n_components; ++comp)
-    for (unsigned int i = 0; i < dofs_per_component; ++i)
-      ETT::read_value(
-        solution_values[renumber[(component_in_base_element + comp) *
-                                   dofs_per_component +
-                                 i]],
-        comp,
-        solution_renumbered[i]);
+    {
+      const unsigned int *renumber_ptr =
+        renumber.data() +
+        (component_in_base_element + comp) * dofs_per_component;
+      for (unsigned int i = 0; i < dofs_per_component; ++i)
+        ETT::read_value(solution_values[renumber_ptr[i]],
+                        comp,
+                        solution_renumbered[i]);
+    }
 
   // unit gradients are currently only implemented with the fast tensor
   // path
@@ -1586,15 +1585,15 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
           internal::evaluate_tensor_product_value_and_gradient_shapes<
             dim,
             scalar_value_type,
-            VectorizedArrayType>(make_array_view(shapes,
-                                                 qb * n_shapes,
-                                                 n_shapes),
+            VectorizedArrayType>(shapes.data() + qb * n_shapes,
                                  n_shapes,
                                  solution_renumbered);
 
       if (evaluation_flags & EvaluationFlags::values)
         {
-          for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
+          for (unsigned int v = 0;
+               v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
+               ++v)
             ETT::set_value(val_and_grad.first, v, values[qb * stride + v]);
         }
       if (evaluation_flags & EvaluationFlags::gradients)
@@ -1603,7 +1602,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::evaluate_fast(
                    update_flags & update_inverse_jacobians,
                  ExcNotInitialized());
 
-          for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
+          for (unsigned int v = 0;
+               v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
+               ++v)
             {
               const unsigned int offset = qb * stride + v;
               ETT::set_gradient(val_and_grad.second, v, unit_gradients[offset]);
@@ -1762,7 +1763,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
                 ETT::set_zero_value(values[qb], v);
             }
 
-          for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
+          for (unsigned int v = 0;
+               v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
+               ++v)
             ETT::get_value(value, v, values[qb * stride + v]);
         }
       if (integration_flags & EvaluationFlags::gradients)
@@ -1778,7 +1781,9 @@ FEPointEvaluation<n_components, dim, spacedim, Number>::integrate_fast(
                 ETT::set_zero_gradient(gradients[qb], v);
             }
 
-          for (unsigned int v = 0; v < stride && q + v < n_q_points_scalar; ++v)
+          for (unsigned int v = 0;
+               v < stride && (stride > 1 ? q + v < n_q_points_scalar : true);
+               ++v)
             {
               const unsigned int offset = qb * stride + v;
               ETT::get_gradient(

diff --git a/include/deal.II/matrix_free/tensor_product_kernels.h b/include/deal.II/matrix_free/tensor_product_kernels.h
@@ -2989,7 +2989,7 @@ namespace internal
   template <int dim, typename Number>
   inline void
   compute_values_of_array(
-    ArrayView<dealii::ndarray<Number, 2, dim>>          shapes,
+    dealii::ndarray<Number, 2, dim> *                   shapes,
     const std::vector<Polynomials::Polynomial<double>> &poly,
     const Point<dim, Number> &                          p)
   {
@@ -3000,7 +3000,7 @@ namespace internal
     for (unsigned int d = 0; d < dim; ++d)
       point[d] = p[d];
     for (int i = 0; i < n_shapes; ++i)
-      poly[i].values_of_array(point, 1, &shapes[i][0]);
+      poly[i].values_of_array(point, 1, shapes[i].data());
   }
 
 
@@ -3009,12 +3009,16 @@ namespace internal
    * Interpolate inner dimensions of tensor product shape functions.
    */
   template <int dim, int length, typename Number2, typename Number>
-  inline std::array<typename ProductTypeNoPoint<Number, Number2>::type, 3>
-  do_interpolate_xy(const std::vector<Number> &                        values,
-                    const std::vector<unsigned int> &                  renumber,
-                    const ArrayView<dealii::ndarray<Number2, 2, dim>> &shapes,
-                    const int n_shapes_runtime,
-                    int &     i)
+  inline
+#ifndef DEBUG
+    DEAL_II_ALWAYS_INLINE
+#endif
+      std::array<typename ProductTypeNoPoint<Number, Number2>::type, 3>
+      do_interpolate_xy(const std::vector<Number> &             values,
+                        const std::vector<unsigned int> &       renumber,
+                        const dealii::ndarray<Number2, 2, dim> *shapes,
+                        const int n_shapes_runtime,
+                        int &     i)
   {
     const int n_shapes = length > 0 ? length : n_shapes_runtime;
     using Number3      = typename ProductTypeNoPoint<Number, Number2>::type;
@@ -3066,10 +3070,10 @@ namespace internal
     typename ProductTypeNoPoint<Number, Number2>::type,
     Tensor<1, dim, typename ProductTypeNoPoint<Number, Number2>::type>>
   evaluate_tensor_product_value_and_gradient_shapes(
-    const ArrayView<dealii::ndarray<Number2, 2, dim>> &shapes,
-    const int                                          n_shapes,
-    const std::vector<Number> &                        values,
-    const std::vector<unsigned int> &                  renumber = {})
+    const dealii::ndarray<Number2, 2, dim> *shapes,
+    const int                               n_shapes,
+    const std::vector<Number> &             values,
+    const std::vector<unsigned int> &       renumber = {})
   {
     static_assert(dim >= 1 && dim <= 3, "Only dim=1,2,3 implemented");
 
@@ -3253,16 +3257,15 @@ namespace internal
       }
     else
       {
+        AssertIndexRange(poly.size(), 200);
         std::array<dealii::ndarray<Number2, 2, dim>, 200> shapes;
 
-        auto view = make_array_view(shapes);
-
-        compute_values_of_array(view, poly, p);
+        compute_values_of_array(shapes.data(), poly, p);
 
         return evaluate_tensor_product_value_and_gradient_shapes<dim,
                                                                  Number,
                                                                  Number2>(
-          view, poly.size(), values, renumber);
+          shapes.data(), poly.size(), values, renumber);
       }
   }
 
@@ -3469,13 +3472,17 @@ namespace internal
    * Test inner dimensions of tensor product shape functions and accumulate.
    */
   template <int dim, int length, typename Number2, typename Number>
-  inline void
-  do_apply_test_functions_xy(
-    AlignedVector<Number2> &                          values,
-    const ArrayView<dealii::ndarray<Number, 2, dim>> &shapes,
-    const std::array<Number2, 3> &                    test_grads_value,
-    const int                                         n_shapes_runtime,
-    int &                                             i)
+  inline
+#ifndef DEBUG
+    DEAL_II_ALWAYS_INLINE
+#endif
+    void
+    do_apply_test_functions_xy(
+      AlignedVector<Number2> &                          values,
+      const ArrayView<dealii::ndarray<Number, 2, dim>> &shapes,
+      const std::array<Number2, 3> &                    test_grads_value,
+      const int                                         n_shapes_runtime,
+      int &                                             i)
   {
     if (length > 0)
       {