Horizontal add function for VectorizedArray #15240

Merged: 7 commits, May 25, 2023
Changes from 1 commit
226 changes: 226 additions & 0 deletions include/deal.II/base/vectorization.h
@@ -656,6 +656,16 @@ class VectorizedArray
base_ptr[offsets[0]] = data;
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
DEAL_II_ALWAYS_INLINE
Number
horizontal_add()
Member:

I'm not familiar with the term "horizontal sum" or "horizontal add", and I suspect that others might not be either. Would you mind adding a definition, perhaps as a formula, of what this operation does to the elements of the array?

Member:

The name comes from Intel, see e.g. this one: https://www.felixcloutier.com/x86/phaddw:phaddd
Regarding the name, I agree that this is not a very intuitive name. I do not like the std::accumulate name too much either, because to me accumulate suggests a sum across some iterator range starting from some initial entry, whereas here it is the sum within a vector. I would prefer a more verbose name, sum_within_vector or accumulate_within_vector. Another question, without having checked myself: does SVE (the scalable vector extension of ARM) have a similar operation, and what name do they use? They tend to have good names, so it might be worth checking.

Contributor (author):

As @kronbichler says, the name comes from Intel and their ...hadd... intrinsics. They call what our operator+= implements a vertical addition (each lane of a VectorizedArray does the addition, which results in a new VectorizedArray of the same width), and the summation over the lanes (with the underlying floating-point type as the result) a horizontal addition.

SVE has an instruction called ADDV, a tree-based floating-point addition reduction, which comes close to what happens here.

I could live with the names horizontal_add/horizontal_sum or accumulate, but we can also use a more verbose name here. (And/or a comment.)
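To illustrate the vertical/horizontal distinction described above, here is a plain-C++ sketch; the four-lane array and the function names are illustrative stand-ins, not the deal.II API:

```cpp
#include <array>
#include <cassert>

// Illustrative 4-lane "vector" standing in for a VectorizedArray<double, 4>.
using Lanes = std::array<double, 4>;

// Vertical addition: lane-wise, the result keeps the same width
// (this is what operator+= implements).
Lanes vertical_add(const Lanes &a, const Lanes &b)
{
  Lanes r{};
  for (unsigned int i = 0; i < 4; ++i)
    r[i] = a[i] + b[i];
  return r;
}

// Horizontal addition: reduce across the lanes to a single scalar
// of the underlying floating-point type.
double horizontal_add(const Lanes &a)
{
  double s = 0.;
  for (const double v : a)
    s += v;
  return s;
}
```

So vertical_add({1,2,3,4}, {10,20,30,40}) yields {11,22,33,44}, while horizontal_add({1,2,3,4}) collapses the lanes to the scalar 10.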

Member:

If going by SVE, we would pick addition_reduction, which is also an option. I prefer add_within_vector, but I am fine with horizontal_add and accumulate as well.

Member:

Why not just vec.sum()?

But to be clear, I don't actually care about the name as long as the documentation has a formula that explains what the function does :-)

Contributor (author):

As discussed today, let's go with sum(). That is essentially what happens. Any further suggestions for the documentation? Should we mention that we use a tree-like reduction?

Member:

Tree-like reduction is good to mention, yes.

Member:

I'll defer to @kronbichler in this case. But as a general rule, documentation should state what a function does, not how it does it. The latter is subject to change and is not of interest to the reader of the documentation anyway. The how can be documented in inline comments, but it should generally not be part of the documentation.
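For reference, the tree-like reduction discussed above pairs up lanes and halves the active width in each pass. A portable sketch of that pattern, for a power-of-two lane count (illustrative only, not the intrinsic implementation):

```cpp
#include <cassert>
#include <cstddef>

// Tree-like (pairwise) reduction over n values, n a power of two:
// each pass adds the upper half onto the lower half, halving the
// active width, until only values[0] holds the total.
double tree_sum(double *values, std::size_t n)
{
  for (std::size_t half = n / 2; half > 0; half /= 2)
    for (std::size_t j = 0; j < half; ++j)
      values[j] += values[half + j];
  return values[0];
}
```

For n lanes this needs log2(n) addition passes instead of n-1 sequential additions, and for floating-point data it also tends to give a slightly different (often more accurate) rounding behavior than a left-to-right sum.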

{
return data;
}

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -1216,6 +1226,12 @@ class VectorizedArray<double, 8>
_mm512_i32scatter_pd(base_ptr, index, data, 8);
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
double
horizontal_add();

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -1224,6 +1240,26 @@ class VectorizedArray<double, 8>
__m512d data;

private:
/**
* Extract lower half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m256d
get_low() const
{
return _mm512_castpd512_pd256(data);
}

/**
* Extract higher half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m256d
get_high() const
{
return _mm512_extractf64x4_pd(data, 1);
}

/**
* Return the square root of this field. Not for use in user code. Use
* sqrt(x) instead.
@@ -1789,6 +1825,12 @@ class VectorizedArray<float, 16>
_mm512_i32scatter_ps(base_ptr, index, data, 4);
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
float
horizontal_add();

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -1797,6 +1839,26 @@ class VectorizedArray<float, 16>
__m512 data;

private:
/**
* Extract lower half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m256
get_low() const
{
return _mm512_castps512_ps256(data);
}

/**
* Extract higher half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m256
get_high() const
{
return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(data), 1));
}

/**
* Return the square root of this field. Not for use in user code. Use
* sqrt(x) instead.
@@ -2252,6 +2314,14 @@ class VectorizedArray<double, 4>
: VectorizedArrayBase<VectorizedArray<double, 4>, 4>(list)
{}

/**
* Construct an array from the given data field.
*/
VectorizedArray(__m256d const &x)
{
data = x;
}

/**
* This function can be used to set all data fields to a given scalar.
*/
@@ -2467,6 +2537,12 @@ class VectorizedArray<double, 4>
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
double
horizontal_add();

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -2475,6 +2551,26 @@ class VectorizedArray<double, 4>
__m256d data;

private:
/**
* Extract lower half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m128d
get_low() const
{
return _mm256_castpd256_pd128(data);
}

/**
* Extract higher half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m128d
get_high() const
{
return _mm256_extractf128_pd(data, 1);
}

/**
* Return the square root of this field. Not for use in user code. Use
* sqrt(x) instead.
@@ -2798,6 +2894,14 @@ class VectorizedArray<float, 8>
: VectorizedArrayBase<VectorizedArray<float, 8>, 8>(list)
{}

/**
* Construct an array from the given data field.
*/
VectorizedArray(__m256 const &x)
{
data = x;
}

/**
* This function can be used to set all data fields to a given scalar.
*/
@@ -2999,6 +3103,12 @@ class VectorizedArray<float, 8>
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
float
horizontal_add();

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -3007,6 +3117,26 @@ class VectorizedArray<float, 8>
__m256 data;

private:
/**
* Extract lower half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m128
get_low() const
{
return _mm256_castps256_ps128(data);
}

/**
* Extract higher half of data field.
*/
DEAL_II_ALWAYS_INLINE
__m128
get_high() const
{
return _mm256_extractf128_ps(data, 1);
}

/**
* Return the square root of this field. Not for use in user code. Use
* sqrt(x) instead.
@@ -3364,6 +3494,14 @@ class VectorizedArray<double, 2>
: VectorizedArrayBase<VectorizedArray<double, 2>, 2>(list)
{}

/**
* Construct an array from the given data field.
*/
VectorizedArray(__m128d const &x)
{
data = x;
}

/**
* This function can be used to set all data fields to a given scalar.
*/
@@ -3561,6 +3699,12 @@ class VectorizedArray<double, 2>
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
double
horizontal_add();

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -3849,6 +3993,14 @@ class VectorizedArray<float, 4>
return *this;
}

/**
* Construct an array from the given data field.
*/
VectorizedArray(__m128 const &x)
{
data = x;
}

/**
* Assign a scalar to the current object. This overload is used for
* rvalue references; because it does not make sense to assign
@@ -4017,6 +4169,12 @@ class VectorizedArray<float, 4>
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
* Return the horizontal sum of the data field, i.e., the sum of the
* values in all lanes.
*/
float
horizontal_add();

/**
* Actual data field. To be consistent with the standard layout type and to
* enable interaction with external SIMD functionality, this member is
@@ -4812,6 +4970,74 @@ class VectorizedArray<float, 4>

#endif // DOXYGEN

/**
* Implementations of the horizontal_add() functions.
*/

#if DEAL_II_VECTORIZATION_WIDTH_IN_BITS >= 128 && defined(__SSE2__)
inline double
VectorizedArray<double, 2>::horizontal_add()
{
__m128d t1 = _mm_unpackhi_pd(data, data);
__m128d t2 = _mm_add_pd(data, t1);
return _mm_cvtsd_f64(t2);
}



inline float
VectorizedArray<float, 4>::horizontal_add()
{
__m128 t1 = _mm_movehl_ps(data, data);
__m128 t2 = _mm_add_ps(data, t1);
__m128 t3 = _mm_shuffle_ps(t2, t2, 1);
__m128 t4 = _mm_add_ss(t2, t3);
return _mm_cvtss_f32(t4);
}
#endif
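As a sanity check on the SSE float case above, here is a scalar model of the lane movements performed by the three intrinsics; the model is an illustration of the lane semantics, not the intrinsics themselves:

```cpp
#include <array>
#include <cassert>

using F4 = std::array<float, 4>;

// Scalar model of the SSE sequence in horizontal_add() for four floats:
// _mm_movehl_ps(data, data) copies the upper two lanes into the lower two,
// _mm_add_ps adds lane-wise, _mm_shuffle_ps(t2, t2, 1) moves lane 1 into
// lane 0, and _mm_add_ss adds only lane 0.
float horizontal_add_model(const F4 &a)
{
  const F4    t1 = {a[2], a[3], a[2], a[3]};              // movehl
  const F4    t2 = {a[0] + t1[0], a[1] + t1[1],
                    a[2] + t1[2], a[3] + t1[3]};          // add_ps
  const float t3 = t2[1];                                 // shuffle, lane 1
  return t2[0] + t3;                                      // add_ss, lane 0
}
```

After the first addition, lane 0 holds a0+a2 and lane 1 holds a1+a3; the final scalar addition therefore yields (a0+a2)+(a1+a3), i.e., the tree-shaped sum over all four lanes.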



#if DEAL_II_VECTORIZATION_WIDTH_IN_BITS >= 256 && defined(__AVX__)
inline double
VectorizedArray<double, 4>::horizontal_add()
{
VectorizedArray<double, 2> t1(this->get_low() + this->get_high());
return t1.horizontal_add();
}



inline float
VectorizedArray<float, 8>::horizontal_add()
{
VectorizedArray<float, 4> t1(this->get_low() + this->get_high());
return t1.horizontal_add();
}
#endif



#if DEAL_II_VECTORIZATION_WIDTH_IN_BITS >= 512 && defined(__AVX512F__)
inline double
VectorizedArray<double, 8>::horizontal_add()
{
VectorizedArray<double, 4> t1(this->get_low() + this->get_high());
return t1.horizontal_add();
}



inline float
VectorizedArray<float, 16>::horizontal_add()
{
VectorizedArray<float, 8> t1(this->get_low() + this->get_high());
return t1.horizontal_add();
}
#endif
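The AVX and AVX-512 implementations above all follow the same pattern: split the register into halves with get_low()/get_high(), add the halves vertically, and delegate to the half-width horizontal_add() until the SSE base case produces a scalar. A non-intrinsic sketch of this recursion (names and the std::vector representation are illustrative only):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Recursive halving over a power-of-two number of lanes: add the upper
// half onto the lower half (the vertical get_low() + get_high() step),
// then reduce the half-width result; width 1 is the scalar base case.
double sum_by_halving(const std::vector<double> &lanes)
{
  if (lanes.size() == 1)
    return lanes[0];
  const std::size_t   half = lanes.size() / 2;
  std::vector<double> lower(half);
  for (std::size_t i = 0; i < half; ++i)
    lower[i] = lanes[i] + lanes[half + i]; // get_low() + get_high()
  return sum_by_halving(lower);
}
```

This is why only the 128-bit specializations need hand-written shuffle sequences: every wider width reduces to the next-narrower VectorizedArray with a single vertical addition.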



/**
* @name Arithmetic operations with VectorizedArray
* @{
12 changes: 2 additions & 10 deletions include/deal.II/matrix_free/fe_point_evaluation.h
@@ -2067,11 +2067,7 @@ FEPointEvaluation<n_components_, dim, spacedim, Number>::finish_integrate_fast(
ETT::write_value(vectorized_input,
comp,
solution_renumbered_vectorized[i]);
for (unsigned int lane = n_lanes_internal / 2; lane > 0;
lane /= 2)
for (unsigned int j = 0; j < lane; ++j)
vectorized_input[j] += vectorized_input[lane + j];
input[i] = vectorized_input[0];
input[i] = vectorized_input.horizontal_add();
}

internal::FEFaceNormalEvaluationImpl<dim, -1, ScalarNumber>::
@@ -2092,12 +2088,8 @@ FEPointEvaluation<n_components_, dim, spacedim, Number>::finish_integrate_fast(
{
VectorizedArrayType result;
ETT::write_value(result, comp, solution_renumbered_vectorized[i]);
for (unsigned int lane = n_lanes_internal / 2; lane > 0;
lane /= 2)
for (unsigned int j = 0; j < lane; ++j)
result[j] += result[lane + j];
solution_values[renumber[comp * dofs_per_component + i]] =
result[0];
result.horizontal_add();
}
}
}