SIMD: Allow to switch off vectorized gather/scatter #15875

Merged: 2 commits, Aug 14, 2023
7 changes: 7 additions & 0 deletions cmake/setup_cached_variables.cmake
@@ -46,6 +46,7 @@
# DEAL_II_DEFINITIONS
# DEAL_II_DEFINITIONS_DEBUG
# DEAL_II_DEFINITIONS_RELEASE
# DEAL_II_USE_VECTORIZATION_GATHER
#
# Components and miscellaneous options:
#
@@ -175,6 +176,12 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH "ON" CACHE BOOL
)
mark_as_advanced(CMAKE_INSTALL_RPATH_USE_LINK_PATH)

option(DEAL_II_USE_VECTORIZATION_GATHER
"For the x86 compilation target, the use of SIMD gather/scatter instructions can be much slower than using scalar loads. This includes a wide range of Intel hardware (in particular, server processors of the Broadwell, Skylake, Cascade Lake, and Ice Lake families released between 2015 and 2021). While the default is to aggressively use these instructions, this variable can be used to disable their use if deemed to give better performance."
ON
Review comment (Member): Maybe we can figure out a way to disable automatically for Intel but enable for AMD later.

Review comment (Member): Not sure - this is really tricky. For example, my login node is unaffected but my compute nodes are very much affected.

Review comment (Member): Well, that is not something we can know. In situations like this you have to specify things manually.

)
mark_as_advanced(DEAL_II_USE_VECTORIZATION_GATHER)


########################################################################
# #
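The new option defaults to ON and can be switched off in the usual CMake way at configure time (e.g. by passing -DDEAL_II_USE_VECTORIZATION_GATHER=OFF to cmake). Through the #cmakedefine added to config.h.in below, the choice becomes a preprocessor macro that the intrinsics code checks. A rough compile-time sketch of the effect, not part of this patch:

// Rough sketch (not deal.II code): report which gather/scatter path a build uses.
#include <deal.II/base/config.h>

constexpr bool vectorized_gather_enabled()
{
#ifdef DEAL_II_USE_VECTORIZATION_GATHER
  return true;   // intrinsics-based gather/scatter paths are compiled in
#else
  return false;  // the scalar per-lane fallbacks are used instead
#endif
}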
12 changes: 12 additions & 0 deletions doc/news/changes/minor/20230814Kronbichler
@@ -0,0 +1,12 @@
Improved: deal.II now has a flag DEAL_II_USE_VECTORIZATION_GATHER to control
the use of gather/scatter instructions on the x86 architecture. On a wide
range of Intel hardware with microcode mitigation for the Intel Gather Data
Speculation (GDS, aka Downfall) side channel vulnerability, in particular,
server processors of the Broadwell, Skylake, Cascade Lake, and Ice Lake
families released between 2015 and 2021, these instructions can be much slower
than scalar loads. While the default behavior of deal.II is to aggressively
enable these instructions in the intrinsics-class VectorizedArray, the new
variable can be used to disable their use if deemed to give better
performance.
<br>
(Martin Kronbichler, Matthias Maier, 2023/08/14)
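As a usage note, not part of this patch: the user-facing VectorizedArray interface is unchanged by the flag; only the implementation behind gather() and scatter() switches between hardware instructions and per-lane scalar loads/stores. A minimal caller sketch, assuming offsets holds VectorizedArray<double>::size() distinct indices (the helper name is made up for illustration):

#include <deal.II/base/vectorization.h>

void gather_double_scatter(const double *src, const unsigned int *offsets, double *dst)
{
  dealii::VectorizedArray<double> v;
  v.gather(src, offsets);   // hardware gather or a scalar per-lane loop, depending on the flag
  v = v + v;                // any SIMD arithmetic on the loaded lanes
  v.scatter(offsets, dst);  // hardware scatter or a scalar per-lane loop
}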
1 change: 1 addition & 0 deletions include/deal.II/base/config.h.in
@@ -70,6 +70,7 @@
#cmakedefine DEAL_II_WITH_TRILINOS
#cmakedefine DEAL_II_WITH_UMFPACK
#cmakedefine DEAL_II_FEATURE_UMFPACK_BUNDLED_CONFIGURED
#cmakedefine DEAL_II_USE_VECTORIZATION_GATHER
#cmakedefine DEAL_II_WITH_VTK
#cmakedefine DEAL_II_WITH_ZLIB

24 changes: 22 additions & 2 deletions include/deal.II/base/vectorization.h
@@ -2170,7 +2170,7 @@ class VectorizedArray<double, 4>
void
gather(const double *base_ptr, const unsigned int *offsets)
{
# ifdef __AVX2__
# if defined(__AVX2__) && defined(DEAL_II_USE_VECTORIZATION_GATHER)
// unfortunately, there does not appear to be a 128 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
@@ -2734,7 +2734,7 @@ class VectorizedArray<float, 8>
void
gather(const float *base_ptr, const unsigned int *offsets)
{
# ifdef __AVX2__
# if defined(__AVX2__) && defined(DEAL_II_USE_VECTORIZATION_GATHER)
// unfortunately, there does not appear to be a 256 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
@@ -3350,6 +3350,7 @@ class VectorizedArray<double, 8>
void
gather(const double *base_ptr, const unsigned int *offsets)
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
// unfortunately, there does not appear to be a 256 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
@@ -3364,6 +3365,10 @@
__mmask8 mask = 0xFF;

data = _mm512_mask_i32gather_pd(zero, mask, index, base_ptr, 8);
# else
for (unsigned int i = 0; i < 8; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
# endif
}

/**
@@ -3382,6 +3387,7 @@
void
scatter(const unsigned int *offsets, double *base_ptr) const
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
for (unsigned int i = 0; i < 8; ++i)
for (unsigned int j = i + 1; j < 8; ++j)
Assert(offsets[i] != offsets[j],
@@ -3395,6 +3401,10 @@
_mm256_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m256i index = *reinterpret_cast<const __m256i *>(&index_val);
_mm512_i32scatter_pd(base_ptr, index, data, 8);
# else
for (unsigned int i = 0; i < 8; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
# endif
}

/**
@@ -3955,6 +3965,7 @@ class VectorizedArray<float, 16>
void
gather(const float *base_ptr, const unsigned int *offsets)
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
// unfortunately, there does not appear to be a 512 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
@@ -3969,6 +3980,10 @@
__mmask16 mask = 0xFFFF;

data = _mm512_mask_i32gather_ps(zero, mask, index, base_ptr, 4);
# else
for (unsigned int i = 0; i < 16; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
# endif
}

/**
@@ -3987,6 +4002,7 @@
void
scatter(const unsigned int *offsets, float *base_ptr) const
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
for (unsigned int i = 0; i < 16; ++i)
for (unsigned int j = i + 1; j < 16; ++j)
Assert(offsets[i] != offsets[j],
@@ -4000,6 +4016,10 @@
_mm512_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m512i index = *reinterpret_cast<const __m512i *>(&index_val);
_mm512_i32scatter_ps(base_ptr, index, data, 4);
# else
for (unsigned int i = 0; i < 16; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
# endif
}

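For readers less familiar with the intrinsics involved, the following standalone sketch (not deal.II code; the macro HAVE_HW_GATHER is a stand-in for DEAL_II_USE_VECTORIZATION_GATHER) contrasts the two strategies the new guards select between for a width-8 double gather, analogous to the VectorizedArray<double, 8> code above:

#include <immintrin.h>

void gather8(const double *base_ptr, const unsigned int *offsets, double *out)
{
#if defined(__AVX512F__) && defined(HAVE_HW_GATHER)
  // load eight 32-bit offsets and issue a single hardware gather instruction
  const __m256i index = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(offsets));
  const __m512d gathered = _mm512_i32gather_pd(index, base_ptr, 8);
  _mm512_storeu_pd(out, gathered);
#else
  // scalar fallback: one ordinary load per lane
  for (unsigned int i = 0; i < 8; ++i)
    out[i] = base_ptr[offsets[i]];
#endif
}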