Skip to content

Commit

Permalink
Merge pull request #15875 from kronbichler/gather_mitigation
Browse files Browse the repository at this point in the history
  • Loading branch information
masterleinad committed Aug 14, 2023
2 parents 5824d33 + 6d92ba3 commit 53ac485
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 2 deletions.
7 changes: 7 additions & 0 deletions cmake/setup_cached_variables.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
# DEAL_II_DEFINITIONS
# DEAL_II_DEFINITIONS_DEBUG
# DEAL_II_DEFINITIONS_RELEASE
# DEAL_II_USE_VECTORIZATION_GATHER
#
# Components and miscellaneous options:
#
Expand Down Expand Up @@ -175,6 +176,12 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH "ON" CACHE BOOL
)
mark_as_advanced(CMAKE_INSTALL_RPATH_USE_LINK_PATH)

option(DEAL_II_USE_VECTORIZATION_GATHER
"For the x86 compilation target, the use of SIMD gather/scatter instructions can be much slower than using scalar loads. This includes a wide range of Intel hardware (in particular, server processors of the Broadwell, Skylake, Cascade Lake, and Ice Lake families released between 2015 and 2021). While the default is to aggressively use these instructions, this variable can be used to disable their use if deemed to give better performance."
ON
)
mark_as_advanced(DEAL_II_USE_VECTORIZATION_GATHER)


########################################################################
# #
Expand Down
12 changes: 12 additions & 0 deletions doc/news/changes/minor/20230814Kronbichler
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Improved: deal.II now has a flag DEAL_II_USE_VECTORIZATION_GATHER to control
the use of gather/scatter instructions on the x86 architecture. On a wide
range of Intel hardware with microcode mitigation for the Intel Gather Data
Speculation (GDS, aka Downfall) side-channel vulnerability, in particular,
server processors of the Broadwell, Skylake, Cascade Lake, and Ice Lake
families released between 2015 and 2021, these instructions can be much slower
than scalar loads. While the default behavior of deal.II is to aggressively
enable these instructions in the intrinsics-class VectorizedArray, the new
variable can be used to disable their use if deemed to give better
performance.
<br>
(Martin Kronbichler, Matthias Maier, 2023/08/14)
1 change: 1 addition & 0 deletions include/deal.II/base/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
#cmakedefine DEAL_II_WITH_TRILINOS
#cmakedefine DEAL_II_WITH_UMFPACK
#cmakedefine DEAL_II_FEATURE_UMFPACK_BUNDLED_CONFIGURED
#cmakedefine DEAL_II_USE_VECTORIZATION_GATHER
#cmakedefine DEAL_II_WITH_VTK
#cmakedefine DEAL_II_WITH_ZLIB

Expand Down
24 changes: 22 additions & 2 deletions include/deal.II/base/vectorization.h
Original file line number Diff line number Diff line change
Expand Up @@ -2170,7 +2170,7 @@ class VectorizedArray<double, 4>
void
gather(const double *base_ptr, const unsigned int *offsets)
{
# ifdef __AVX2__
# if defined(__AVX2__) && defined(DEAL_II_USE_VECTORIZATION_GATHER)
// unfortunately, there does not appear to be a 128 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
Expand Down Expand Up @@ -2734,7 +2734,7 @@ class VectorizedArray<float, 8>
void
gather(const float *base_ptr, const unsigned int *offsets)
{
# ifdef __AVX2__
# if defined(__AVX2__) && defined(DEAL_II_USE_VECTORIZATION_GATHER)
// unfortunately, there does not appear to be a 256 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
Expand Down Expand Up @@ -3350,6 +3350,7 @@ class VectorizedArray<double, 8>
void
gather(const double *base_ptr, const unsigned int *offsets)
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
// unfortunately, there does not appear to be a 256 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
Expand All @@ -3364,6 +3365,10 @@ class VectorizedArray<double, 8>
__mmask8 mask = 0xFF;

data = _mm512_mask_i32gather_pd(zero, mask, index, base_ptr, 8);
# else
for (unsigned int i = 0; i < 8; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
# endif
}

/**
Expand All @@ -3382,6 +3387,7 @@ class VectorizedArray<double, 8>
void
scatter(const unsigned int *offsets, double *base_ptr) const
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
for (unsigned int i = 0; i < 8; ++i)
for (unsigned int j = i + 1; j < 8; ++j)
Assert(offsets[i] != offsets[j],
Expand All @@ -3395,6 +3401,10 @@ class VectorizedArray<double, 8>
_mm256_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m256i index = *reinterpret_cast<const __m256i *>(&index_val);
_mm512_i32scatter_pd(base_ptr, index, data, 8);
# else
for (unsigned int i = 0; i < 8; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
# endif
}

/**
Expand Down Expand Up @@ -3955,6 +3965,7 @@ class VectorizedArray<float, 16>
void
gather(const float *base_ptr, const unsigned int *offsets)
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
// unfortunately, there does not appear to be a 512 bit integer load, so
// do it by some reinterpret casts here. this is allowed because the Intel
// API allows aliasing between different vector types.
Expand All @@ -3969,6 +3980,10 @@ class VectorizedArray<float, 16>
__mmask16 mask = 0xFFFF;

data = _mm512_mask_i32gather_ps(zero, mask, index, base_ptr, 4);
# else
for (unsigned int i = 0; i < 16; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
# endif
}

/**
Expand All @@ -3987,6 +4002,7 @@ class VectorizedArray<float, 16>
void
scatter(const unsigned int *offsets, float *base_ptr) const
{
# ifdef DEAL_II_USE_VECTORIZATION_GATHER
for (unsigned int i = 0; i < 16; ++i)
for (unsigned int j = i + 1; j < 16; ++j)
Assert(offsets[i] != offsets[j],
Expand All @@ -4000,6 +4016,10 @@ class VectorizedArray<float, 16>
_mm512_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m512i index = *reinterpret_cast<const __m512i *>(&index_val);
_mm512_i32scatter_ps(base_ptr, index, data, 4);
# else
for (unsigned int i = 0; i < 16; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
# endif
}

/**
Expand Down

0 comments on commit 53ac485

Please sign in to comment.