Skip to content

Commit

Permalink
Add SSE2 version
Browse files (browse the repository at this point in the history)
  • Loading branch information
kronbichler committed Sep 29, 2022
1 parent 001baa4 commit 52b278b
Showing 1 changed file with 24 additions and 16 deletions.
40 changes: 24 additions & 16 deletions include/deal.II/base/vectorization.h
Original file line number Diff line number Diff line change
Expand Up @@ -1148,9 +1148,9 @@ class VectorizedArray<double, 8>
// work around a warning with gcc-12 about an uninitialized initial state
// for gather by starting with a zero guess, even though all lanes will be
// overwritten
__m512d zero = {};
__m512d zero = {};
const __m256i invalid = _mm256_set1_epi32(numbers::invalid_unsigned_int);
__mmask8 mask = _mm256_cmpneq_epu32_mask(invalid, index);
__mmask8 mask = _mm256_cmpneq_epu32_mask(invalid, index);

data = _mm512_mask_i32gather_pd(zero, mask, index, base_ptr, 8);
}
Expand Down Expand Up @@ -1182,9 +1182,9 @@ class VectorizedArray<double, 8>
// API allows aliasing between different vector types.
const __m256 index_val =
_mm256_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m256i index = *reinterpret_cast<const __m256i *>(&index_val);
const __m256i index = *reinterpret_cast<const __m256i *>(&index_val);
const __m256i invalid = _mm256_set1_epi32(numbers::invalid_unsigned_int);
__mmask8 mask = _mm256_cmpneq_epu32_mask(invalid, index);
__mmask8 mask = _mm256_cmpneq_epu32_mask(invalid, index);
_mm512_mask_i32scatter_pd(base_ptr, mask, index, data, 8);
}

Expand Down Expand Up @@ -1716,9 +1716,9 @@ class VectorizedArray<float, 16>
// work around a warning with gcc-12 about an uninitialized initial state
// for gather by starting with a zero guess, even though all lanes will be
// overwritten
__m512 zero = {};
__m512 zero = {};
const __m512i invalid = _mm512_set1_epi32(numbers::invalid_unsigned_int);
__mmask16 mask = _mm512_cmpneq_epu32_mask(invalid, index);
__mmask16 mask = _mm512_cmpneq_epu32_mask(invalid, index);

data = _mm512_mask_i32gather_ps(zero, mask, index, base_ptr, 4);
}
Expand Down Expand Up @@ -1750,9 +1750,9 @@ class VectorizedArray<float, 16>
// API allows aliasing between different vector types.
const __m512 index_val =
_mm512_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m512i index = *reinterpret_cast<const __m512i *>(&index_val);
const __m512i index = *reinterpret_cast<const __m512i *>(&index_val);
const __m512i invalid = _mm512_set1_epi32(numbers::invalid_unsigned_int);
__mmask16 mask = _mm512_cmpneq_epu32_mask(invalid, index);
__mmask16 mask = _mm512_cmpneq_epu32_mask(invalid, index);
_mm512_mask_i32scatter_ps(base_ptr, mask, index, data, 4);
}

Expand Down Expand Up @@ -3475,7 +3475,8 @@ class VectorizedArray<double, 2>
gather(const double *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 2; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -3495,7 +3496,8 @@ class VectorizedArray<double, 2>
scatter(const unsigned int *offsets, double *base_ptr) const
{
for (unsigned int i = 0; i < 2; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
Expand Down Expand Up @@ -3923,7 +3925,8 @@ class VectorizedArray<float, 4>
gather(const float *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 4; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -3943,7 +3946,8 @@ class VectorizedArray<float, 4>
scatter(const unsigned int *offsets, float *base_ptr) const
{
for (unsigned int i = 0; i < 4; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
Expand Down Expand Up @@ -4381,7 +4385,8 @@ class VectorizedArray<double, 2>
gather(const double *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 2; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -4392,7 +4397,8 @@ class VectorizedArray<double, 2>
scatter(const unsigned int *offsets, double *base_ptr) const
{
for (unsigned int i = 0; i < 2; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
Expand Down Expand Up @@ -4628,7 +4634,8 @@ class VectorizedArray<float, 4>
gather(const float *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 4; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -4639,7 +4646,8 @@ class VectorizedArray<float, 4>
scatter(const unsigned int *offsets, float *base_ptr) const
{
for (unsigned int i = 0; i < 4; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
Expand Down

0 comments on commit 52b278b

Please sign in to comment.