Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MatrixFree: Store indices of Dirichlet constrained DoFs as -1 #14324

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
106 changes: 73 additions & 33 deletions include/deal.II/base/vectorization.h
Original file line number Diff line number Diff line change
Expand Up @@ -1162,8 +1162,9 @@ class VectorizedArray<double, 8>
// start from a zero vector: lanes whose index equals
// numbers::invalid_unsigned_int are masked out below and keep this zero
// instead of being overwritten by the gather; this also works around a
// gcc-12 warning about an uninitialized initial state for gather
__m512d zero = {};
__mmask8 mask = 0xFF;
__m512d zero = {};
const __m256i invalid = _mm256_set1_epi32(numbers::invalid_unsigned_int);
__mmask8 mask = _mm256_cmpneq_epu32_mask(invalid, index);
Comment on lines +1166 to +1167
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for the record, this function is not in AVX512F but in AVX512VL, so we probably want to guard this with an ifdef.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is https://en.wikichip.org/wiki/x86/avx-512 correct in that this is only a problem for KNL?


data = _mm512_mask_i32gather_pd(zero, mask, index, base_ptr, 8);
}
Expand Down Expand Up @@ -1195,8 +1196,10 @@ class VectorizedArray<double, 8>
// API allows aliasing between different vector types.
const __m256 index_val =
_mm256_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m256i index = *reinterpret_cast<const __m256i *>(&index_val);
_mm512_i32scatter_pd(base_ptr, index, data, 8);
const __m256i index = *reinterpret_cast<const __m256i *>(&index_val);
const __m256i invalid = _mm256_set1_epi32(numbers::invalid_unsigned_int);
__mmask8 mask = _mm256_cmpneq_epu32_mask(invalid, index);
_mm512_mask_i32scatter_pd(base_ptr, mask, index, data, 8);
}

/**
Expand Down Expand Up @@ -1727,8 +1730,9 @@ class VectorizedArray<float, 16>
// start from a zero vector: lanes whose index equals
// numbers::invalid_unsigned_int are masked out below and keep this zero
// instead of being overwritten by the gather; this also works around a
// gcc-12 warning about an uninitialized initial state for gather
__m512 zero = {};
__mmask16 mask = 0xFFFF;
__m512 zero = {};
const __m512i invalid = _mm512_set1_epi32(numbers::invalid_unsigned_int);
__mmask16 mask = _mm512_cmpneq_epu32_mask(invalid, index);

data = _mm512_mask_i32gather_ps(zero, mask, index, base_ptr, 4);
}
Expand Down Expand Up @@ -1760,8 +1764,10 @@ class VectorizedArray<float, 16>
// API allows aliasing between different vector types.
const __m512 index_val =
_mm512_loadu_ps(reinterpret_cast<const float *>(offsets));
const __m512i index = *reinterpret_cast<const __m512i *>(&index_val);
_mm512_i32scatter_ps(base_ptr, index, data, 4);
const __m512i index = *reinterpret_cast<const __m512i *>(&index_val);
const __m512i invalid = _mm512_set1_epi32(numbers::invalid_unsigned_int);
__mmask16 mask = _mm512_cmpneq_epu32_mask(invalid, index);
_mm512_mask_i32scatter_ps(base_ptr, mask, index, data, 4);
}

/**
Expand Down Expand Up @@ -2404,9 +2410,14 @@ class VectorizedArray<double, 4>
// for gather by starting with a zero guess; lanes whose index equals
// numbers::invalid_unsigned_int are masked out below and keep the zero
// instead of being overwritten
__m256d zero = _mm256_setzero_pd();
__m256d mask = _mm256_cmp_pd(zero, zero, _CMP_EQ_OQ);

data = _mm256_mask_i32gather_pd(zero, base_ptr, index, mask, 8);
__m256i invalid =
_mm256_set1_epi64x((long long)numbers::invalid_unsigned_int);
__m256i index64 = _mm256_cvtepu32_epi64(index);
__m256i inverse_mask = _mm256_cmpeq_epi64(index64, invalid);
__m256i mask = _mm256_xor_si256(_mm256_set1_epi64x(-1LL), inverse_mask);

data = _mm256_mask_i32gather_pd(
zero, base_ptr, index, *reinterpret_cast<__m256d *>(&mask), 8);
# else
for (unsigned int i = 0; i < 4; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
Expand All @@ -2430,8 +2441,10 @@ class VectorizedArray<double, 4>
scatter(const unsigned int *offsets, double *base_ptr) const
{
// no scatter operation in AVX/AVX2
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int i = 0; i < 4; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned_int)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
Expand Down Expand Up @@ -2652,12 +2665,18 @@ vectorized_transpose_and_store(const bool add_into,
// remainder loop of work that does not divide by 4
if (add_into)
for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
for (unsigned int v = 0; v < 4; ++v)
out[offsets[v] + i] += in[i][v];
{
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int v = 0; v < 4; ++v)
out[offsets[v] + i] += in[i][v];
}
else
for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
for (unsigned int v = 0; v < 4; ++v)
out[offsets[v] + i] = in[i][v];
{
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int v = 0; v < 4; ++v)
out[offsets[v] + i] = in[i][v];
}
}


Expand Down Expand Up @@ -2720,12 +2739,18 @@ vectorized_transpose_and_store(const bool add_into,
// remainder loop of work that does not divide by 4
if (add_into)
for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
for (unsigned int v = 0; v < 4; ++v)
out[v][i] += in[i][v];
{
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int v = 0; v < 4; ++v)
out[v][i] += in[i][v];
}
else
for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
for (unsigned int v = 0; v < 4; ++v)
out[v][i] = in[i][v];
{
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int v = 0; v < 4; ++v)
out[v][i] = in[i][v];
}
}


Expand Down Expand Up @@ -2927,13 +2952,18 @@ class VectorizedArray<float, 8>
// start from a zero vector: lanes whose index equals
// numbers::invalid_unsigned_int are masked out below and keep this zero
// instead of being overwritten by the gather; this also works around a
// gcc-12 warning about an uninitialized initial state for gather
__m256 zero = _mm256_setzero_ps();
__m256 mask = _mm256_cmp_ps(zero, zero, _CMP_EQ_OQ);
__m256 zero = _mm256_setzero_ps();
__m256i invalid = _mm256_set1_epi32(numbers::invalid_unsigned_int);
__m256i inverse_mask = _mm256_cmpeq_epi32(index, invalid);
__m256i mask = _mm256_xor_si256(invalid, inverse_mask);

data = _mm256_mask_i32gather_ps(zero, base_ptr, index, mask, 4);
data = _mm256_mask_i32gather_ps(
zero, base_ptr, index, *reinterpret_cast<__m256 *>(&mask), 4);
# else
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int i = 0; i < 8; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned_int)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
# endif
}

Expand All @@ -2954,8 +2984,10 @@ class VectorizedArray<float, 8>
scatter(const unsigned int *offsets, float *base_ptr) const
{
// no scatter operation in AVX/AVX2
DEAL_II_OPENMP_SIMD_PRAGMA
for (unsigned int i = 0; i < 8; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned_int)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
Expand Down Expand Up @@ -3489,7 +3521,8 @@ class VectorizedArray<double, 2>
gather(const double *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 2; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned_int)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -3509,7 +3542,8 @@ class VectorizedArray<double, 2>
scatter(const unsigned int *offsets, double *base_ptr) const
{
for (unsigned int i = 0; i < 2; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned_int)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
Expand Down Expand Up @@ -3937,7 +3971,8 @@ class VectorizedArray<float, 4>
gather(const float *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 4; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned_int)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -3957,7 +3992,8 @@ class VectorizedArray<float, 4>
scatter(const unsigned int *offsets, float *base_ptr) const
{
for (unsigned int i = 0; i < 4; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned_int)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
Expand Down Expand Up @@ -4395,7 +4431,8 @@ class VectorizedArray<double, 2>
gather(const double *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 2; ++i)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned_int)
*(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -4406,7 +4443,8 @@ class VectorizedArray<double, 2>
scatter(const unsigned int *offsets, double *base_ptr) const
{
for (unsigned int i = 0; i < 2; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned_int)
base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
}

/**
Expand Down Expand Up @@ -4642,7 +4680,8 @@ class VectorizedArray<float, 4>
gather(const float *base_ptr, const unsigned int *offsets)
{
for (unsigned int i = 0; i < 4; ++i)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
if (offsets[i] != numbers::invalid_unsigned_int)
*(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
}

/**
Expand All @@ -4653,7 +4692,8 @@ class VectorizedArray<float, 4>
scatter(const unsigned int *offsets, float *base_ptr) const
{
for (unsigned int i = 0; i < 4; ++i)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
if (offsets[i] != numbers::invalid_unsigned_int)
base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
}

/**
Expand Down
44 changes: 28 additions & 16 deletions include/deal.II/matrix_free/dof_info.templates.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,15 @@ namespace internal
goto no_constraint;
}

// case with zero-constrained DoF: Append invalid number,
// this will be handled by the read functions
if (n_entries == 0)
{
dof_indices.push_back(numbers::invalid_unsigned_int);
constraint_iterator.first++;
continue;
}

// append a new index to the indicators
constraint_indicator.push_back(constraint_iterator);
constraint_indicator.back().second =
Expand Down Expand Up @@ -409,14 +418,15 @@ namespace internal
n_components]
.first;
++it)
{
const unsigned int myindex =
dof_indices[it] / chunk_size_zero_vector;
if (touched_first_by[myindex] ==
numbers::invalid_unsigned_int)
touched_first_by[myindex] = chunk;
touched_last_by[myindex] = chunk;
}
if (dof_indices[it] != numbers::invalid_unsigned_int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote the same comparison yesterday in another project :D

{
const unsigned int myindex =
dof_indices[it] / chunk_size_zero_vector;
if (touched_first_by[myindex] ==
numbers::invalid_unsigned_int)
touched_first_by[myindex] = chunk;
touched_last_by[myindex] = chunk;
}
}
if (faces.size() > 0)
{
Expand All @@ -433,14 +443,16 @@ namespace internal
it !=
row_starts[(cell + 1) * n_components].first;
++it)
{
const unsigned int myindex =
dof_indices[it] / chunk_size_zero_vector;
if (touched_first_by[myindex] ==
numbers::invalid_unsigned_int)
touched_first_by[myindex] = chunk;
touched_last_by[myindex] = chunk;
}
if (dof_indices[it] !=
numbers::invalid_unsigned_int)
{
const unsigned int myindex =
dof_indices[it] / chunk_size_zero_vector;
if (touched_first_by[myindex] ==
numbers::invalid_unsigned_int)
touched_first_by[myindex] = chunk;
touched_last_by[myindex] = chunk;
}
};

fill_touched_by_for_cell(faces[face].cells_interior[v]);
Expand Down