Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copy scalars in vector operations to make compiler optimize more #14253

Merged
merged 2 commits into from
Sep 12, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
129 changes: 81 additions & 48 deletions include/deal.II/lac/vector_operations_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,12 +277,16 @@ namespace internal
{
Vectorization_multiply_factor(Number *const val, const Number factor)
: val(val)
, factor(factor)
, stored_factor(factor)
{}

void
operator()(const size_type begin, const size_type end) const
{
// create a local copy of the variable to help the compiler with the
// aliasing analysis
const Number factor = stored_factor;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -297,7 +301,7 @@ namespace internal
}

Number *const val;
const Number factor;
const Number stored_factor;
};

template <typename Number>
Expand All @@ -308,12 +312,15 @@ namespace internal
const Number factor)
: val(val)
, v_val(v_val)
, factor(factor)
, stored_factor(factor)
{}

void
operator()(const size_type begin, const size_type end) const
{
// create a local copy of the variable to help the compiler with the
// aliasing analysis
const Number factor = stored_factor;
if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -329,7 +336,7 @@ namespace internal

Number *const val;
const Number *const v_val;
const Number factor;
const Number stored_factor;
};

template <typename Number>
Expand All @@ -341,13 +348,17 @@ namespace internal
const Number x)
: val(val)
, v_val(v_val)
, a(a)
, x(x)
, stored_a(a)
, stored_x(x)
{}

void
operator()(const size_type begin, const size_type end) const
{
// create a local copy of the variable to help the compiler with the
// aliasing analysis
const Number x = stored_x, a = stored_a;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -363,8 +374,8 @@ namespace internal

Number *const val;
const Number *const v_val;
const Number a;
const Number x;
const Number stored_a;
const Number stored_x;
};

template <typename Number>
Expand Down Expand Up @@ -400,12 +411,14 @@ namespace internal
{
Vectorization_add_factor(Number *const val, const Number factor)
: val(val)
, factor(factor)
, stored_factor(factor)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number factor = stored_factor;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -420,7 +433,7 @@ namespace internal
}

Number *const val;
const Number factor;
const Number stored_factor;
};

template <typename Number>
Expand Down Expand Up @@ -462,13 +475,15 @@ namespace internal
: val(val)
, v_val(v_val)
, w_val(w_val)
, a(a)
, b(b)
, stored_a(a)
, stored_b(b)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number a = stored_a, b = stored_b;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -485,8 +500,8 @@ namespace internal
Number *const val;
const Number *const v_val;
const Number *const w_val;
const Number a;
const Number b;
const Number stored_a;
const Number stored_b;
};

template <typename Number>
Expand All @@ -497,12 +512,14 @@ namespace internal
const Number x)
: val(val)
, v_val(v_val)
, x(x)
, stored_x(x)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number x = stored_x;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -518,7 +535,7 @@ namespace internal

Number *const val;
const Number *const v_val;
const Number x;
const Number stored_x;
};

template <typename Number>
Expand All @@ -533,14 +550,16 @@ namespace internal
: val(val)
, v_val(v_val)
, w_val(w_val)
, x(x)
, a(a)
, b(b)
, stored_x(x)
, stored_a(a)
, stored_b(b)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number x = stored_x, a = stored_a, b = stored_b;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -557,9 +576,9 @@ namespace internal
Number *const val;
const Number *const v_val;
const Number *const w_val;
const Number x;
const Number a;
const Number b;
const Number stored_x;
const Number stored_a;
const Number stored_b;
};

template <typename Number>
Expand Down Expand Up @@ -598,12 +617,14 @@ namespace internal
const Number a)
: val(val)
, u_val(u_val)
, a(a)
, stored_a(a)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number a = stored_a;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -619,7 +640,7 @@ namespace internal

Number *const val;
const Number *const u_val;
const Number a;
const Number stored_a;
};

template <typename Number>
Expand All @@ -633,13 +654,15 @@ namespace internal
: val(val)
, u_val(u_val)
, v_val(v_val)
, a(a)
, b(b)
, stored_a(a)
, stored_b(b)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number a = stored_a, b = stored_b;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -656,8 +679,8 @@ namespace internal
Number *const val;
const Number *const u_val;
const Number *const v_val;
const Number a;
const Number b;
const Number stored_a;
const Number stored_b;
};

template <typename Number>
Expand All @@ -674,14 +697,16 @@ namespace internal
, u_val(u_val)
, v_val(v_val)
, w_val(w_val)
, a(a)
, b(b)
, c(c)
, stored_a(a)
, stored_b(b)
, stored_c(c)
{}

void
operator()(const size_type begin, const size_type end) const
{
const Number a = stored_a, b = stored_b, c = stored_c;

if (::dealii::parallel::internal::EnableOpenMPSimdFor<Number>::value)
{
DEAL_II_OPENMP_SIMD_PRAGMA
Expand All @@ -699,9 +724,9 @@ namespace internal
const Number *const u_val;
const Number *const v_val;
const Number *const w_val;
const Number a;
const Number b;
const Number c;
const Number stored_a;
const Number stored_b;
const Number stored_c;
};

template <typename Number>
Expand Down Expand Up @@ -1122,11 +1147,15 @@ namespace internal
// this is the inner working routine for the accumulation loops
// below. This is the standard case where the loop bounds are known. We
// pulled this function out of the regular accumulate routine because we
// might do this thing vectorized (see specialized function below)
// might do this thing vectorized (see specialized function below). In
// contrast to the vector add functions above, we pass the functor 'op'
// by value here: we cannot create a local copy of the scalars inline, so
// we instead copy the whole functor to make sure that the numbers are
// local (and thus definitely not aliased) from the compiler's perspective
template <typename Operation, typename ResultType>
void
accumulate_regular(
const Operation &op,
const Operation op,
const size_type &n_chunks,
size_type & index,
ResultType (&outer_results)[vector_accumulation_recursion_threshold],
Expand Down Expand Up @@ -1160,13 +1189,14 @@ namespace internal
// below. This is the specialized case where the loop bounds are known and
// where we can vectorize. In that case, we request the 'do_vectorized'
// routine of the operation instead of the regular one which does several
// operations at once.
// operations at once. As above, pass in the functor by value to create a
// local copy of the variables in the function (if there are any).
template <typename Operation, typename Number>
void
accumulate_regular(
const Operation &op,
size_type & n_chunks,
size_type & index,
const Operation op,
size_type & n_chunks,
size_type & index,
Number (&outer_results)[vector_accumulation_recursion_threshold],
std::integral_constant<bool, true>)
{
Expand Down Expand Up @@ -1350,14 +1380,17 @@ namespace internal
* parallel.
*/
template <typename Operation, typename ResultType>
void
parallel_reduce(
const Operation &op,
const size_type start,
const size_type end,
ResultType & result,
const std::shared_ptr<::dealii::parallel::internal::TBBPartitioner>
&partitioner)
#ifndef DEBUG
DEAL_II_ALWAYS_INLINE
#endif
void
parallel_reduce(
const Operation &op,
const size_type start,
const size_type end,
ResultType & result,
const std::shared_ptr<::dealii::parallel::internal::TBBPartitioner>
&partitioner)
{
#ifdef DEAL_II_WITH_TBB
const size_type vec_size = end - start;
Expand Down