Skip to content

Commit

Permalink
Use sys_byteorder.h functions in big_endian.h
Browse files Browse the repository at this point in the history
The compiler does not consistently manage to compile the existing code
down to use the architecture's byteswap instruction. By using the
functions from sys_byteorder.h instead we get consistently small and
performant code.

Also add a microbenchmark to validate the performance.

On Android this results in a 2-5 times speedup.

Before this change (arm32, measured on Pixel5):

BM_WriteBigEndianAligned<int16_t>      4.59 ns
BM_WriteBigEndianAligned<uint16_t>     4.17 ns
BM_WriteBigEndianAligned<int32_t>      5.49 ns
BM_WriteBigEndianAligned<uint32_t>     5.49 ns
BM_WriteBigEndianAligned<int64_t>      15.5 ns
BM_WriteBigEndianAligned<uint64_t>     15.5 ns
BM_WriteBigEndianMisaligned<int16_t>   4.59 ns
BM_WriteBigEndianMisaligned<uint16_t>  4.17 ns
BM_WriteBigEndianMisaligned<int32_t>   5.49 ns
BM_WriteBigEndianMisaligned<uint32_t>  5.49 ns
BM_WriteBigEndianMisaligned<int64_t>   16.0 ns
BM_WriteBigEndianMisaligned<uint64_t>  16.0 ns
BM_ReadBigEndianAligned<int16_t>       4.59 ns
BM_ReadBigEndianAligned<uint16_t>      4.59 ns
BM_ReadBigEndianAligned<int32_t>       5.84 ns
BM_ReadBigEndianAligned<uint32_t>      5.84 ns
BM_ReadBigEndianAligned<int64_t>       13.4 ns
BM_ReadBigEndianAligned<uint64_t>      13.4 ns
BM_ReadBigEndianMisaligned<int16_t>    4.59 ns
BM_ReadBigEndianMisaligned<uint16_t>   4.59 ns
BM_ReadBigEndianMisaligned<int32_t>    5.84 ns
BM_ReadBigEndianMisaligned<uint32_t>   5.84 ns
BM_ReadBigEndianMisaligned<int64_t>    13.4 ns
BM_ReadBigEndianMisaligned<uint64_t>   13.4 ns

After this change (arm32, measured on Pixel5):

BM_WriteBigEndianAligned<int16_t>      2.31 ns
BM_WriteBigEndianAligned<uint16_t>     1.98 ns
BM_WriteBigEndianAligned<int32_t>      1.98 ns
BM_WriteBigEndianAligned<uint32_t>     1.98 ns
BM_WriteBigEndianAligned<int64_t>      2.78 ns
BM_WriteBigEndianAligned<uint64_t>     2.80 ns
BM_WriteBigEndianMisaligned<int16_t>   2.30 ns
BM_WriteBigEndianMisaligned<uint16_t>  1.98 ns
BM_WriteBigEndianMisaligned<int32_t>   1.98 ns
BM_WriteBigEndianMisaligned<uint32_t>  1.98 ns
BM_WriteBigEndianMisaligned<int64_t>   2.95 ns
BM_WriteBigEndianMisaligned<uint64_t>  2.95 ns
BM_ReadBigEndianAligned<int16_t>       1.85 ns
BM_ReadBigEndianAligned<uint16_t>      1.85 ns
BM_ReadBigEndianAligned<int32_t>       1.60 ns
BM_ReadBigEndianAligned<uint32_t>      1.59 ns
BM_ReadBigEndianAligned<int64_t>       2.33 ns
BM_ReadBigEndianAligned<uint64_t>      2.34 ns
BM_ReadBigEndianMisaligned<int16_t>    1.88 ns
BM_ReadBigEndianMisaligned<uint16_t>   1.88 ns
BM_ReadBigEndianMisaligned<int32_t>    1.62 ns
BM_ReadBigEndianMisaligned<uint32_t>   1.62 ns
BM_ReadBigEndianMisaligned<int64_t>    2.36 ns
BM_ReadBigEndianMisaligned<uint64_t>   2.35 ns

On x86-64 the compiler seems to have less trouble optimizing the
existing code, and only the 64-bit integer read results change
significantly:

Before this change (x86-64, Linux):

BM_WriteBigEndianAligned<int16_t>      0.924 ns
BM_WriteBigEndianAligned<uint16_t>     0.903 ns
BM_WriteBigEndianAligned<int32_t>      0.933 ns
BM_WriteBigEndianAligned<uint32_t>     0.932 ns
BM_WriteBigEndianAligned<int64_t>      1.08 ns
BM_WriteBigEndianAligned<uint64_t>     1.09 ns
BM_WriteBigEndianMisaligned<int16_t>   0.952 ns
BM_WriteBigEndianMisaligned<uint16_t>  0.925 ns
BM_WriteBigEndianMisaligned<int32_t>   0.947 ns
BM_WriteBigEndianMisaligned<uint32_t>  0.931 ns
BM_WriteBigEndianMisaligned<int64_t>   1.08 ns
BM_WriteBigEndianMisaligned<uint64_t>  1.08 ns
BM_ReadBigEndianAligned<int16_t>       1.03 ns
BM_ReadBigEndianAligned<uint16_t>      0.988 ns
BM_ReadBigEndianAligned<int32_t>       0.956 ns
BM_ReadBigEndianAligned<uint32_t>      0.965 ns
BM_ReadBigEndianAligned<int64_t>       2.33 ns
BM_ReadBigEndianAligned<uint64_t>      2.30 ns
BM_ReadBigEndianMisaligned<int16_t>    0.994 ns
BM_ReadBigEndianMisaligned<uint16_t>   0.996 ns
BM_ReadBigEndianMisaligned<int32_t>    0.959 ns
BM_ReadBigEndianMisaligned<uint32_t>   0.964 ns
BM_ReadBigEndianMisaligned<int64_t>    2.31 ns
BM_ReadBigEndianMisaligned<uint64_t>   2.30 ns

After this change (x86-64, Linux):

BM_WriteBigEndianAligned<int16_t>      0.917 ns
BM_WriteBigEndianAligned<uint16_t>     0.927 ns
BM_WriteBigEndianAligned<int32_t>      0.956 ns
BM_WriteBigEndianAligned<uint32_t>     0.942 ns
BM_WriteBigEndianAligned<int64_t>      1.09 ns
BM_WriteBigEndianAligned<uint64_t>     1.09 ns
BM_WriteBigEndianMisaligned<int16_t>   0.925 ns
BM_WriteBigEndianMisaligned<uint16_t>  0.906 ns
BM_WriteBigEndianMisaligned<int32_t>   0.939 ns
BM_WriteBigEndianMisaligned<uint32_t>  0.936 ns
BM_WriteBigEndianMisaligned<int64_t>   1.11 ns
BM_WriteBigEndianMisaligned<uint64_t>  1.12 ns
BM_ReadBigEndianAligned<int16_t>       0.997 ns
BM_ReadBigEndianAligned<uint16_t>      0.996 ns
BM_ReadBigEndianAligned<int32_t>       0.972 ns
BM_ReadBigEndianAligned<uint32_t>      0.956 ns
BM_ReadBigEndianAligned<int64_t>       1.17 ns
BM_ReadBigEndianAligned<uint64_t>      1.17 ns
BM_ReadBigEndianMisaligned<int16_t>    0.999 ns
BM_ReadBigEndianMisaligned<uint16_t>   0.997 ns
BM_ReadBigEndianMisaligned<int32_t>    0.969 ns
BM_ReadBigEndianMisaligned<uint32_t>   0.965 ns
BM_ReadBigEndianMisaligned<int64_t>    1.21 ns
BM_ReadBigEndianMisaligned<uint64_t>   1.19 ns


Change-Id: I21119e03ef799458c4530b031ca3142a146580ed
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4756034
Reviewed-by: Daniel Cheng <dcheng@chromium.org>
Commit-Queue: Adam Rice <ricea@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1185111}
  • Loading branch information
ricea authored and Chromium LUCI CQ committed Aug 18, 2023
1 parent b37b4d7 commit 3807a76
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 33 deletions.
1 change: 1 addition & 0 deletions base/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -2699,6 +2699,7 @@ component("i18n") {

test("base_perftests") {
sources = [
"big_endian_perftest.cc",
"hash/hash_perftest.cc",
"json/json_perftest.cc",
"message_loop/message_pump_perftest.cc",
Expand Down
75 changes: 42 additions & 33 deletions base/big_endian.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,44 @@
#include "base/containers/span.h"
#include "base/memory/raw_ptr.h"
#include "base/strings/string_piece.h"
#include "base/sys_byteorder.h"
#include "build/build_config.h"

namespace base {

namespace internal {

// ByteSwapIfLittleEndian() returns ByteSwap(val) when the platform is
// little-endian and returns `val` unchanged otherwise.
//
// The trailing decltype(ByteSwap(val)) return type is used on both kinds of
// platform so that the template is only enabled for types for which ByteSwap()
// is defined. The same set of overloads therefore works whether the target is
// little-endian or big-endian.
template <typename T>
inline auto ByteSwapIfLittleEndian(T val) -> decltype(ByteSwap(val)) {
#if defined(ARCH_CPU_LITTLE_ENDIAN)
  return ByteSwap(val);
#else
  return val;
#endif
}

// A single-byte value never needs swapping, so this overload simply hands the
// value back. Having it lets callers avoid special-casing uint8_t.
inline uint8_t ByteSwapIfLittleEndian(uint8_t val) {
  return val;
}

}  // namespace internal

// Read an integer (signed or unsigned) from |buf| in Big Endian order.
// Note: this loop is unrolled with -O1 and above.
// NOTE(szym): glibc dns-canon.c use ntohs(*(uint16_t*)ptr) which is
Expand All @@ -26,46 +61,20 @@ inline void ReadBigEndian(const uint8_t buf[], T* out) {
static_assert(std::is_integral<T>::value, "T has to be an integral type.");
// Make an unsigned version of the output type to make shift possible
// without UB.
typename std::make_unsigned<T>::type unsigned_result = buf[0];
for (size_t i = 1; i < sizeof(T); ++i) {
unsigned_result <<= 8;
// Must cast to uint8_t to avoid clobbering by sign extension.
unsigned_result |= buf[i];
}
*out = unsigned_result;
typename std::make_unsigned<T>::type raw;
memcpy(&raw, buf, sizeof(T));
*out = static_cast<T>(internal::ByteSwapIfLittleEndian(raw));
}

// Write an integer (signed or unsigned) |val| to |buf| in Big Endian order.
// Note: this loop is unrolled with -O1 and above.
template<typename T>
inline void WriteBigEndian(char buf[], T val) {
static_assert(std::is_integral<T>::value, "T has to be an integral type.");
auto unsigned_val = static_cast<typename std::make_unsigned<T>::type>(val);
for (size_t i = 0; i < sizeof(T); ++i) {
buf[sizeof(T) - i - 1] = static_cast<char>(unsigned_val & 0xFF);
unsigned_val >>= 8;
}
}

// Specializations to make clang happy about the (dead code) shifts above.
template <>
inline void ReadBigEndian<uint8_t>(const uint8_t buf[], uint8_t* out) {
*out = buf[0];
}

template <>
inline void WriteBigEndian<uint8_t>(char buf[], uint8_t val) {
buf[0] = static_cast<char>(val);
}

template <>
inline void ReadBigEndian<int8_t>(const uint8_t buf[], int8_t* out) {
*out = static_cast<int8_t>(buf[0]);
}

template <>
inline void WriteBigEndian<int8_t>(char buf[], int8_t val) {
buf[0] = static_cast<char>(val);
const auto unsigned_val =
static_cast<typename std::make_unsigned<T>::type>(val);
const auto raw = internal::ByteSwapIfLittleEndian(unsigned_val);
memcpy(buf, &raw, sizeof(T));
}

// Allows reading integers in network order (big endian) while iterating over
Expand Down
113 changes: 113 additions & 0 deletions base/big_endian_perftest.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/big_endian.h"

#include <stdint.h>

#include "base/check.h"
#include "base/containers/span.h"
#include "third_party/google_benchmark/src/include/benchmark/benchmark.h"

namespace base {
namespace {

// Size in bytes of each of the buffers that the benchmarks read and write.
constexpr size_t kSize = 128 * 1024 * 1024;

// kSize bytes of storage declared as int64_t elements, so the start of the
// buffer has int64_t alignment.
int64_t aligned_bytes[kSize / sizeof(int64_t)];

// kSize bytes of storage that deliberately do not start on an aligned address:
// `bytes` is preceded by an int64_t member followed by a single char, which is
// intended to push it one byte past an int64_t-aligned offset. The benchmarks
// that use it CHECK the misalignment at run time.
struct {
  int64_t alignment;
  char padding_to_cause_misalignment;
  char bytes[kSize];
} misaligned_bytes;

// Makes every byte of `range` observable to the optimizer so that the writes
// that produced those bytes cannot be discarded as dead stores.
void DoNotOptimizeSpan(span<const char> range) {
  // ::benchmark::DoNotOptimize() generates quite large code, so instead of
  // calling it for every byte in the range, calculate `sum` which depends on
  // every byte in the range and then call DoNotOptimize() on that.
  //
  // The accumulator is unsigned on purpose: the range can be large enough for
  // the total to exceed INT_MAX, and signed overflow is undefined behaviour,
  // whereas unsigned arithmetic wraps. Only the data dependency matters here,
  // not the numeric value.
  unsigned int sum = 0;
  for (char c : range) {
    sum += static_cast<unsigned int>(c);
  }
  ::benchmark::DoNotOptimize(sum);
}

// Shared body of the write benchmarks. Each benchmark iteration writes one
// incrementing T at the current position in the kSize-byte buffer beginning at
// `start`, then advances by sizeof(T), returning to the beginning when the end
// of the buffer is reached.
template <typename T>
inline void WriteBigEndianCommon(::benchmark::State& state, char* const start) {
  // The wrap-around test below compares with `==`, which is only correct if
  // the buffer holds a whole number of T.
  static_assert(kSize % sizeof(T) == 0);

  T next_value = 0;
  size_t position = 0;
  for (auto _ : state) {
    WriteBigEndian(start + position, next_value);
    ++next_value;
    position += sizeof(T);
    if (position == kSize) {
      position = 0;
    }
  }

  // Keep the buffer contents live so the writes above are not optimized away.
  DoNotOptimizeSpan({start, kSize});
}

// Benchmarks WriteBigEndian<T>() writing into `aligned_bytes`. CHECKs first
// that the buffer address really is a multiple of alignof(T).
template <typename T>
void BM_WriteBigEndianAligned(::benchmark::State& state) {
  char* const start = reinterpret_cast<char*>(&aligned_bytes[0]);
  CHECK(reinterpret_cast<uintptr_t>(start) % alignof(T) == 0);
  WriteBigEndianCommon<T>(state, start);
}

// Benchmarks WriteBigEndian<T>() writing into `misaligned_bytes.bytes`. CHECKs
// first that the buffer address is not a multiple of alignof(T).
template <typename T>
void BM_WriteBigEndianMisaligned(::benchmark::State& state) {
  char* const start = &misaligned_bytes.bytes[0];
  CHECK(reinterpret_cast<uintptr_t>(start) % alignof(T) != 0);
  WriteBigEndianCommon<T>(state, start);
}

// Shared body of the read benchmarks. Each benchmark iteration reads one T
// from the current position in the kSize-byte buffer beginning at `start`,
// then advances by sizeof(T), returning to the beginning when the end of the
// buffer is reached.
template <typename T>
inline void ReadBigEndianCommon(::benchmark::State& state,
                                const uint8_t* const start) {
  // The wrap-around test below compares with `==`, which is only correct if
  // the buffer holds a whole number of T.
  static_assert(kSize % sizeof(T) == 0);

  size_t position = 0;
  for (auto _ : state) {
    T decoded;
    ReadBigEndian(start + position, &decoded);
    // Stop the compiler from discarding the read as unused.
    ::benchmark::DoNotOptimize(decoded);
    position += sizeof(T);
    if (position == kSize) {
      position = 0;
    }
  }
}

// Benchmarks ReadBigEndian<T>() reading from `aligned_bytes`. CHECKs first
// that the buffer address really is a multiple of alignof(T).
template <typename T>
void BM_ReadBigEndianAligned(::benchmark::State& state) {
  const uint8_t* const start = reinterpret_cast<const uint8_t*>(aligned_bytes);
  CHECK(reinterpret_cast<uintptr_t>(start) % alignof(T) == 0);
  ReadBigEndianCommon<T>(state, start);
}

// Benchmarks ReadBigEndian<T>() reading from `misaligned_bytes.bytes`. CHECKs
// first that the buffer address is not a multiple of alignof(T).
template <typename T>
void BM_ReadBigEndianMisaligned(::benchmark::State& state) {
  const uint8_t* const start =
      reinterpret_cast<const uint8_t*>(misaligned_bytes.bytes);
  CHECK(reinterpret_cast<uintptr_t>(start) % alignof(T) != 0);
  ReadBigEndianCommon<T>(state, start);
}

// Registers the benchmark template `function` once for each of the 16-, 32-
// and 64-bit signed and unsigned integer types, each with MinWarmUpTime(1.0)
// (presumably seconds -- confirm against the Google Benchmark documentation).
// The final typedef has no trailing semicolon, so an invocation of the macro
// must be followed by one.
#define BENCHMARK_FOR_INT_TYPES(function) \
BENCHMARK(function<int16_t>)->MinWarmUpTime(1.0); \
BENCHMARK(function<uint16_t>)->MinWarmUpTime(1.0); \
BENCHMARK(function<int32_t>)->MinWarmUpTime(1.0); \
BENCHMARK(function<uint32_t>)->MinWarmUpTime(1.0); \
BENCHMARK(function<int64_t>)->MinWarmUpTime(1.0); \
BENCHMARK(function<uint64_t>)->MinWarmUpTime(1.0); \
typedef int force_semicolon

// Register every read/write benchmark for both the aligned and the misaligned
// buffer.
BENCHMARK_FOR_INT_TYPES(BM_WriteBigEndianAligned);
BENCHMARK_FOR_INT_TYPES(BM_WriteBigEndianMisaligned);
BENCHMARK_FOR_INT_TYPES(BM_ReadBigEndianAligned);
BENCHMARK_FOR_INT_TYPES(BM_ReadBigEndianMisaligned);

// The helper macro is only needed for the registrations above.
#undef BENCHMARK_FOR_INT_TYPES

} // namespace
} // namespace base

0 comments on commit 3807a76

Please sign in to comment.