Skip to content

Commit

Permalink
MB-46378: Implement hardware-accelerated AArch64 CRC32
Browse files Browse the repository at this point in the history
Implement AArch64 support for our hardware-accelerated CRC32 routines
(in addition to existing SSE4.2 support).

Fortunately AArch64 has CRC32 instructions with identical semantics to
the SSE4.2 ones (including same coefficient), so this is
straightforward - just conditionally compile the correct intrinsic.

Results look decent even without tweaking the algorithm - we see up to
8x speedup in hardware-accelerated version (HW opt GiB/s)
vs. original software (SW GiB/s) on Graviton 2 (c6gd.4xlarge, 2.331
GHz):

    $ ./tests/crc32/platform-crc32c-bench
    Data size (bytes) : SW ns      : SW GiB/s   : HW ns      : HW GiB/s   : HW vs SW : HW opt ns  : HW opt GiB/s : HW vs HW opt : SW vs HW opt :
    Power of 2 lengths.
    32                : 73 ns      : 0.408      : 40 ns      : 0.745      : 1.825x   : 42 ns      : 0.710        : 0.952x       : 1.738x       :
    64                : 79 ns      : 0.754      : 41 ns      : 1.454      : 1.927x   : 45 ns      : 1.325        : 0.911x       : 1.756x       :
    128               : 126 ns     : 0.946      : 48 ns      : 2.484      : 2.625x   : 51 ns      : 2.337        : 0.941x       : 2.471x       :
    256               : 219 ns     : 1.089      : 58 ns      : 4.111      : 3.776x   : 62 ns      : 3.845        : 0.935x       : 3.532x       :
    512               : 405 ns     : 1.177      : 76 ns      : 6.274      : 5.329x   : 88 ns      : 5.419        : 0.864x       : 4.602x       :
    1024              : 580 ns     : 1.644      : 114 ns     : 8.366      : 5.088x   : 109 ns     : 8.749        : 1.046x       : 5.321x       :
    2048              : 1121 ns    : 1.701      : 191 ns     : 9.986      : 5.869x   : 178 ns     : 10.715       : 1.073x       : 6.298x       :
    4096              : 1963 ns    : 1.943      : 353 ns     : 10.807     : 5.561x   : 300 ns     : 12.716       : 1.177x       : 6.543x       :
    8192              : 3889 ns    : 1.962      : 660 ns     : 11.560     : 5.892x   : 568 ns     : 13.432       : 1.162x       : 6.847x       :
    16384             : 7518 ns    : 2.030      : 1274 ns    : 11.977     : 5.901x   : 1064 ns    : 14.341       : 1.197x       : 7.066x       :
    32768             : 15 us      : 1.986      : 2505 ns    : 12.183     : 6.135x   : 1916 ns    : 15.928       : 1.307x       : 8.021x       :
    65536             : 31 us      : 1.969      : 5039 ns    : 12.113     : 6.150x   : 3881 ns    : 15.727       : 1.298x       : 7.985x       :
    131072            : 62 us      : 1.975      : 9903 ns    : 12.327     : 6.242x   : 7482 ns    : 16.315       : 1.324x       : 8.262x       :
    262144            : 123 us     : 1.982      : 20 us      : 12.357     : 6.235x   : 15 us      : 16.424       : 1.329x       : 8.287x       :
    524288            : 247 us     : 1.974      : 40 us      : 12.221     : 6.190x   : 30 us      : 16.277       : 1.332x       : 8.245x       :
    1048576           : 493 us     : 1.979      : 80 us      : 12.224     : 6.174x   : 59 us      : 16.440       : 1.345x       : 8.303x       :
    2097152           : 987 us     : 1.977      : 158 us     : 12.371     : 6.254x   : 120 us     : 16.318       : 1.319x       : 8.250x       :
    4194304           : 1975 us    : 1.977      : 315 us     : 12.387     : 6.264x   : 241 us     : 16.227       : 1.310x       : 8.205x       :
    8388608           : 3957 us    : 1.969      : 631 us     : 12.383     : 6.274x   : 485 us     : 16.102       : 1.300x       : 8.157x       :

    Non-power of 2 lengths.
    33                : 60 ns      : 0.512      : 40 ns      : 0.768      : 1.500x   : 41 ns      : 0.750        : 0.976x       : 1.463x       :
    132               : 137 ns     : 0.897      : 49 ns      : 2.509      : 2.796x   : 52 ns      : 2.364        : 0.942x       : 2.635x       :
    528               : 418 ns     : 1.176      : 77 ns      : 6.386      : 5.429x   : 90 ns      : 5.464        : 0.856x       : 4.644x       :
    2112              : 1161 ns    : 1.694      : 196 ns     : 10.035     : 5.923x   : 182 ns     : 10.807       : 1.077x       : 6.379x       :
    8448              : 3867 ns    : 2.035      : 679 ns     : 11.587     : 5.695x   : 571 ns     : 13.779       : 1.189x       : 6.772x       :
    33792             : 16 us      : 2.008      : 2583 ns    : 12.184     : 6.069x   : 1970 ns    : 15.975       : 1.311x       : 7.957x       :
    135168            : 64 us      : 1.980      : 10 us      : 12.327     : 6.226x   : 7766 ns    : 16.210       : 1.315x       : 8.187x       :
    540672            : 254 us     : 1.984      : 41 us      : 12.353     : 6.224x   : 30 us      : 16.738       : 1.355x       : 8.433x       :
    2162688           : 1017 us    : 1.980      : 163 us     : 12.375     : 6.251x   : 123 us     : 16.381       : 1.324x       : 8.273x       :

    Unaligned buffer of odd lengths
    33                : 74 ns      : 0.415      : 45 ns      : 0.683      : 1.644x   : 47 ns      : 0.654        : 0.957x       : 1.574x       :
    133               : 156 ns     : 0.794      : 55 ns      : 2.252      : 2.836x   : 58 ns      : 2.136        : 0.948x       : 2.690x       :
    529               : 436 ns     : 1.130      : 83 ns      : 5.936      : 5.253x   : 95 ns      : 5.186        : 0.874x       : 4.589x       :
    2113              : 1171 ns    : 1.681      : 201 ns     : 9.790      : 5.826x   : 182 ns     : 10.813       : 1.104x       : 6.434x       :
    8449              : 4090 ns    : 1.924      : 683 ns     : 11.521     : 5.988x   : 583 ns     : 13.497       : 1.172x       : 7.015x       :
    33793             : 16 us      : 1.979      : 2586 ns    : 12.170     : 6.149x   : 1988 ns    : 15.831       : 1.301x       : 7.998x       :
    135169            : 64 us      : 1.974      : 10 us      : 12.314     : 6.237x   : 7794 ns    : 16.152       : 1.312x       : 8.180x       :
    540673            : 254 us     : 1.980      : 41 us      : 12.362     : 6.243x   : 31 us      : 16.441       : 1.330x       : 8.302x       :
    2162689           : 1018 us    : 1.978      : 163 us     : 12.385     : 6.261x   : 123 us     : 16.335       : 1.319x       : 8.257x       :

Change-Id: Id5ab9a11dff64cd060727f9e799308acc927129b
Reviewed-on: http://review.couchbase.org/c/platform/+/154058
Tested-by: Build Bot <build@couchbase.com>
Reviewed-by: Trond Norbye <trond.norbye@couchbase.com>
  • Loading branch information
daverigby committed Jul 8, 2021
1 parent 948f2af commit d313d18
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 38 deletions.
24 changes: 16 additions & 8 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
CMAKE_MINIMUM_REQUIRED(VERSION 3.15)
PROJECT(Platform LANGUAGES C CXX)

INCLUDE(CheckFunctionExists)
Expand Down Expand Up @@ -100,15 +100,23 @@ IF (WIN32)
include/win32/getopt.h)
ELSE (WIN32)
SET(PLATFORM_FILES src/cb_pthreads.cc)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set(crc32_hw_archs "aarch64;AMD64;x86_64")
if(CMAKE_SYSTEM_PROCESSOR IN_LIST crc32_hw_archs)
list(APPEND PLATFORM_FILES src/crc32c_sse4_2.cc)
# To avoid feature (sse4.2) mismatch errors when compiling we should avoid
# including the precompiled header in crc32c_sse4_2
endif()
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set_source_files_properties(src/crc32c_sse4_2.cc
PROPERTIES COMPILE_FLAGS -msse4.2)
endif()
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set_source_files_properties(src/crc32c_sse4_2.cc
PROPERTIES
COMPILE_FLAGS -msse4.2
SKIP_PRECOMPILE_HEADERS ON)
PROPERTIES COMPILE_FLAGS -march=armv8-a+crc)
endif()
# To avoid feature (sse4.2) mismatch errors when compiling we should avoid
# including the precompiled header in crc32c_sse4_2
set_source_files_properties(src/crc32c_sse4_2.cc
PROPERTIES
SKIP_PRECOMPILE_HEADERS ON)
LIST(APPEND PLATFORM_LIBRARIES "pthread")

IF (NOT APPLE)
Expand Down Expand Up @@ -355,7 +363,7 @@ SET_TARGET_PROPERTIES(platform PROPERTIES POSITION_INDEPENDENT_CODE true)
platform_enable_pch(platform)
cb_enable_unity_build(platform)
# Exclude our definitions of new/delete. When included in a unity build this
# file has a number of issues to address. Compile failure due to differing
# file has a number of issues to address. Compile failure due to differing
# definitions of malloc_usable_size on WIN32 and Linux. A second issue occurs
# on ASAN only where duplicate symbol errors are seen, all of our new/delete
# symbols now clash with the ASAN replacements).
Expand Down
4 changes: 3 additions & 1 deletion include/platform/crc32c.h
Expand Up @@ -19,7 +19,9 @@
#include <cstddef>
#include <cstdint>

#if defined(__x86_64__) || defined(_M_X64)
#include <folly/Portability.h>

#if FOLLY_X64 || FOLLY_AARCH64
#define CB_CRC32_HW_SUPPORTED 1
#endif

Expand Down
16 changes: 15 additions & 1 deletion src/crc32c.cc
Expand Up @@ -59,6 +59,9 @@
#include <limits>

#include <folly/CpuId.h>
#if defined(__linux__)
#include <sys/auxv.h>
#endif

static bool setup_tables();
static bool tables_setup = setup_tables();
Expand Down Expand Up @@ -358,11 +361,22 @@ typedef uint32_t (*crc32c_function)(const uint8_t* buf,

//
// Return the appropriate function for the platform.
// If SSE4.2 is available then hardware acceleration is used.
// - x86-64: If SSE4.2 is available then hardware acceleration is
// used.
// - AArch64: If CRC32 instructions are available then hardware
// acceleration is used.
//
crc32c_function setup_crc32c() {
#if CB_CRC32_HW_SUPPORTED
#if FOLLY_X64
return folly::CpuId().sse42() ? crc32c_hw : crc32c_sw;
#elif FOLLY_AARCH64
unsigned long features = getauxval(AT_HWCAP);
return (features & HWCAP_CRC32) ? crc32c_hw : crc32c_sw;
#else
// AArch64, non-linux - TODO.
#error Unhandled OS for AArch64.
#endif
#else
return crc32c_sw;
#endif
Expand Down
80 changes: 53 additions & 27 deletions src/crc32c_sse4_2.cc
Expand Up @@ -30,9 +30,10 @@
// This is an altered/adapted version of Mark Adler's crc32c.c
// - see http://stackoverflow.com/a/17646775
// - see above license.
// - This module provides the HW support and is built
// with -msse4.2 where applicable. We only execute inside
// this module if SSE4.2 is detected.
// - This module provides the HW support and is built with -msse4.2
// (x86-64) or -march=armv8-a+crc (AArch64) where applicable. We only
// execute inside this module if SSE4.2 / AArch64-CRC instructions
// are detected.
//
// Changes from original version include.
// a) Compiler intrinsics instead of inline asm.
Expand All @@ -47,50 +48,75 @@
// ii) See crc32c_bench.cc for testing
// f) Validated with IETF test vectors.
// i) See crc32c_test.cc.
// g) Use of GCC4.8 attributes to select SSE4.2 vs SW version/
// g) Use of GCC4.8 attributes to select hardware-accel vs SW version.
// h) Custom cpuid code works for GCC(<4.8), CLANG and MSVC.
// i) Use static initialisation instead of pthread_once.
//
#if !defined(__x86_64__) && !defined(_M_X64)
#error "crc32c requires X86 SSE4.2 for hardware acceleration"

#include <folly/Portability.h>

#if !FOLLY_X64 && !FOLLY_AARCH64
#error "crc32c requires X86 SSE4.2 or Arch64 for hardware acceleration"
#endif

#include "crc32c_private.h"
#include <platform/crc32c.h>
#include <limits>

// select header file for crc instructions.
// select header file for crc instructions and define intrinsic to use.
#if FOLLY_X64
#if defined(WIN32)
#include <nmmintrin.h>
#elif defined(__clang__) || defined(__GNUC__)
#include <smmintrin.h>
#endif

#include <limits>
// x86-64 wrapper: fold a single byte into the running CRC32-C value by
// delegating to the _mm_crc32_u8 intrinsic.
inline uint32_t crc32c_u8(uint32_t crc, uint8_t data) {
    const uint32_t updated = _mm_crc32_u8(crc, data);
    return updated;
}

// x86-64 wrapper: fold eight bytes into the running CRC32-C value by
// delegating to the _mm_crc32_u64 intrinsic. The intrinsic's result is
// converted to this function's 32-bit return type.
inline uint32_t crc32c_u64(uint32_t crc, uint64_t data) {
    const auto folded = _mm_crc32_u64(crc, data);
    return static_cast<uint32_t>(folded);
}
#endif // FOLLY_X64

#if FOLLY_AARCH64
#include <arm_acle.h>

// AArch64 wrapper: fold a single byte into the running CRC32-C value by
// delegating to the __crc32cb intrinsic.
inline uint32_t crc32c_u8(uint32_t crc, uint8_t data) {
    const uint32_t updated = __crc32cb(crc, data);
    return updated;
}

// AArch64 wrapper: fold eight bytes into the running CRC32-C value by
// delegating to the __crc32cd intrinsic.
inline uint32_t crc32c_u64(uint32_t crc, uint64_t data) {
    const uint32_t updated = __crc32cd(crc, data);
    return updated;
}
#endif // FOLLY_AARCH64


//
// CRC32-C implementation using SSE4.2 acceleration
// CRC32-C implementation using SSE4.2 / AArch64 acceleration
// no pipeline optimisation.
//
uint32_t crc32c_hw_1way(const uint8_t* buf, size_t len, uint32_t crc_in) {
auto crc_flipped = ~crc_in;
auto crc = static_cast<uint64_t>(crc_flipped);
// use crc32-byte instruction until the buf pointer is 8-byte aligned
while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
crc = crc32c_u8(static_cast<uint32_t>(crc), *buf);
buf += sizeof(uint8_t);
len -= sizeof(uint8_t);
}

// Use 8 byte size until there's no more u64 to process.
while (len >= sizeof(uint64_t)) {
crc = _mm_crc32_u64(crc, *reinterpret_cast<const uint64_t*>(buf));
crc = crc32c_u64(crc, *reinterpret_cast<const uint64_t*>(buf));
buf += sizeof(uint64_t);
len -= sizeof(uint64_t);
}

// finish the rest using the byte instruction
while (len > 0) {
crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
crc = crc32c_u8(static_cast<uint32_t>(crc), *buf);
buf += sizeof(uint8_t);
len -= sizeof(uint8_t);
}
Expand All @@ -113,7 +139,7 @@ uint32_t crc32c_hw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in)
// use crc32-byte instruction until the buf pointer is 8-byte aligned
while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
crc0 = crc32c_u8(static_cast<uint32_t>(crc0), *buf);
buf += sizeof(uint8_t);
len -= sizeof(uint8_t);
}
Expand All @@ -125,9 +151,9 @@ uint32_t crc32c_hw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in)
const uint8_t* end = buf + SHORT_BLOCK;
do
{
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t*>(buf + SHORT_BLOCK));
crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t*>(buf + (2 * SHORT_BLOCK)));
crc0 = crc32c_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc1 = crc32c_u64(crc1, *reinterpret_cast<const uint64_t*>(buf + SHORT_BLOCK));
crc2 = crc32c_u64(crc2, *reinterpret_cast<const uint64_t*>(buf + (2 * SHORT_BLOCK)));
buf += sizeof(uint64_t);
} while (buf < end);
crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
Expand All @@ -138,14 +164,14 @@ uint32_t crc32c_hw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in)

// Use 8 byte size until there's no more u64 to process.
while (len >= sizeof(uint64_t)) {
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc0 = crc32c_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
buf += sizeof(uint64_t);
len -= sizeof(uint64_t);
}

// finish the rest using the byte instruction
while (len > 0) {
crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
crc0 = crc32c_u8(static_cast<uint32_t>(crc0), *buf);
buf += sizeof(uint8_t);
len -= sizeof(uint8_t);
}
Expand All @@ -170,7 +196,7 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {
// use crc32-byte instruction until the buf pointer is 8-byte aligned
while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
crc0 = crc32c_u8(static_cast<uint32_t>(crc0), *buf);
buf += sizeof(uint8_t);
len -= sizeof(uint8_t);
}
Expand All @@ -185,9 +211,9 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {
const uint8_t* end = buf + LONG_BLOCK;
do
{
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t*>(buf + LONG_BLOCK));
crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t*>(buf + (2 * LONG_BLOCK)));
crc0 = crc32c_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc1 = crc32c_u64(crc1, *reinterpret_cast<const uint64_t*>(buf + LONG_BLOCK));
crc2 = crc32c_u64(crc2, *reinterpret_cast<const uint64_t*>(buf + (2 * LONG_BLOCK)));
buf += sizeof(uint64_t);
} while (buf < end);
crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc1;
Expand All @@ -204,9 +230,9 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {
const uint8_t* end = buf + SHORT_BLOCK;
do
{
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t*>(buf + SHORT_BLOCK));
crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t*>(buf + (2 * SHORT_BLOCK)));
crc0 = crc32c_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc1 = crc32c_u64(crc1, *reinterpret_cast<const uint64_t*>(buf + SHORT_BLOCK));
crc2 = crc32c_u64(crc2, *reinterpret_cast<const uint64_t*>(buf + (2 * SHORT_BLOCK)));
buf += sizeof(uint64_t);
} while (buf < end);
crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
Expand All @@ -217,14 +243,14 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {

// Use 8 byte size until there's no more u64 to process.
while (len >= sizeof(uint64_t)) {
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
crc0 = crc32c_u64(crc0, *reinterpret_cast<const uint64_t*>(buf));
buf += sizeof(uint64_t);
len -= sizeof(uint64_t);
}

// finish the rest using the byte instruction
while (len > 0) {
crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
crc0 = crc32c_u8(static_cast<uint32_t>(crc0), *buf);
buf += sizeof(uint8_t);
len -= sizeof(uint8_t);
}
Expand Down
2 changes: 1 addition & 1 deletion tests/crc32/CMakeLists.txt
Expand Up @@ -4,7 +4,7 @@ cb_add_test_executable(platform-crc32c-test crc32c_test.cc)
target_link_libraries(platform-crc32c-test PRIVATE platform)
platform_enable_pch(platform-crc32c-test)

set(crc32_hw_archs "AMD64;x86_64")
set(crc32_hw_archs "aarch64;AMD64;x86_64")
if(CMAKE_SYSTEM_PROCESSOR IN_LIST crc32_hw_archs)
cb_add_test_executable(platform-crc32c-sw_hw-test crc32c_test.cc)
target_compile_definitions(platform-crc32c-sw_hw-test PRIVATE CRC32C_UNIT_TEST)
Expand Down

0 comments on commit d313d18

Please sign in to comment.