From af62807b4fec12316ee90226b08ccc7ef81172cb Mon Sep 17 00:00:00 2001 From: Alexey Kopytov Date: Tue, 22 Dec 2015 12:16:28 +1100 Subject: [PATCH] Add support for autodetection and usage of hardware CRC32 instructions on the AArch64 architecture, as well as some minor code generalization via the UNIV_CRC32_HW define. --- mysys/ut0crc32.cc | 236 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 184 insertions(+), 52 deletions(-) diff --git a/mysys/ut0crc32.cc b/mysys/ut0crc32.cc index 0343fdd6ec39..e4d0f9c6a312 100644 --- a/mysys/ut0crc32.cc +++ b/mysys/ut0crc32.cc @@ -82,11 +82,32 @@ mysys/my_perf.c, contributed by Facebook under the following license. #include "my_config.h" #include -#if defined(__linux__) && defined(__powerpc__) +#if defined(__GNUC__) && defined(__x86_64__) +/* x86-specific CRC32 support may be available */ +# define UNIV_CRC32_HW +#endif /* __GNUC__ && __x86_64__ */ + +#if defined(__linux__) +#if (defined(__powerpc__) || defined(__aarch64__)) /* Used to detect at runtime if we have vpmsum instructions (PowerISA 2.07) */ +/* and also ARMv8-specific CRC32 support. 
*/ #include +#endif /* (defined(__powerpc__) || defined(__aarch64__)) */ + +#if defined(__powerpc__) #include -#endif /* defined(__linux__) && defined(__powerpc__) */ + +#elif defined(__aarch64__) +#include +#ifndef HWCAP_CRC32 +# define HWCAP_CRC32 (1<<7) +#endif +/* ARMv8-specific CRC32 support may be available */ +# define UNIV_CRC32_HW +/* assembler directive to enable CRC32 instructions */ +asm(".cpu generic+crc"); +#endif /* defined(__aarch64__) */ +#endif /* defined(__linux__) */ #include "ut0crc32.h" @@ -159,10 +180,10 @@ ut_crc32_swap_byteorder( | i >> 56); } - -#if defined(__GNUC__) && defined(__x86_64__) +#ifdef UNIV_CRC32_HW +#if defined(__x86_64__) /********************************************************************//** -Fetches CPU info */ +Fetches x86_64 CPU info */ static void ut_cpuid( @@ -192,6 +213,19 @@ ut_cpuid( } } +#elif defined(__aarch64__) +/********************************************************************//** +Reads the AArch64 hardware capability word (AT_HWCAP) via getauxval() */ +static +void +ut_cpuid( +/*=====*/ + unsigned long *hwcap) /*!< out: AT_HWCAP value from getauxval() */ +{ + *hwcap = getauxval(AT_HWCAP); +} +#endif /* __aarch64__ */ + /** Calculate CRC32 over 8-bit data using a hardware/CPU instruction. 
@param[in,out] crc crc32 checksum so far when this function is called, when the function ends it will contain the new checksum @@ -205,11 +239,21 @@ ut_crc32_8_hw( const byte** data, ulint* len) { +#if defined(__x86_64__) asm("crc32b %1, %0" /* output operands */ : "+r" (*crc) /* input operands */ : "rm" ((*data)[0])); +#elif defined(__aarch64__) + asm("crc32cb %w[c], %w[c], %w[v]" + /* output operands */ + : [c]"+r"(*crc) + /* input operands */ + : [v]"r"((*data)[0])); +#else +#error No support for hardware CRC32 implementation +#endif (*data)++; (*len)--; @@ -225,6 +269,7 @@ ut_crc32_64_low_hw( uint32_t crc, uint64_t data) { +#if defined(__x86_64__) uint64_t crc_64bit = crc; asm("crc32q %1, %0" @@ -234,6 +279,17 @@ ut_crc32_64_low_hw( : "rm" (data)); return(static_cast(crc_64bit)); +#elif defined(__aarch64__) + asm("crc32cx %w[c], %w[c], %x[v]" + /* output operands */ + : [c]"+r"(crc) + /* input operands */ + : [v]"r"(data)); + + return(crc); +#else +#error No support for hardware CRC32 implementation +#endif } /** Calculate CRC32 over 64-bit byte string using a hardware/CPU instruction. @@ -252,9 +308,9 @@ ut_crc32_64_hw( uint64_t data_int = *reinterpret_cast(*data); #ifdef WORDS_BIGENDIAN - /* Currently we only support x86_64 (little endian) CPUs. In case - some big endian CPU supports a CRC32 instruction, then maybe we will - need a byte order swap here. */ + /* Currently we only support little endian CPUs. In case some big endian + CPU supports a CRC32 instruction, then maybe we will need a byte order + swap here. */ #error Dont know how to handle big endian CPUs /* data_int = ut_crc32_swap_byteorder(data_int); @@ -298,6 +354,92 @@ ut_crc32_64_legacy_big_endian_hw( *len -= 8; } +/** Calculate CRC32 over 2 64-bit byte string using a hardware/CPU instruction. 
+@param[in,out] crc crc32 checksum so far when this function is called, +when the function ends it will contain the new checksum +@param[in,out] data data to be checksummed, the pointer will be advanced +by 16 bytes +@param[in,out] len remaining bytes, it will be decremented by 16 */ +inline +void +ut_crc32_128_hw( + uint32_t* crc, + const byte** data, + ulint* len) +{ +#ifdef WORDS_BIGENDIAN + /* Currently we only support little endian CPUs. In case some big endian + CPU supports a CRC32 instruction, then maybe we will need a byte order + swap here. */ +#error Dont know how to handle big endian CPUs + /* + data_int = ut_crc32_swap_byteorder(data_int); + */ +#endif /* WORDS_BIGENDIAN */ +#if defined(__aarch64__) + uint64_t v0, v1; + + /* Load a pair of registers with one instruction to spare some cycles. + Note that post-index addressing also increments the source address + automatically. The "memory" clobber tells the compiler that the asm reads the buffer, so earlier stores to it are not reordered past or dropped before the load. */ + asm("ldp %x[a], %x[b], [%x[c]], #16" + /* output operands */ + : [a]"=r"(v0), [b]"=r"(v1), [c]"+r"(*data) : /* no input operands */ : /* clobbers */ "memory"); + + *crc = ut_crc32_64_low_hw(*crc, v0); + *crc = ut_crc32_64_low_hw(*crc, v1); + + *len -= 16; +#else + ut_crc32_64_hw(crc, data, len); + ut_crc32_64_hw(crc, data, len); +#endif +} + +/** Calculate CRC32 over two 64-bit byte strings using a hardware/CPU instruction. +The byte strings are converted to 64-bit integers using big endian byte order. +@param[in,out] crc crc32 checksum so far when this function is called, +when the function ends it will contain the new checksum +@param[in,out] data data to be checksummed, the pointer will be advanced +by 16 bytes +@param[in,out] len remaining bytes, it will be decremented by 16 */ +inline +void +ut_crc32_128_legacy_big_endian_hw( + uint32_t* crc, + const byte** data, + ulint* len) +{ +#if defined(__aarch64__) + uint64_t v0, v1; + + /* Load a pair of registers with one instruction to spare some cycles. + Note that post-index addressing also increments the source address + automatically. 
*/ + asm("ldp %x[a], %x[b], [%x[c]], #16" + /* output operands */ + : [a]"=r"(v0), [b]"=r"(v1), [c]"+r"(*data) : /* no input operands */ : /* clobbers: the load reads the buffer, which the register-only operands do not tell the compiler */ "memory"); + +#ifndef WORDS_BIGENDIAN + v0 = ut_crc32_swap_byteorder(v0); + v1 = ut_crc32_swap_byteorder(v1); +#else + /* Currently we only support little endian CPUs. In case some big endian + CPU supports a CRC32 instruction, then maybe we will NOT need a byte + order swap here. */ +#error Dont know how to handle big endian CPUs +#endif /* WORDS_BIGENDIAN */ + + *crc = ut_crc32_64_low_hw(*crc, v0); + *crc = ut_crc32_64_low_hw(*crc, v1); + + *len -= 16; +#else + ut_crc32_64_legacy_big_endian_hw(crc, data, len); + ut_crc32_64_legacy_big_endian_hw(crc, data, len); +#endif +} + /** Calculates CRC32 using hardware/CPU instructions. @param[in] buf data over which to calculate CRC32 @param[in] len data length @@ -357,23 +499,15 @@ ut_crc32_hw_ex( (4.51% slowdown over N=256) */ while (len >= 128) { - /* This call is repeated 16 times. 16 * 8 = 128. */ - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); - ut_crc32_64_hw(&crc, &buf, &len); + /* This call is repeated 8 times. 
128 bits * 8 = 128 bytes */ + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); + ut_crc32_128_hw(&crc, &buf, &len); } while (len >= 8) { @@ -416,23 +550,15 @@ ut_crc32_legacy_big_endian_hw( } while (len >= 128) { - /* This call is repeated 16 times. 16 * 8 = 128. */ - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); + /* This call is repeated 8 times. 
128 bits * 8 = 128 bytes */ + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); + ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len); } while (len >= 8) { @@ -465,7 +591,7 @@ ut_crc32_byte_by_byte_hw( return(~crc); } -#endif /* defined(__GNUC__) && defined(__x86_64__) */ +#endif /* UNIV_CRC32_HW */ /* CRC32 software implementation. */ @@ -782,7 +908,8 @@ void ut_crc32_init() /*===========*/ { - bool ut_crc32_sse2_enabled = false; + bool ut_crc32_hw_enabled = false; +#if !defined(UNIV_DEBUG_VALGRIND) && defined(UNIV_CRC32_HW) #if defined(__GNUC__) && defined(__x86_64__) uint32_t vend[3]; uint32_t model; @@ -810,19 +937,24 @@ ut_crc32_init() probably kill your program. 
*/ -#ifndef UNIV_DEBUG_VALGRIND - ut_crc32_sse2_enabled = (features_ecx >> 20) & 1; -#endif /* UNIV_DEBUG_VALGRIND */ + ut_crc32_hw_enabled = (features_ecx >> 20) & 1; +#endif /* defined(__GNUC__) && defined(__x86_64__) */ + +#if defined(__GNUC__) && defined(__aarch64__) + unsigned long hwcap; - if (ut_crc32_sse2_enabled) { + ut_cpuid(&hwcap); + ut_crc32_hw_enabled = hwcap & HWCAP_CRC32; +#endif /* defined (__GNUC__) && defined(__aarch64__) */ + + if (ut_crc32_hw_enabled) { ut_crc32 = ut_crc32_hw; ut_crc32_ex = ut_crc32_hw_ex; ut_crc32_legacy_big_endian = ut_crc32_legacy_big_endian_hw; ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_hw; - ut_crc32_implementation = "SSE2 crc32 instructions"; + ut_crc32_implementation = "Hardware crc32 instructions"; } - -#endif /* defined(__GNUC__) && defined(__x86_64__) */ +#endif /* !defined(UNIV_DEBUG_VALGRIND) && defined(UNIV_CRC32_HW) */ #if defined(__linux__) && defined(__powerpc__) && defined(AT_HWCAP2) if (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) { @@ -835,7 +967,7 @@ ut_crc32_init() ut_crc32_slice8_table_init(); } else #endif /* defined(__linux__) && defined(__powerpc__) */ - if (!ut_crc32_sse2_enabled) { + if (!ut_crc32_hw_enabled) { ut_crc32_slice8_table_init(); ut_crc32 = ut_crc32_sw; ut_crc32_ex = ut_crc32_sw_ex;