Skip to content

Commit

Permalink
Add support for autodetection and usage of hardware CRC32 instructions
Browse files Browse the repository at this point in the history
on the AArch64 architecture, as well as some minor code generalization
via the UNIV_CRC32_HW define.
  • Loading branch information
akopytov authored and grooverdan committed Dec 22, 2015
1 parent d0ebe00 commit af62807
Showing 1 changed file with 184 additions and 52 deletions.
236 changes: 184 additions & 52 deletions mysys/ut0crc32.cc
Expand Up @@ -82,11 +82,32 @@ mysys/my_perf.c, contributed by Facebook under the following license.
#include "my_config.h"
#include <string.h>

#if defined(__linux__) && defined(__powerpc__)
#if defined(__GNUC__) && defined(__x86_64__)
/* x86-specific CRC32 support may be available */
# define UNIV_CRC32_HW
#endif /* __GNUC__ && __x86_64__ */

#if defined(__linux__)
#if (defined(__powerpc__) || defined(__aarch64__))
/* Used to detect at runtime if we have vpmsum instructions (PowerISA 2.07) */
/* and also ARMv8-specific CRC32 support. */
#include <sys/auxv.h>
#endif /* (defined(__powerpc__) || defined(__aarch64__)) */

#if defined(__powerpc__)
#include <bits/hwcap.h>
#endif /* defined(__linux__) && defined(__powerpc__) */

#elif defined(__aarch64__)
#include <asm/hwcap.h>
#ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1<<7)
#endif
/* ARMv8-specific CRC32 support may be available */
# define UNIV_CRC32_HW
/* assembler directive to enable CRC32 instructions */
asm(".cpu generic+crc");
#endif /* defined(__aarch64__) */
#endif /* defined(__linux__) */

#include "ut0crc32.h"

Expand Down Expand Up @@ -159,10 +180,10 @@ ut_crc32_swap_byteorder(
| i >> 56);
}


#if defined(__GNUC__) && defined(__x86_64__)
#ifdef UNIV_CRC32_HW
#if defined(__x86_64__)
/********************************************************************//**
Fetches CPU info */
Fetches x86_64 CPU info */
static
void
ut_cpuid(
Expand Down Expand Up @@ -192,6 +213,19 @@ ut_cpuid(
}
}

#elif defined(__aarch64__)
/********************************************************************//**
Reads the AArch64 hardware capability bits from the kernel auxiliary vector */
static
void
ut_cpuid(
/*=====*/
	unsigned long	*hwcap)	/*!< out: AT_HWCAP value returned by getauxval() */
{
	/* Query the auxiliary vector once, then hand the bits to the caller. */
	const unsigned long	at_hwcap = getauxval(AT_HWCAP);

	*hwcap = at_hwcap;
}
#endif /* __aarch64__ */

/** Calculate CRC32 over 8-bit data using a hardware/CPU instruction.
@param[in,out] crc crc32 checksum so far when this function is called,
when the function ends it will contain the new checksum
Expand All @@ -205,11 +239,21 @@ ut_crc32_8_hw(
const byte** data,
ulint* len)
{
#if defined(__x86_64__)
asm("crc32b %1, %0"
/* output operands */
: "+r" (*crc)
/* input operands */
: "rm" ((*data)[0]));
#elif defined(__aarch64__)
asm("crc32cb %w[c], %w[c], %w[v]"
/* output operands */
: [c]"+r"(*crc)
/* input operands */
: [v]"r"((*data)[0]));
#else
#error No support for hardware CRC32 implementation
#endif

(*data)++;
(*len)--;
Expand All @@ -225,6 +269,7 @@ ut_crc32_64_low_hw(
uint32_t crc,
uint64_t data)
{
#if defined(__x86_64__)
uint64_t crc_64bit = crc;

asm("crc32q %1, %0"
Expand All @@ -234,6 +279,17 @@ ut_crc32_64_low_hw(
: "rm" (data));

return(static_cast<uint32_t>(crc_64bit));
#elif defined(__aarch64__)
asm("crc32cx %w[c], %w[c], %x[v]"
/* output operands */
: [c]"+r"(crc)
/* input operands */
: [v]"r"(data));

return(crc);
#else
#error No support for hardware CRC32 implementation
#endif
}

/** Calculate CRC32 over 64-bit byte string using a hardware/CPU instruction.
Expand All @@ -252,9 +308,9 @@ ut_crc32_64_hw(
uint64_t data_int = *reinterpret_cast<const uint64_t*>(*data);

#ifdef WORDS_BIGENDIAN
/* Currently we only support x86_64 (little endian) CPUs. In case
some big endian CPU supports a CRC32 instruction, then maybe we will
need a byte order swap here. */
/* Currently we only support little endian CPUs. In case some big endian
CPU supports a CRC32 instruction, then maybe we will need a byte order
swap here. */
#error Dont know how to handle big endian CPUs
/*
data_int = ut_crc32_swap_byteorder(data_int);
Expand Down Expand Up @@ -298,6 +354,92 @@ ut_crc32_64_legacy_big_endian_hw(
*len -= 8;
}

/** Calculate CRC32 over two consecutive 64-bit words (16 bytes) using a
hardware/CPU instruction.
@param[in,out]	crc	crc32 checksum so far when this function is called,
when the function ends it will contain the new checksum
@param[in,out]	data	data to be checksummed, the pointer will be advanced
with 16 bytes
@param[in,out]	len	remaining bytes, it will be decremented with 16 */
inline
void
ut_crc32_128_hw(
	uint32_t*	crc,
	const byte**	data,
	ulint*		len)
{
#ifdef WORDS_BIGENDIAN
	/* Currently we only support little endian CPUs. In case some big endian
	CPU supports a CRC32 instruction, then maybe we will need a byte order
	swap of each loaded 64-bit word (ut_crc32_swap_byteorder()) here. */
#error Dont know how to handle big endian CPUs
#endif /* WORDS_BIGENDIAN */
#if defined(__aarch64__)
	uint64_t	v0, v1;

	/* Load a pair of registers with one instruction to spare some cycles.
	Note that post-index addressing also increments the source address
	automatically.

	The "memory" clobber is required: the instruction reads the 16 bytes
	behind *data, which are not listed as an operand. Without the clobber
	the compiler may assume the asm does not depend on memory contents and
	move or cache it relative to stores into the buffer. */
	asm("ldp %x[a], %x[b], [%x[c]], #16"
	    /* output operands */
	    : [a]"=r"(v0), [b]"=r"(v1), [c]"+r"(*data)
	    /* no input operands */
	    :
	    /* clobbers */
	    : "memory");

	*crc = ut_crc32_64_low_hw(*crc, v0);
	*crc = ut_crc32_64_low_hw(*crc, v1);

	/* The pointer was already advanced by the post-indexed load. */
	*len -= 16;
#else
	/* No paired load available: process the two words one at a time.
	Each call advances *data by 8 and decrements *len by 8. */
	ut_crc32_64_hw(crc, data, len);
	ut_crc32_64_hw(crc, data, len);
#endif
}

/** Calculate CRC32 over two consecutive 64-bit words (16 bytes) using a
hardware/CPU instruction.
The byte strings are converted to 64-bit integers using big endian byte order.
@param[in,out]	crc	crc32 checksum so far when this function is called,
when the function ends it will contain the new checksum
@param[in,out]	data	data to be checksummed, the pointer will be advanced
with 16 bytes
@param[in,out]	len	remaining bytes, it will be decremented with 16 */
inline
void
ut_crc32_128_legacy_big_endian_hw(
	uint32_t*	crc,
	const byte**	data,
	ulint*		len)
{
#if defined(__aarch64__)
	uint64_t	v0, v1;

	/* Load a pair of registers with one instruction to spare some cycles.
	Note that post-index addressing also increments the source address
	automatically.

	The "memory" clobber is required: the instruction reads the 16 bytes
	behind *data, which are not listed as an operand. Without the clobber
	the compiler may assume the asm does not depend on memory contents and
	move or cache it relative to stores into the buffer. */
	asm("ldp %x[a], %x[b], [%x[c]], #16"
	    /* output operands */
	    : [a]"=r"(v0), [b]"=r"(v1), [c]"+r"(*data)
	    /* no input operands */
	    :
	    /* clobbers */
	    : "memory");

#ifndef WORDS_BIGENDIAN
	/* The words were loaded in native (little endian) order; swap them
	to get the big endian interpretation of the byte string. */
	v0 = ut_crc32_swap_byteorder(v0);
	v1 = ut_crc32_swap_byteorder(v1);
#else
	/* Currently we only support little endian CPUs. In case some big endian
	CPU supports a CRC32 instruction, then maybe we will NOT need a byte
	order swap here. */
#error Dont know how to handle big endian CPUs
#endif /* WORDS_BIGENDIAN */

	*crc = ut_crc32_64_low_hw(*crc, v0);
	*crc = ut_crc32_64_low_hw(*crc, v1);

	/* The pointer was already advanced by the post-indexed load. */
	*len -= 16;
#else
	/* No paired load available: process the two words one at a time.
	Each call advances *data by 8 and decrements *len by 8. */
	ut_crc32_64_legacy_big_endian_hw(crc, data, len);
	ut_crc32_64_legacy_big_endian_hw(crc, data, len);
#endif
}

/** Calculates CRC32 using hardware/CPU instructions.
@param[in] buf data over which to calculate CRC32
@param[in] len data length
Expand Down Expand Up @@ -357,23 +499,15 @@ ut_crc32_hw_ex(
(4.51% slowdown over N=256)
*/
while (len >= 128) {
/* This call is repeated 16 times. 16 * 8 = 128. */
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
/* This call is repeated 8 times. 128 bits * 8 = 128 bytes */
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
ut_crc32_128_hw(&crc, &buf, &len);
}

while (len >= 8) {
Expand Down Expand Up @@ -416,23 +550,15 @@ ut_crc32_legacy_big_endian_hw(
}

while (len >= 128) {
/* This call is repeated 16 times. 16 * 8 = 128. */
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len);
/* This call is repeated 8 times. 128 bits * 8 = 128 bytes */
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
ut_crc32_128_legacy_big_endian_hw(&crc, &buf, &len);
}

while (len >= 8) {
Expand Down Expand Up @@ -465,7 +591,7 @@ ut_crc32_byte_by_byte_hw(

return(~crc);
}
#endif /* defined(__GNUC__) && defined(__x86_64__) */
#endif /* UNIV_CRC32_HW */

/* CRC32 software implementation. */

Expand Down Expand Up @@ -782,7 +908,8 @@ void
ut_crc32_init()
/*===========*/
{
bool ut_crc32_sse2_enabled = false;
bool ut_crc32_hw_enabled = false;
#if !defined(UNIV_DEBUG_VALGRIND) && defined(UNIV_CRC32_HW)
#if defined(__GNUC__) && defined(__x86_64__)
uint32_t vend[3];
uint32_t model;
Expand Down Expand Up @@ -810,19 +937,24 @@ ut_crc32_init()
probably kill your program.
*/
#ifndef UNIV_DEBUG_VALGRIND
ut_crc32_sse2_enabled = (features_ecx >> 20) & 1;
#endif /* UNIV_DEBUG_VALGRIND */
ut_crc32_hw_enabled = (features_ecx >> 20) & 1;
#endif /* defined(__GNUC__) && defined(__x86_64__) */

#if defined(__GNUC__) && defined(__aarch64__)
unsigned long hwcap;

if (ut_crc32_sse2_enabled) {
ut_cpuid(&hwcap);
ut_crc32_hw_enabled = hwcap & HWCAP_CRC32;
#endif /* defined (__GNUC__) && defined(__aarch64__) */

if (ut_crc32_hw_enabled) {
ut_crc32 = ut_crc32_hw;
ut_crc32_ex = ut_crc32_hw_ex;
ut_crc32_legacy_big_endian = ut_crc32_legacy_big_endian_hw;
ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_hw;
ut_crc32_implementation = "SSE2 crc32 instructions";
ut_crc32_implementation = "Hardware crc32 instructions";
}

#endif /* defined(__GNUC__) && defined(__x86_64__) */
#endif /* !defined(UNIV_DEBUG_VALGRIND) && defined(UNIV_CRC32_HW) */

#if defined(__linux__) && defined(__powerpc__) && defined(AT_HWCAP2)
if (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) {
Expand All @@ -835,7 +967,7 @@ ut_crc32_init()
ut_crc32_slice8_table_init();
} else
#endif /* defined(__linux__) && defined(__powerpc__) */
if (!ut_crc32_sse2_enabled) {
if (!ut_crc32_hw_enabled) {
ut_crc32_slice8_table_init();
ut_crc32 = ut_crc32_sw;
ut_crc32_ex = ut_crc32_sw_ex;
Expand Down

0 comments on commit af62807

Please sign in to comment.