Skip to content

Commit

Permalink
Add adler32 implementation using ARMv8.2-A dotprod extension (~1% spe…
Browse files Browse the repository at this point in the history
…edup)

This extension is enabled by default from ARMv8.4-A. This commit
only enables it on macOS.

This implementation runs at ~60GB/s on an Apple M1, but the existing
NEON implementation is already very fast, so only a ~1% gain shows
in inflate benchmarks.
  • Loading branch information
dougallj committed Aug 19, 2022
1 parent f23fa25 commit 078f36b
Showing 1 changed file with 157 additions and 0 deletions.
157 changes: 157 additions & 0 deletions adler32_simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,164 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */

#elif defined(ADLER32_SIMD_NEON)

/* __APPLE__ is insufficent, as older iOS devices will not support UDOT,
however all NEON-supporting macOS devices will. */
#ifdef __APPLE__
#include <TargetConditionals.h>
#if TARGET_OS_MAC
#define ADLER32_SIMD_NEON_UDOT
#define __ARM_FEATURE_DOTPROD
#endif
#endif

#include <arm_neon.h>

#ifdef ADLER32_SIMD_NEON_UDOT

uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/*
* Split Adler-32 into component sums.
*/
uint64_t s1 = adler & 0xffff;
uint64_t s2 = adler >> 16;

/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
if ((uintptr_t)buf & 15) {
while ((uintptr_t)buf & 15) {
s2 += (s1 += *buf++);
--len;
}

if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}

/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 6;

unsigned long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;

while (blocks)
{
unsigned n = 2902; /* Maximum blocks. */
if (n > blocks)
n = blocks;
blocks -= n;

/*
* Process n blocks of data. At most 2902 blocks can be
* processed before s2 must be reduced modulo BASE. This
* is greater than NMAX bytes as we're using 64-bit
* integers.
*/
const unsigned char MULTIPLIERS[0x40] = {
0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39,
0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31,
0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29,
0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21,
0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19,
0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11,
0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09,
0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
};
uint8x16x4_t mul = vld1q_u8_x4(MULTIPLIERS);

uint32x4_t accs[4] = { 0 };
uint32x4_t sums[4] = { 0 };
uint32x4_t extras[4] = { 0 };

const unsigned char *end = buf + n * BLOCK_SIZE;
do {
/*
* Load 64 input bytes.
*/
uint8x16x4_t raw = vld1q_u8_x4(buf);
buf += BLOCK_SIZE;

for (int i = 0; i < 4; i++) {
accs[i] = vaddq_u32(accs[i], sums[i]);
sums[i] = vdotq_u32(sums[i], raw.val[i], vdupq_n_u8(1));
extras[i] = vdotq_u32(extras[i], raw.val[i], mul.val[i]);
}
} while (buf != end);

for (int i = 1; i < 4; i++) {
extras[0] = vaddq_u32(extras[0], extras[i]);
sums[0] = vaddq_u32(sums[0], sums[i]);
}

uint64_t acc = 0;
for (int i = 0; i < 4; i++) {
acc += vaddlvq_u32(accs[i]);
}
uint64_t extra = vaddlvq_u32(extras[0]);
uint64_t sum = vaddlvq_u32(sums[0]);

s2 += s1 * n * BLOCK_SIZE + acc * BLOCK_SIZE + extra;
s1 += sum;

/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}

/*
* Handle leftover data.
*/
if (len) {
while (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

len -= 16;
}

while (len--) {
s2 += (s1 += *buf++);
}

if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}

/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}

#else /* ADLER32_SIMD_NEON_UDOT */

uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
uint32_t adler,
const unsigned char *buf,
Expand Down Expand Up @@ -383,5 +539,6 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
*/
return s1 | (s2 << 16);
}
#endif /* ADLER32_SIMD_NEON_UDOT */

#endif /* ADLER32_SIMD_SSSE3 */

0 comments on commit 078f36b

Please sign in to comment.