Add adler32 implementation using ARMv8.2-A dotprod extension (~1% spe…

…edup) This extension is enabled by default from ARMv8.4-A. This commit only enables it on macOS. This implementation runs at ~60GB/s on an Apple M1, but the existing NEON implementation is already very fast, so only a ~1% gain shows in inflate benchmarks.
dougallj · Aug 19, 2022 · 078f36b · 078f36b
1 parent f23fa25
commit 078f36b
Showing 1 changed file with 157 additions and 0 deletions.
diff --git a/adler32_simd.c b/adler32_simd.c
@@ -219,8 +219,164 @@ uint32_t ZLIB_INTERNAL adler32_simd_(  /* SSSE3 */
 
 #elif defined(ADLER32_SIMD_NEON)
 
+/* __APPLE__ is insufficent, as older iOS devices will not support UDOT,
+   however all NEON-supporting macOS devices will. */
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#if TARGET_OS_MAC
+#define ADLER32_SIMD_NEON_UDOT
+#define __ARM_FEATURE_DOTPROD
+#endif
+#endif
+
 #include <arm_neon.h>
 
+#ifdef ADLER32_SIMD_NEON_UDOT
+
+uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
+    uint32_t adler,
+    const unsigned char *buf,
+    unsigned long len)
+{
+    /*
+     * Split Adler-32 into component sums.
+     */
+    uint64_t s1 = adler & 0xffff;
+    uint64_t s2 = adler >> 16;
+
+    /*
+     * Serially compute s1 & s2, until the data is 16-byte aligned.
+     */
+    if ((uintptr_t)buf & 15) {
+        while ((uintptr_t)buf & 15) {
+            s2 += (s1 += *buf++);
+            --len;
+        }
+
+        if (s1 >= BASE)
+            s1 -= BASE;
+        s2 %= BASE;
+    }
+
+    /*
+     * Process the data in blocks.
+     */
+    const unsigned BLOCK_SIZE = 1 << 6;
+
+    unsigned long blocks = len / BLOCK_SIZE;
+    len -= blocks * BLOCK_SIZE;
+
+    while (blocks)
+    {
+        unsigned n = 2902; /* Maximum blocks. */
+        if (n > blocks)
+            n = blocks;
+        blocks -= n;
+
+        /*
+         * Process n blocks of data. At most 2902 blocks can be
+         * processed before s2 must be reduced modulo BASE. This
+         * is greater than NMAX bytes as we're using 64-bit
+         * integers.
+         */
+        const unsigned char MULTIPLIERS[0x40] = {
+            0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39,
+            0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31,
+            0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29,
+            0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21,
+            0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19,
+            0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11,
+            0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09,
+            0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
+        };
+        uint8x16x4_t mul = vld1q_u8_x4(MULTIPLIERS);
+
+        uint32x4_t accs[4] = { 0 };
+        uint32x4_t sums[4] = { 0 };
+        uint32x4_t extras[4] = { 0 };
+
+        const unsigned char *end = buf + n * BLOCK_SIZE;
+        do {
+            /*
+             * Load 64 input bytes.
+             */
+            uint8x16x4_t raw = vld1q_u8_x4(buf);
+            buf += BLOCK_SIZE;
+
+            for (int i = 0; i < 4; i++) {
+                accs[i] = vaddq_u32(accs[i], sums[i]);
+                sums[i] = vdotq_u32(sums[i], raw.val[i], vdupq_n_u8(1));
+                extras[i] = vdotq_u32(extras[i], raw.val[i], mul.val[i]);
+            }
+        } while (buf != end);
+
+        for (int i = 1; i < 4; i++) {
+            extras[0] = vaddq_u32(extras[0], extras[i]);
+            sums[0] = vaddq_u32(sums[0], sums[i]);
+        }
+
+        uint64_t acc = 0;
+        for (int i = 0; i < 4; i++) {
+            acc += vaddlvq_u32(accs[i]);
+        }
+        uint64_t extra = vaddlvq_u32(extras[0]);
+        uint64_t sum = vaddlvq_u32(sums[0]);
+
+        s2 += s1 * n * BLOCK_SIZE + acc * BLOCK_SIZE + extra;
+        s1 += sum;
+
+        /*
+         * Reduce.
+         */
+        s1 %= BASE;
+        s2 %= BASE;
+    }
+
+    /*
+     * Handle leftover data.
+     */
+    if (len) {
+        while (len >= 16) {
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+            s2 += (s1 += *buf++);
+
+            len -= 16;
+        }
+
+        while (len--) {
+            s2 += (s1 += *buf++);
+        }
+
+        if (s1 >= BASE)
+            s1 -= BASE;
+        s2 %= BASE;
+    }
+
+    /*
+     * Return the recombined sums.
+     */
+    return s1 | (s2 << 16);
+}
+
+#else  /* ADLER32_SIMD_NEON_UDOT */
+
 uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
     uint32_t adler,
     const unsigned char *buf,
@@ -383,5 +539,6 @@ uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
      */
     return s1 | (s2 << 16);
 }
+#endif  /* ADLER32_SIMD_NEON_UDOT */
 
 #endif  /* ADLER32_SIMD_SSSE3 */