Port Intel optimizations (adler32, chunkcopy) to cloudflare (#23)

* Add SIMD SSSE3 implementation of the adler32 checksum

Based on the adler32-simd patch from Noel Gordon for the chromium fork of zlib.
17bbb3d73c84 ("zlib adler_simd.c")

Signed-off-by: Janakarajan Natarajan <janakan@amazon.com>
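
For reference, the recurrence that the SIMD code vectorizes, as a
minimal scalar sketch (not zlib's actual implementation, which defers
the expensive modulo using NMAX):

    #include <stddef.h>
    #include <stdint.h>

    /* Adler-32 over buf[0..len-1], BASE = 65521 (sketch only). */
    static uint32_t adler32_ref(uint32_t adler, const unsigned char *buf,
                                size_t len)
    {
        uint32_t s1 = adler & 0xffff;  /* running byte sum */
        uint32_t s2 = adler >> 16;     /* running sum of the s1 values */
        while (len--) {
            s1 = (s1 + *buf++) % 65521;
            s2 = (s2 + s1) % 65521;
        }
        return s1 | (s2 << 16);
    }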

* Port inflate chunk SIMD SSE2 improvements for cloudflare

Based on 2 patches from zlib chromium fork:

* Adenilson Cavalcanti (adenilson.cavalcanti@arm.com)
  3060dcb - "zlib: inflate using wider loads and stores"

* Noel Gordon (noel@chromium.org)
  64ffef0 - "Improve zlib inflate speed by using SSE2 chunk copy"

Inflate performance improves by roughly 15-35% depending on the
workload, as measured with a modified zpipe.c and the Silesia corpus.

Signed-off-by: Janakarajan Natarajan <janakan@amazon.com>
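
The core of the chunked copy, as a rough sketch (the helper name is
hypothetical; the real chunkcopy.h also handles overlapping matches):
copy whole 16-byte vectors and let the final store spill past len,
which is safe where the destination has padding (the patch
over-allocates the inflate window by CHUNKCOPY_CHUNK_SIZE for exactly
this reason):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Copy len (> 0) bytes in 16-byte chunks; out and from must not
     * overlap, and out may be written up to 15 bytes past out + len. */
    static inline uint8_t *chunk_memcpy_sketch(uint8_t *out,
                                               const uint8_t *from,
                                               unsigned len)
    {
        uint8_t *end = out + len;
        do {
            _mm_storeu_si128((__m128i *)out,
                             _mm_loadu_si128((const __m128i *)from));
            out += 16;
            from += 16;
        } while (out < end);
        return end;
    }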
janaknat committed Sep 28, 2020
1 parent e76d32d commit 82035d0
Showing 6 changed files with 246 additions and 10 deletions.
5 changes: 5 additions & 0 deletions Makefile.in
@@ -63,6 +63,11 @@ ifneq ($(findstring -DINFLATE_CHUNK_SIMD_NEON, $(CFLAGS)),)
PIC_OBJZ += inffast_chunk.lo
endif

ifneq ($(findstring -DINFLATE_CHUNK_SIMD_SSE2, $(CFLAGS)),)
OBJZ += inffast_chunk.o
PIC_OBJZ += inffast_chunk.lo
endif

ifneq ($(findstring -DHAS_PCLMUL, $(CFLAGS)),)
OBJZ += crc32_simd.o
PIC_OBJZ += crc32_simd.lo
4 changes: 2 additions & 2 deletions adler32.c
@@ -61,7 +61,7 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
# define MOD63(a) a %= BASE
#endif

#if defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_SSSE3)
#include "adler32_simd.h"
#endif

@@ -74,7 +74,7 @@ uLong ZEXPORT adler32(adler, buf, len)
unsigned long sum2;
unsigned n;

#if defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_SSSE3)
if (buf && len >= 64)
return adler32_simd_(adler, buf, len);
#endif
151 changes: 149 additions & 2 deletions adler32_simd.c
@@ -70,7 +70,154 @@
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
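/*
 * Sanity check with BASE - 1 = 65520: n = 5552 gives
 * 255*5552*5553/2 + 5553*65520 = 3,930,857,640 + 363,832,560
 * = 4,294,690,200 <= 2^32 - 1, while n = 5553 gives 4,296,171,735,
 * which no longer fits in 32 bits.
 */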

#if defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_SSSE3)

#include <tmmintrin.h>

uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;

/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;

unsigned long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;

while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = (unsigned) blocks;
blocks -= n;

const __m128i tap1 =
_mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
const __m128i tap2 =
_mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i zero =
_mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i ones =
_mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);
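
/*
 * tap1/tap2 hold the weights 32..17 and 16..1: a byte at offset i of
 * the 32-byte block is added to s2 a total of (32 - i) times by the
 * end of the block.
 */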

/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);

do {
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));

/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm_add_epi32(v_ps, v_s1);

/*
* Horizontally add the bytes for s1; multiply-add the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));

v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));

buf += BLOCK_SIZE;

} while (--n);
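
/*
 * Each of the 32 bytes in a block adds that block's starting s1 to s2
 * exactly once; v_ps accumulated those per-block starting values, so
 * scale it by BLOCK_SIZE = 32 (shift left by 5) before folding it in.
 */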

v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));

/*
* Horizontally sum the epi32 lanes of v_s1 (v_s2) and accumulate into s1 (s2).
*/

#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */

v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));

s1 += _mm_cvtsi128_si32(v_s1);

v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));

s2 = _mm_cvtsi128_si32(v_s2);

#undef S23O1
#undef S1O32

/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}

/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

len -= 16;
}

while (len--) {
s2 += (s1 += *buf++);
}

if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}

/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}

#elif defined(ADLER32_SIMD_NEON)

#include <arm_neon.h>

@@ -237,4 +384,4 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
return s1 | (s2 << 16);
}

#endif /* ADLER32_SIMD_NEON */
#endif /* ADLER32_SIMD_SSSE3 */
51 changes: 51 additions & 0 deletions chunkcopy.h
@@ -50,6 +50,11 @@
#if defined(INFLATE_CHUNK_SIMD_NEON)
#include <arm_neon.h>
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif
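
/*
 * z_vec128i_t gives the generic chunk-copy code one 128-bit vector
 * type; only the load/store/splat helpers below differ per target.
 */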

@@ -220,6 +225,52 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
static inline void v_store_128(void* out, const z_vec128i_t vec) {
vst1q_u8(out, vec);
}
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
int64_t i64;
Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
return _mm_set1_epi64x(i64);
}

/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t i32;
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
return _mm_set1_epi32(i32);
}

/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t i16;
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
return _mm_set1_epi16(i16);
}

/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
return _mm_set1_epi8(*(const char*)src);
}

/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#endif
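
/*
 * Illustrative sketch, not part of this diff (the function name is
 * hypothetical): with the helpers above, an LZ77 match at distance 1
 * becomes a byte splat written 16 bytes per store rather than a
 * byte-at-a-time copy.
 */
static inline unsigned char *sketch_copy_dist1(unsigned char *out,
                                               const unsigned char *from,
                                               unsigned len) {
  z_vec128i_t v = v_load8_dup(from);  /* broadcast the one source byte */
  while (len >= 16) {
    v_store_128(out, v);              /* unaligned 16-byte store */
    out += 16;
    len -= 16;
  }
  while (len--)                       /* scalar tail */
    *out++ = *from;
  return out;
}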

33 changes: 33 additions & 0 deletions configure
@@ -745,6 +745,39 @@ fi
# Check for AMD64 hardware support.
if [ x$TGT_ARCH = "xx86_64" -o x$TGT_ARCH = "xamd64" ] ; then

cat > $test.c << EOF
#include <emmintrin.h>
void foo(void) {
__m64 a, b;
_mm_add_si64(a, b);
}
EOF
if try $CC -msse2 $CFLAGS $test.c -c $test; then
CFLAGS="-DINFLATE_CHUNK_SIMD_SSE2 -msse2 -DINFLATE_CHUNK_READ_64LE $CLFAGS"

This comment has been minimized.

Copy link
@vkrasnov

vkrasnov Sep 29, 2020

Here is the typo

SFLAGS="-DINFLATE_CHUNK_SIMD_SSE2 -msse2 -DINFLATE_CHUNK_READ_64LE $SFLAGS"
echo "Checking for SSE2 support ... Yes" | tee -a configure.log
else
echo "Checking for SSE2 support ... No" | tee -a configure.log
leave 1
fi

# Check for SSSE3 support
cat > $test.c << EOF
#include <tmmintrin.h>
void foo(void) {
__m128i a;
_mm_abs_epi8(a);
}
EOF
if try $CC -mssse3 $CFLAGS $test.c -c $test; then
CFLAGS="-DADLER32_SIMD_SSSE3 -mssse3 $CFLAGS"
SFLAGS="-DADLER32_SIMD_SSSE3 -mssse3 $SFLAGS"
echo "Checking for SSSE3 support ... Yes" | tee -a configure.log
else
echo "Checking for SSSE3 support ... No" | tee -a configure.log
leave 1
fi

# Check for SSE4.2 and CRC support
cat > $test.c << EOF
#include <immintrin.h>
12 changes: 6 additions & 6 deletions inflate.c
@@ -84,7 +84,7 @@
#include "inftrees.h"
#include "inflate.h"

#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#include "inffast_chunk.h"
#include "chunkcopy.h"
#else
@@ -411,7 +411,7 @@ unsigned copy;

/* if it hasn't been done already, allocate space for the window */
if (state->window == Z_NULL) {
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
unsigned wsize = 1U << state->wbits;
state->window = (unsigned char FAR *)
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
@@ -431,7 +431,7 @@ unsigned copy;
ZALLOC(strm, 1U << state->wbits,
sizeof(unsigned char));
if (state->window == Z_NULL) return 1;
#endif /* INFLATE_CHUNK_SIMD_NEON */
#endif /* INFLATE_CHUNK_SIMD */
}
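
/*
 * The extra CHUNKCOPY_CHUNK_SIZE bytes of window let the chunked copy
 * loops store a full 16-byte vector even when fewer output bytes
 * remain, without writing outside the allocation.
 */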

/* if window not in use yet, initialize */
@@ -1066,7 +1066,7 @@ int flush;
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
RESTORE();
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
inflate_fast_chunk_(strm, out);
#else
inflate_fast(strm, out);
@@ -1201,7 +1201,7 @@ int flush;
else
from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
if (copy > left) copy = left;
put = chunkcopy_safe(put, from, copy, put + left);
}
@@ -1290,7 +1290,7 @@ int flush;
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
/* We write a defined value in the unused space to help mark
* where the stream has ended. We don't use zeros as that can
* mislead clients relying on undefined behavior (i.e. assuming