Port Intel optimizations (adler32, chunkcopy) to cloudflare (#23)

* Add SIMD SSSE3 implementation of the adler32 checksum

Based on the adler32-simd patch from Noel Gordon for the chromium fork of zlib.
17bbb3d73c84 ("zlib adler_simd.c")

Signed-off-by: Janakarajan Natarajan <janakan@amazon.com>
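
For reference, the recurrence that the SIMD code vectorizes, as a
minimal scalar sketch (not zlib's actual implementation, which defers
the expensive modulo using NMAX):

    #include <stddef.h>
    #include <stdint.h>

    /* Adler-32 over buf[0..len-1], BASE = 65521 (sketch only). */
    static uint32_t adler32_ref(uint32_t adler, const unsigned char *buf,
                                size_t len)
    {
        uint32_t s1 = adler & 0xffff;  /* running byte sum */
        uint32_t s2 = adler >> 16;     /* running sum of the s1 values */
        while (len--) {
            s1 = (s1 + *buf++) % 65521;
            s2 = (s2 + s1) % 65521;
        }
        return s1 | (s2 << 16);
    }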

* Port inflate chunk SIMD SSE2 improvements for cloudflare

Based on 2 patches from zlib chromium fork:

* Adenilson Cavalcanti (adenilson.cavalcanti@arm.com)
  3060dcb - "zlib: inflate using wider loads and stores"

* Noel Gordon (noel@chromium.org)
  64ffef0 - "Improve zlib inflate speed by using SSE2 chunk copy"

Inflate performance improves by roughly 15-35% depending on the
workload, as measured with a modified zpipe.c and the Silesia corpus.

Signed-off-by: Janakarajan Natarajan <janakan@amazon.com>
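
The core of the chunked copy, as a rough sketch (the helper name is
hypothetical; the real chunkcopy.h also handles overlapping matches):
copy whole 16-byte vectors and let the final store spill past len,
which is safe where the destination has padding (the patch
over-allocates the inflate window by CHUNKCOPY_CHUNK_SIZE for exactly
this reason):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Copy len (> 0) bytes in 16-byte chunks; out and from must not
     * overlap, and out may be written up to 15 bytes past out + len. */
    static inline uint8_t *chunk_memcpy_sketch(uint8_t *out,
                                               const uint8_t *from,
                                               unsigned len)
    {
        uint8_t *end = out + len;
        do {
            _mm_storeu_si128((__m128i *)out,
                             _mm_loadu_si128((const __m128i *)from));
            out += 16;
            from += 16;
        } while (out < end);
        return end;
    }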
janaknat committed Sep 28, 2020
1 parent e76d32d commit 82035d0
Showing 6 changed files with 246 additions and 10 deletions.
5 changes: 5 additions & 0 deletions Makefile.in
@@ -63,6 +63,11 @@ ifneq ($(findstring -DINFLATE_CHUNK_SIMD_NEON, $(CFLAGS)),)
PIC_OBJZ += inffast_chunk.lo
endif

ifneq ($(findstring -DINFLATE_CHUNK_SIMD_SSE2, $(CFLAGS)),)
OBJZ += inffast_chunk.o
PIC_OBJZ += inffast_chunk.lo
endif

ifneq ($(findstring -DHAS_PCLMUL, $(CFLAGS)),)
OBJZ += crc32_simd.o
PIC_OBJZ += crc32_simd.lo
4 changes: 2 additions & 2 deletions adler32.c
@@ -61,7 +61,7 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
# define MOD63(a) a %= BASE
#endif

#if defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_SSSE3)
#include "adler32_simd.h"
#endif

@@ -74,7 +74,7 @@ uLong ZEXPORT adler32(adler, buf, len)
unsigned long sum2;
unsigned n;

#if defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_SSSE3)
if (buf && len >= 64)
return adler32_simd_(adler, buf, len);
#endif
151 changes: 149 additions & 2 deletions adler32_simd.c
@@ -70,7 +70,154 @@
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
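/*
 * Sanity check with BASE - 1 = 65520: n = 5552 gives
 * 255*5552*5553/2 + 5553*65520 = 3,930,857,640 + 363,832,560
 * = 4,294,690,200 <= 2^32 - 1, while n = 5553 gives 4,296,171,735,
 * which no longer fits in 32 bits.
 */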

#if defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_SSSE3)

#include <tmmintrin.h>

uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;

/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;

unsigned long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;

while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = (unsigned) blocks;
blocks -= n;

const __m128i tap1 =
_mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
const __m128i tap2 =
_mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i zero =
_mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i ones =
_mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);
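
/*
 * tap1/tap2 hold the weights 32..17 and 16..1: a byte at offset i of
 * the 32-byte block is added to s2 a total of (32 - i) times by the
 * end of the block.
 */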

/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);

do {
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));

/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm_add_epi32(v_ps, v_s1);

/*
* Horizontally add the bytes for s1; multiply-add the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));

v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));

buf += BLOCK_SIZE;

} while (--n);
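
/*
 * Each of the 32 bytes in a block adds that block's starting s1 to s2
 * exactly once; v_ps accumulated those per-block starting values, so
 * scale it by BLOCK_SIZE = 32 (shift left by 5) before folding it in.
 */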

v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));

/*
* Horizontally sum the epi32 lanes of v_s1 (v_s2) and accumulate into s1 (s2).
*/

#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */

v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));

s1 += _mm_cvtsi128_si32(v_s1);

v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));

s2 = _mm_cvtsi128_si32(v_s2);

#undef S23O1
#undef S1O32

/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}

/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

len -= 16;
}

while (len--) {
s2 += (s1 += *buf++);
}

if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}

/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}

#elif defined(ADLER32_SIMD_NEON)

#include <arm_neon.h>

@@ -237,4 +384,4 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
return s1 | (s2 << 16);
}

#endif /* ADLER32_SIMD_NEON */
#endif /* ADLER32_SIMD_SSSE3 */
51 changes: 51 additions & 0 deletions chunkcopy.h
@@ -50,6 +50,11 @@
#if defined(INFLATE_CHUNK_SIMD_NEON)
#include <arm_neon.h>
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif
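
/*
 * z_vec128i_t gives the generic chunk-copy code one 128-bit vector
 * type; only the load/store/splat helpers below differ per target.
 */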

@@ -220,6 +225,52 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
static inline void v_store_128(void* out, const z_vec128i_t vec) {
vst1q_u8(out, vec);
}
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
int64_t i64;
Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
return _mm_set1_epi64x(i64);
}

/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t i32;
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
return _mm_set1_epi32(i32);
}

/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t i16;
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
return _mm_set1_epi16(i16);
}

/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
return _mm_set1_epi8(*(const char*)src);
}

/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#endif
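
/*
 * Illustrative sketch, not part of this diff (the function name is
 * hypothetical): with the helpers above, an LZ77 match at distance 1
 * becomes a byte splat written 16 bytes per store rather than a
 * byte-at-a-time copy.
 */
static inline unsigned char *sketch_copy_dist1(unsigned char *out,
                                               const unsigned char *from,
                                               unsigned len) {
  z_vec128i_t v = v_load8_dup(from);  /* broadcast the one source byte */
  while (len >= 16) {
    v_store_128(out, v);              /* unaligned 16-byte store */
    out += 16;
    len -= 16;
  }
  while (len--)                       /* scalar tail */
    *out++ = *from;
  return out;
}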

33 changes: 33 additions & 0 deletions configure
@@ -745,6 +745,39 @@ fi
# Check for AMD64 hardware support.
if [ x$TGT_ARCH = "xx86_64" -o x$TGT_ARCH = "xamd64" ] ; then

cat > $test.c << EOF
#include <emmintrin.h>
void foo(void) {
__m64 a, b;
_mm_add_si64(a, b);
}
EOF
if try $CC -msse2 $CFLAGS $test.c -c $test; then
CFLAGS="-DINFLATE_CHUNK_SIMD_SSE2 -msse2 -DINFLATE_CHUNK_READ_64LE $CLFAGS"

This comment has been minimized.

Copy link
@vkrasnov

vkrasnov Sep 29, 2020

Here is the typo

SFLAGS="-DINFLATE_CHUNK_SIMD_SSE2 -msse2 -DINFLATE_CHUNK_READ_64LE $SFLAGS"
echo "Checking for SSE2 support ... Yes" | tee -a configure.log
else
echo "Checking for SSE2 support ... No" | tee -a configure.log
leave 1
fi

# Check for SSSE3 support
cat > $test.c << EOF
#include <tmmintrin.h>
void foo(void) {
__m128i a;
_mm_abs_epi8(a);
}
EOF
if try $CC -mssse3 $CFLAGS $test.c -c $test; then
CFLAGS="-DADLER32_SIMD_SSSE3 -mssse3 $CFLAGS"
SFLAGS="-DADLER32_SIMD_SSSE3 -mssse3 $SFLAGS"
echo "Checking for SSSE3 support ... Yes" | tee -a configure.log
else
echo "Checking for SSSE3 support ... No" | tee -a configure.log
leave 1
fi

# Check for SSE4.2 and CRC support
cat > $test.c << EOF
#include <immintrin.h>
12 changes: 6 additions & 6 deletions inflate.c
@@ -84,7 +84,7 @@
#include "inftrees.h"
#include "inflate.h"

#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#include "inffast_chunk.h"
#include "chunkcopy.h"
#else
@@ -411,7 +411,7 @@ unsigned copy;

/* if it hasn't been done already, allocate space for the window */
if (state->window == Z_NULL) {
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
unsigned wsize = 1U << state->wbits;
state->window = (unsigned char FAR *)
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
@@ -431,7 +431,7 @@ unsigned copy;
ZALLOC(strm, 1U << state->wbits,
sizeof(unsigned char));
if (state->window == Z_NULL) return 1;
#endif /* INFLATE_CHUNK_SIMD_NEON */
#endif /* INFLATE_CHUNK_SIMD */
}
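
/*
 * The extra CHUNKCOPY_CHUNK_SIZE bytes of window let the chunked copy
 * loops store a full 16-byte vector even when fewer output bytes
 * remain, without writing outside the allocation.
 */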

/* if window not in use yet, initialize */
@@ -1066,7 +1066,7 @@ int flush;
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
RESTORE();
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
inflate_fast_chunk_(strm, out);
#else
inflate_fast(strm, out);
@@ -1201,7 +1201,7 @@ int flush;
else
from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
if (copy > left) copy = left;
put = chunkcopy_safe(put, from, copy, put + left);
}
@@ -1290,7 +1290,7 @@ int flush;
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
#ifdef INFLATE_CHUNK_SIMD_NEON
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
/* We write a defined value in the unused space to help mark
* where the stream has ended. We don't use zeros as that can
* mislead clients relying on undefined behavior (i.e. assuming