Skip to content
Permalink
Browse files
lib/arm/adler32: add NEON+dotprod implementation
This improves Adler-32 performance on large inputs:

    CPU	        Old (GB/s)  New (GB/s)
    =========   ==========  ==========
    Apple M1    51.5        61.1
    Cortex-X1   34.2        45.2
    Cortex-A78  16.9        23.3
    Cortex-A76  18.2        23.1
    Cortex-A55  2.4          4.1

This roughly follows the approach of the old AVX-2 implementation, which
I recently changed to a different approach.  But vdotq_u32 (the udot
instruction) makes the approach work well on arm64.
  • Loading branch information
ebiggers committed Aug 8, 2022
1 parent 4a578bd commit d8b53901fb752fbfd9e496f5c21977be6dea0cc2
Showing 1 changed file with 109 additions and 6 deletions.
@@ -30,7 +30,9 @@

#include "cpu_features.h"

/* Regular NEON implementation */
#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
# define adler32_neon adler32_neon
# define FUNCNAME adler32_neon
# define FUNCNAME_CHUNK adler32_neon_chunk
# define IMPL_ALIGNMENT 16
@@ -140,18 +142,119 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}
# include "../adler32_vec_template.h"
# if HAVE_NEON_NATIVE
# define DEFAULT_IMPL adler32_neon
#endif /* Regular NEON implementation */

/* NEON+dotprod implementation */
#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
# define adler32_neon_dotprod adler32_neon_dotprod
# define FUNCNAME adler32_neon_dotprod
# define FUNCNAME_CHUNK adler32_neon_dotprod_chunk
# define IMPL_ALIGNMENT 16
# define IMPL_SEGMENT_LEN 64
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_DOTPROD_NATIVE
# define ATTRIBUTES
# else
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("dotprod")))
/*
* With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the
* default target is armv8.3-a or later in which case it must be omitted.
* armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
*/
# elif defined(__ARM_FEATURE_JCVT)
# define ATTRIBUTES __attribute__((target("+dotprod")))
# else
# define ATTRIBUTES __attribute__((target("arch=armv8.2-a+dotprod")))
# endif
# endif
# include <arm_neon.h>
static forceinline ATTRIBUTES void
adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end,
u32 *s1, u32 *s2)
{
const uint8x16_t mults_a = {
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
};
const uint8x16_t mults_b = {
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
};
const uint8x16_t mults_c = {
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
};
const uint8x16_t mults_d = {
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const uint8x16_t ones = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1,
};
uint32x4_t v_s1_a = { 0, 0, 0, 0 };
uint32x4_t v_s1_b = { 0, 0, 0, 0 };
uint32x4_t v_s1_c = { 0, 0, 0, 0 };
uint32x4_t v_s1_d = { 0, 0, 0, 0 };
uint32x4_t v_s2_a = { 0, 0, 0, 0 };
uint32x4_t v_s2_b = { 0, 0, 0, 0 };
uint32x4_t v_s2_c = { 0, 0, 0, 0 };
uint32x4_t v_s2_d = { 0, 0, 0, 0 };
uint32x4_t v_s1_sums_a = { 0, 0, 0, 0 };
uint32x4_t v_s1_sums_b = { 0, 0, 0, 0 };
uint32x4_t v_s1_sums_c = { 0, 0, 0, 0 };
uint32x4_t v_s1_sums_d = { 0, 0, 0, 0 };
uint32x4_t v_s1;
uint32x4_t v_s2;

do {
uint8x16_t bytes_a = *p++;
uint8x16_t bytes_b = *p++;
uint8x16_t bytes_c = *p++;
uint8x16_t bytes_d = *p++;

v_s1_sums_a += v_s1_a;
v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones);
v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a);

v_s1_sums_b += v_s1_b;
v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones);
v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b);

v_s1_sums_c += v_s1_c;
v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones);
v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c);

v_s1_sums_d += v_s1_d;
v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones);
v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d);
} while (p != end);

v_s1 = v_s1_a + v_s1_b + v_s1_c + v_s1_d;
v_s2 = v_s2_a + v_s2_b + v_s2_c + v_s2_d +
vqshlq_n_u32(v_s1_sums_a + v_s1_sums_b +
v_s1_sums_c + v_s1_sums_d, 6);
*s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}
# include "../adler32_vec_template.h"
#endif /* NEON+dotprod implementation */

#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE
#define DEFAULT_IMPL adler32_neon_dotprod
#else
static inline adler32_func_t
arch_select_adler32_func(void)
{
if (HAVE_NEON(get_arm_cpu_features()))
const u32 features MAYBE_UNUSED = get_arm_cpu_features();

#ifdef adler32_neon_dotprod
if (HAVE_NEON(features) && HAVE_DOTPROD(features))
return adler32_neon_dotprod;
#endif
#ifdef adler32_neon
if (HAVE_NEON(features))
return adler32_neon;
#endif
return NULL;
}
# define arch_select_adler32_func arch_select_adler32_func
# endif /* !HAVE_NEON_NATIVE */
#endif /* HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() */
#define arch_select_adler32_func arch_select_adler32_func
#endif

#endif /* LIB_ARM_ADLER32_IMPL_H */

0 comments on commit d8b5390

Please sign in to comment.