From cda87265d5b2b9c9da0934770451ffdb47d5f4d0 Mon Sep 17 00:00:00 2001 From: Alan Kelly Date: Tue, 7 May 2024 12:08:56 -0700 Subject: [PATCH] scalar qs8 rsum accumulating microkernels PiperOrigin-RevId: 631504843 --- CMakeLists.txt | 6 + cmake/gen/scalar_microkernels.cmake | 3 + gen/scalar_microkernels.bzl | 3 + scripts/generate-qs8-rsum.sh | 10 + scripts/generate-tests.sh | 1 + ...-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c | 47 ++++ ...-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c | 55 +++++ ...-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c | 62 ++++++ src/qs8-rsum/scalar.c.in | 99 +++++++++ src/xnnpack/microfnptr.h | 6 + src/xnnpack/reduce.h | 11 + test/BUILD.bazel | 9 + test/f16-f32acc-rsum.cc | 119 +++++++++++ test/f16-rsum.cc | 35 +++ test/f32-rsum.cc | 200 ++++++++++++++++++ test/qs8-rsum-minmax-fp32.cc | 138 ++++++++++++ test/qs8-rsum-minmax-fp32.yaml | 12 ++ test/rsum-microkernel-tester.h | 95 +++++++++ tools/generate-reduce-test.py | 32 ++- 19 files changed, 934 insertions(+), 9 deletions(-) create mode 100755 scripts/generate-qs8-rsum.sh create mode 100644 src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c create mode 100644 src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c create mode 100644 src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c create mode 100644 src/qs8-rsum/scalar.c.in create mode 100644 test/qs8-rsum-minmax-fp32.cc create mode 100644 test/qs8-rsum-minmax-fp32.yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index 67ad1ccdb99f..bb2f6562ac48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2991,6 +2991,12 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(qs8-requantization-test PRIVATE hardware-config logging microkernels-all) ADD_TEST(NAME qs8-requantization-test COMMAND qs8-requantization-test) + ADD_EXECUTABLE(qs8-rsum-minmax-fp32-test test/qs8-rsum-minmax-fp32.cc) + TARGET_INCLUDE_DIRECTORIES(qs8-rsum-minmax-fp32-test PRIVATE include src test) + TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main microparams-init) + TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE hardware-config logging microkernels-all) + ADD_TEST(NAME qs8-rsum-minmax-fp32-test COMMAND qs8-rsum-minmax-fp32-test) + ADD_EXECUTABLE(qs8-vadd-minmax-test test/qs8-vadd-minmax.cc) SET_TARGET_PROPERTIES(qs8-vadd-minmax-test PROPERTIES CXX_EXTENSIONS YES) TARGET_INCLUDE_DIRECTORIES(qs8-vadd-minmax-test PRIVATE include src test) diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index c634701aa38c..36182bcf7dc3 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -827,6 +827,9 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c src/qs8-requantization/qs8-requantization-rndnu-scalar.c + src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c + src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c + src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u1.c src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u4.c diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index 55ad98945864..6e662cee443a 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -823,6 +823,9 @@ ALL_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c", 
"src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c", "src/qs8-requantization/qs8-requantization-rndnu-scalar.c", + "src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c", + "src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c", + "src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c", "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u1.c", "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c", "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u4.c", diff --git a/scripts/generate-qs8-rsum.sh b/scripts/generate-qs8-rsum.sh new file mode 100755 index 000000000000..26fa38beb738 --- /dev/null +++ b/scripts/generate-qs8-rsum.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +#################################### Scalar ################################### +tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=1 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c & +tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=2 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c & +tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=4 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c & diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 4a4c0a396549..a92c6e4fccf6 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -241,6 +241,7 @@ tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-r tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-rmin.yaml --output test/f32-rmin.cc & tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-rminmax.yaml --output test/f32-rminmax.cc & +tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/qs8-rsum-minmax-fp32.yaml --output test/qs8-rsum-minmax-fp32.cc & tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/f32-rsum.yaml --output test/f32-rsum.cc & tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/u8-rmax.yaml --output test/u8-rmax.cc & diff --git a/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c new file mode 100644 index 000000000000..7c2b294bcf5b --- /dev/null +++ b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-rsum/scalar.c.in +// Generator: tools/xngen +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include <assert.h> + +#include <xnnpack/common.h> +#include <xnnpack/math.h> +#include <xnnpack/reduce.h> + + +void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1( + size_t batch, + const int8_t* restrict input, + int8_t* restrict output, + const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; + int32_t vacc0 = vinit_bias; + do { + const int32_t vt = (int32_t) *input++; + vacc0 += vt; + batch -= sizeof(int8_t); + } while (batch != 0); + + const float vscale = params->fp32_scalar_imagic.scale; + const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; + const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; + const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; + const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; + + float vfpacc = (float) vacc0 * vscale; + vfpacc += vmagic_bias; + int32_t vout = (int32_t) float_as_uint32(vfpacc); + vout = math_max_s32(vout, vmagic_min); + vout = math_min_s32(vout, vmagic_max); + vout -= vmagic_bias_less_zero_point; + + *output += (int8_t) vout; +} diff --git a/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c new file mode 100644 index 000000000000..98e4eac270c2 --- /dev/null +++ b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c @@ -0,0 +1,55 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-rsum/scalar.c.in +// Generator: tools/xngen +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <xnnpack/common.h> +#include <xnnpack/math.h> +#include <xnnpack/reduce.h> + + +void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2( + size_t batch, + const int8_t* restrict input, + int8_t* restrict output, + const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; + int32_t vacc0 = vinit_bias; + for (; batch >= 2; batch -= 2) { + const int32_t vt0 = (int32_t) input[0]; + const int32_t vt1 = (int32_t) input[1]; + input += 2; + + vacc0 += vt0; + vacc0 += vt1; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t vt = (int32_t) *input; + vacc0 += vt; + } + + const float vscale = params->fp32_scalar_imagic.scale; + const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; + const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; + const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; + const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; + + float vfpacc = (float) vacc0 * vscale; + vfpacc += vmagic_bias; + int32_t vout = (int32_t) float_as_uint32(vfpacc); + vout = math_max_s32(vout, vmagic_min); + vout = math_min_s32(vout, vmagic_max); + vout -= vmagic_bias_less_zero_point; + + *output += (int8_t) vout; +} diff --git a/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c new file mode 100644 index 000000000000..fe3136ab20ef --- /dev/null +++ b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c @@ -0,0 +1,62 @@ +// Auto-generated file. Do not edit! 
+// Template: src/qs8-rsum/scalar.c.in +// Generator: tools/xngen +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <xnnpack/common.h> +#include <xnnpack/math.h> +#include <xnnpack/reduce.h> + + +void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4( + size_t batch, + const int8_t* restrict input, + int8_t* restrict output, + const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; + int32_t vacc0 = vinit_bias; + for (; batch >= 4; batch -= 4) { + const int32_t vt0 = (int32_t) input[0]; + const int32_t vt1 = (int32_t) input[1]; + const int32_t vt2 = (int32_t) input[2]; + const int32_t vt3 = (int32_t) input[3]; + input += 4; + + vacc0 += vt0; + vacc0 += vt1; + vacc0 += vt2; + vacc0 += vt3; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t vt = (int32_t) *input++; + vacc0 += vt; + batch -= sizeof(int8_t); + } while (batch != 0); + } + + const float vscale = params->fp32_scalar_imagic.scale; + const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; + const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; + const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; + const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; + + float vfpacc = (float) vacc0 * vscale; + vfpacc += vmagic_bias; + int32_t vout = (int32_t) float_as_uint32(vfpacc); + vout = math_max_s32(vout, vmagic_min); + vout = math_min_s32(vout, vmagic_max); + vout -= vmagic_bias_less_zero_point; + + *output += (int8_t) vout; +} diff --git a/src/qs8-rsum/scalar.c.in b/src/qs8-rsum/scalar.c.in new file mode 100644 index 000000000000..99c4e08d48c4 --- /dev/null +++ b/src/qs8-rsum/scalar.c.in @@ -0,0 +1,99 @@ +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
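+// +// (Descriptive note: CHANNEL_TILE sets the main-loop unroll factor, ACCUMULATORS the number of parallel int32 accumulators that are tree-reduced after the loop, and VARIANT the requantization path: FMAGIC, IMAGIC, or LRINTF.)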
+ +$assert CHANNEL_TILE >= 1 +$assert VARIANT in ("FMAGIC", "IMAGIC", "LRINTF") +#include <assert.h> + +#include <xnnpack/common.h> +#include <xnnpack/math.h> +#include <xnnpack/reduce.h> + + +$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower() +$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" +$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" +void xnn_qs8_rsum_minmax_${REQUANTIZATION.lower()}_ukernel__scalar_${VARIANT.lower()}_u${CHANNEL_TILE}( + size_t batch, + const int8_t* restrict input, + int8_t* restrict output, + const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias; + $for A in range(ACCUMULATORS): + int32_t vacc${A} = vinit_bias; + $if CHANNEL_TILE == 1: + do { + const int32_t vt = (int32_t) *input++; + vacc0 += vt; + batch -= sizeof(int8_t); + } while (batch != 0); + $else: + for (; batch >= ${CHANNEL_TILE}; batch -= ${CHANNEL_TILE}) { + $for N in range(CHANNEL_TILE): + const int32_t vt${N} = (int32_t) input[${N}]; + input += ${CHANNEL_TILE}; + + $for N in range(CHANNEL_TILE): + vacc${N % ACCUMULATORS} += vt${N}; + } + $if ACCUMULATORS > 1: + $ACC_SLICE = 1 + $while ACC_SLICE < ACCUMULATORS: + $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): + $if A + ACC_SLICE < ACCUMULATORS: + vacc${A} += vacc${A + ACC_SLICE}; + $ACC_SLICE *= 2 + + if XNN_UNLIKELY(batch != 0) { + $if CHANNEL_TILE == 2: + const int32_t vt = (int32_t) *input; + vacc0 += vt; + $else: + do { + const int32_t vt = (int32_t) *input++; + vacc0 += vt; + batch -= sizeof(int8_t); + } while (batch != 0); + } + + const float vscale = params->${PARAMS_STRUCT}.scale; + $if VARIANT == "FMAGIC": + const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; + const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; + const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; + const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; + $elif VARIANT == "IMAGIC": + const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; + const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; + const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; + const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; + $elif VARIANT == "LRINTF": + const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; + const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; + const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; + + float vfpacc = (float) vacc0 * vscale; + $if VARIANT == "FMAGIC": + vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); + vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + $elif VARIANT == "IMAGIC": + vfpacc += vmagic_bias; + int32_t vout = (int32_t) float_as_uint32(vfpacc); + vout = math_max_s32(vout, vmagic_min); + vout = math_min_s32(vout, vmagic_max); + vout -= vmagic_bias_less_zero_point; + $elif VARIANT == "LRINTF": + vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); + vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); + const int32_t vrndacc = (int32_t) lrintf(vfpacc); + int32_t vout = vrndacc + voutput_zero_point; + + *output += (int8_t) vout; +} diff 
--git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index d653929083df..4a985c30c2aa 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1597,6 +1597,12 @@ typedef void (*xnn_f32_rsum_ukernel_fn)( float* output, const union xnn_f32_scale_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +typedef void (*xnn_qs8_rsum_ukernel_fn)( + size_t batch, + const int8_t* input, + int8_t* output, + const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + // RMAX: Reduce-MAX typedef void (*xnn_rmax_ukernel_fn)( diff --git a/src/xnnpack/reduce.h b/src/xnnpack/reduce.h index d8ab21805cf2..6f71ae1c2130 100644 --- a/src/xnnpack/reduce.h +++ b/src/xnnpack/reduce.h @@ -329,6 +329,17 @@ DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u12_acc3) DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u16_acc2) DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u16_acc4) +#define DECLARE_QS8_RSUM_UKERNEL_FUNCTION(fn_name) \ + XNN_INTERNAL void fn_name( \ + size_t batch, \ + const int8_t* input, \ + int8_t* output, \ + const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + +DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1) +DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2) +DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4) + #define DECLARE_F32_RDSUM_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ size_t rows, \ diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 31a29da6640b..eacd53ca43b2 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -1084,6 +1084,15 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) +xnnpack_unit_test( + name = "qs8_rsum_minmax_fp32_test", + srcs = [ + "qs8-rsum-minmax-fp32.cc", + "rsum-microkernel-tester.h", + ], + deps = MICROKERNEL_TEST_DEPS, +) + xnnpack_unit_test( name = "f16_f32acc_rdsum_test", srcs = [ diff --git a/test/f16-f32acc-rsum.cc b/test/f16-f32acc-rsum.cc index fe6813a990ab..c30dd6f0c6dc 100644 --- a/test/f16-f32acc-rsum.cc +++ b/test/f16-f32acc-rsum.cc @@ -62,6 +62,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u4, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__NEONFP16ARITH_U4, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u4, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -109,6 +116,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u8, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__NEONFP16ARITH_U8, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u8, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -156,6 +170,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__NEONFP16ARITH_U16_ACC2, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -203,6 +224,13 @@ 
.Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__NEONFP16ARITH_U24_ACC3, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(768) + .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -250,6 +278,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__NEONFP16ARITH_U32_ACC2, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -297,6 +332,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__NEONFP16ARITH_U32_ACC4, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -344,6 +386,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); } } + + TEST(F16_F32ACC_RSUM__F16C_U8, overflow_accumulator) { + TEST_REQUIRES_X86_F16C; + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -391,6 +440,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); } } + + TEST(F16_F32ACC_RSUM__F16C_U16_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_F16C; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -438,6 +494,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); } } + + TEST(F16_F32ACC_RSUM__F16C_U24_ACC3, overflow_accumulator) { + TEST_REQUIRES_X86_F16C; + RSumMicrokernelTester() + .batch_size(768) + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -485,6 +548,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); } } + + TEST(F16_F32ACC_RSUM__F16C_U32_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_F16C; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -532,6 +602,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); } } + + TEST(F16_F32ACC_RSUM__F16C_U32_ACC4, overflow_accumulator) { + TEST_REQUIRES_X86_F16C; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -579,6 +656,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u16, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__AVX512SKX_U16, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512SKX; + RSumMicrokernelTester() + .batch_size(512) + 
.Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u16, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -626,6 +710,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__AVX512SKX_U32_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512SKX; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -673,6 +764,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u48_acc3, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__AVX512SKX_U48_ACC3, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512SKX; + RSumMicrokernelTester() + .batch_size(1536) + .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u48_acc3, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -720,6 +818,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__AVX512SKX_U64_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512SKX; + RSumMicrokernelTester() + .batch_size(2048) + .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc2, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -767,6 +872,13 @@ .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__AVX512SKX_U64_ACC4, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512SKX; + RSumMicrokernelTester() + .batch_size(2048) + .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc4, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -814,4 +926,11 @@ .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u128_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } + + TEST(F16_F32ACC_RSUM__AVX512SKX_U128_ACC4, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512SKX; + RSumMicrokernelTester() + .batch_size(4096) + .Test(xnn_f16_f32acc_rsum_ukernel__avx512skx_u128_acc4, xnn_init_f16_f32acc_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f16-rsum.cc b/test/f16-rsum.cc index f3c4104e4c44..2d30ea2d27b0 100644 --- a/test/f16-rsum.cc +++ b/test/f16-rsum.cc @@ -62,6 +62,13 @@ .Test(xnn_f16_rsum_ukernel__neonfp16arith_u8, xnn_init_f16_scale_fp16arith_params); } } + + TEST(F16_RSUM__NEONFP16ARITH_U8, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f16_rsum_ukernel__neonfp16arith_u8, xnn_init_f16_scale_fp16arith_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -109,6 +116,13 @@ .Test(xnn_f16_rsum_ukernel__neonfp16arith_u16_acc2, xnn_init_f16_scale_fp16arith_params); } } + + TEST(F16_RSUM__NEONFP16ARITH_U16_ACC2, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f16_rsum_ukernel__neonfp16arith_u16_acc2, xnn_init_f16_scale_fp16arith_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -156,6 +170,13 @@ .Test(xnn_f16_rsum_ukernel__neonfp16arith_u24_acc3, xnn_init_f16_scale_fp16arith_params); } } + + TEST(F16_RSUM__NEONFP16ARITH_U24_ACC3, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(768) + .Test(xnn_f16_rsum_ukernel__neonfp16arith_u24_acc3, xnn_init_f16_scale_fp16arith_params); + } 
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -203,6 +224,13 @@ .Test(xnn_f16_rsum_ukernel__neonfp16arith_u32_acc2, xnn_init_f16_scale_fp16arith_params); } } + + TEST(F16_RSUM__NEONFP16ARITH_U32_ACC2, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_rsum_ukernel__neonfp16arith_u32_acc2, xnn_init_f16_scale_fp16arith_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -250,4 +278,11 @@ .Test(xnn_f16_rsum_ukernel__neonfp16arith_u32_acc4, xnn_init_f16_scale_fp16arith_params); } } + + TEST(F16_RSUM__NEONFP16ARITH_U32_ACC4, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON_FP16_ARITH; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f16_rsum_ukernel__neonfp16arith_u32_acc4, xnn_init_f16_scale_fp16arith_params); + } #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) diff --git a/test/f32-rsum.cc b/test/f32-rsum.cc index 96dcbd8ea2e6..584afc70f36b 100644 --- a/test/f32-rsum.cc +++ b/test/f32-rsum.cc @@ -62,6 +62,13 @@ .Test(xnn_f32_rsum_ukernel__neon_u4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__NEON_U4, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON; + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_f32_rsum_ukernel__neon_u4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -109,6 +116,13 @@ .Test(xnn_f32_rsum_ukernel__neon_u8_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__NEON_U8_ACC2, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON; + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f32_rsum_ukernel__neon_u8_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -156,6 +170,13 @@ .Test(xnn_f32_rsum_ukernel__neon_u12_acc3, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__NEON_U12_ACC3, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON; + RSumMicrokernelTester() + .batch_size(384) + .Test(xnn_f32_rsum_ukernel__neon_u12_acc3, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -203,6 +224,13 @@ .Test(xnn_f32_rsum_ukernel__neon_u16_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__NEON_U16_ACC2, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__neon_u16_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -250,6 +278,13 @@ .Test(xnn_f32_rsum_ukernel__neon_u16_acc4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__NEON_U16_ACC4, overflow_accumulator) { + TEST_REQUIRES_ARM_NEON; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__neon_u16_acc4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -297,6 +332,13 @@ .Test(xnn_f32_rsum_ukernel__sse_u4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__SSE_U4, overflow_accumulator) { + TEST_REQUIRES_X86_SSE; + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_f32_rsum_ukernel__sse_u4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -344,6 +386,13 @@ .Test(xnn_f32_rsum_ukernel__sse_u8_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__SSE_U8_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_SSE; + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f32_rsum_ukernel__sse_u8_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -391,6 +440,13 @@ 
.Test(xnn_f32_rsum_ukernel__sse_u12_acc3, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__SSE_U12_ACC3, overflow_accumulator) { + TEST_REQUIRES_X86_SSE; + RSumMicrokernelTester() + .batch_size(384) + .Test(xnn_f32_rsum_ukernel__sse_u12_acc3, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -438,6 +494,13 @@ .Test(xnn_f32_rsum_ukernel__sse_u16_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__SSE_U16_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_SSE; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__sse_u16_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -485,6 +548,13 @@ .Test(xnn_f32_rsum_ukernel__sse_u16_acc4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__SSE_U16_ACC4, overflow_accumulator) { + TEST_REQUIRES_X86_SSE; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__sse_u16_acc4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -532,6 +602,13 @@ .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scale_avx_params); } } + + TEST(F32_RSUM__AVX_U8, overflow_accumulator) { + TEST_REQUIRES_X86_AVX; + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -579,6 +656,13 @@ .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scale_avx_params); } } + + TEST(F32_RSUM__AVX_U16_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_AVX; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -626,6 +710,13 @@ .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scale_avx_params); } } + + TEST(F32_RSUM__AVX_U24_ACC3, overflow_accumulator) { + TEST_REQUIRES_X86_AVX; + RSumMicrokernelTester() + .batch_size(768) + .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -673,6 +764,13 @@ .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scale_avx_params); } } + + TEST(F32_RSUM__AVX_U32_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_AVX; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -720,6 +818,13 @@ .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scale_avx_params); } } + + TEST(F32_RSUM__AVX_U32_ACC4, overflow_accumulator) { + TEST_REQUIRES_X86_AVX; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scale_avx_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -767,6 +872,13 @@ .Test(xnn_f32_rsum_ukernel__avx512f_u16, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__AVX512F_U16, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512F; + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__avx512f_u16, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -814,6 +926,13 @@ .Test(xnn_f32_rsum_ukernel__avx512f_u32_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__AVX512F_U32_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512F; + RSumMicrokernelTester() + .batch_size(1024) + .Test(xnn_f32_rsum_ukernel__avx512f_u32_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -861,6 +980,13 @@ 
.Test(xnn_f32_rsum_ukernel__avx512f_u48_acc3, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__AVX512F_U48_ACC3, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512F; + RSumMicrokernelTester() + .batch_size(1536) + .Test(xnn_f32_rsum_ukernel__avx512f_u48_acc3, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -908,6 +1034,13 @@ .Test(xnn_f32_rsum_ukernel__avx512f_u64_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__AVX512F_U64_ACC2, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512F; + RSumMicrokernelTester() + .batch_size(2048) + .Test(xnn_f32_rsum_ukernel__avx512f_u64_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -955,6 +1088,13 @@ .Test(xnn_f32_rsum_ukernel__avx512f_u64_acc4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__AVX512F_U64_ACC4, overflow_accumulator) { + TEST_REQUIRES_X86_AVX512F; + RSumMicrokernelTester() + .batch_size(2048) + .Test(xnn_f32_rsum_ukernel__avx512f_u64_acc4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -997,6 +1137,12 @@ .Test(xnn_f32_rsum_ukernel__wasmsimd_u4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__WASMSIMD_U4, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_f32_rsum_ukernel__wasmsimd_u4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1039,6 +1185,12 @@ .Test(xnn_f32_rsum_ukernel__wasmsimd_u8_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__WASMSIMD_U8_ACC2, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(256) + .Test(xnn_f32_rsum_ukernel__wasmsimd_u8_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1081,6 +1233,12 @@ .Test(xnn_f32_rsum_ukernel__wasmsimd_u12_acc3, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__WASMSIMD_U12_ACC3, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(384) + .Test(xnn_f32_rsum_ukernel__wasmsimd_u12_acc3, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1123,6 +1281,12 @@ .Test(xnn_f32_rsum_ukernel__wasmsimd_u16_acc2, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__WASMSIMD_U16_ACC2, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__wasmsimd_u16_acc2, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1165,6 +1329,12 @@ .Test(xnn_f32_rsum_ukernel__wasmsimd_u16_acc4, xnn_init_f32_scale_scalar_params); } } + + TEST(F32_RSUM__WASMSIMD_U16_ACC4, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(512) + .Test(xnn_f32_rsum_ukernel__wasmsimd_u16_acc4, xnn_init_f32_scale_scalar_params); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1244,6 +1414,12 @@ TEST(F32_RSUM__SCALAR_U1, scale) { } } +TEST(F32_RSUM__SCALAR_U1, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(32) + .Test(xnn_f32_rsum_ukernel__scalar_u1, xnn_init_f32_scale_scalar_params); +} + TEST(F32_RSUM__SCALAR_U2_ACC2, batch_eq_2) { RSumMicrokernelTester() .batch_size(2) @@ -1283,6 +1459,12 @@ TEST(F32_RSUM__SCALAR_U2_ACC2, scale) { } } +TEST(F32_RSUM__SCALAR_U2_ACC2, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(64) + .Test(xnn_f32_rsum_ukernel__scalar_u2_acc2, xnn_init_f32_scale_scalar_params); +} + TEST(F32_RSUM__SCALAR_U3_ACC3, batch_eq_3) { RSumMicrokernelTester() .batch_size(3) @@ -1322,6 +1504,12 @@ 
TEST(F32_RSUM__SCALAR_U3_ACC3, scale) { } } +TEST(F32_RSUM__SCALAR_U3_ACC3, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(96) + .Test(xnn_f32_rsum_ukernel__scalar_u3_acc3, xnn_init_f32_scale_scalar_params); +} + TEST(F32_RSUM__SCALAR_U4_ACC2, batch_eq_4) { RSumMicrokernelTester() .batch_size(4) @@ -1361,6 +1549,12 @@ TEST(F32_RSUM__SCALAR_U4_ACC2, scale) { } } +TEST(F32_RSUM__SCALAR_U4_ACC2, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_f32_rsum_ukernel__scalar_u4_acc2, xnn_init_f32_scale_scalar_params); +} + TEST(F32_RSUM__SCALAR_U4_ACC4, batch_eq_4) { RSumMicrokernelTester() .batch_size(4) @@ -1398,4 +1592,10 @@ TEST(F32_RSUM__SCALAR_U4_ACC4, scale) { .scale(scale) .Test(xnn_f32_rsum_ukernel__scalar_u4_acc4, xnn_init_f32_scale_scalar_params); } +} + +TEST(F32_RSUM__SCALAR_U4_ACC4, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_f32_rsum_ukernel__scalar_u4_acc4, xnn_init_f32_scale_scalar_params); } \ No newline at end of file diff --git a/test/qs8-rsum-minmax-fp32.cc b/test/qs8-rsum-minmax-fp32.cc new file mode 100644 index 000000000000..42e13f7e8e80 --- /dev/null +++ b/test/qs8-rsum-minmax-fp32.cc @@ -0,0 +1,138 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! +// Specification: test/qs8-rsum-minmax-fp32.yaml +// Generator: tools/generate-reduce-test.py + + +#include <gtest/gtest.h> + +#include <xnnpack/common.h> +#include <xnnpack/isa-checks.h> + +#include <xnnpack/microparams-init.h> +#include <xnnpack/reduce.h> +#include "rsum-microkernel-tester.h" + + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U1, batch_eq_1) { + RSumMicrokernelTester() + .batch_size(1) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U1, batch_gt_1) { + for (size_t batch_size = 2; batch_size < 10; batch_size++) { + RSumMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U1, scale) { + for (float scale = 0.3f; scale < 5.0f; scale *= 3.0f) { + RSumMicrokernelTester() + .batch_size(2) + .scale(scale) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U1, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(32) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U2, batch_eq_2) { + RSumMicrokernelTester() + .batch_size(2) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U2, batch_div_2) { + for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { + RSumMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U2, batch_lt_2) { + for (size_t batch_size = 1; batch_size < 2; batch_size++) { + RSumMicrokernelTester() + .batch_size(batch_size) + 
.Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U2, batch_gt_2) { + for (size_t batch_size = 3; batch_size < 4; batch_size++) { + RSumMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U2, scale) { + for (float scale = 0.3f; scale < 5.0f; scale *= 3.0f) { + RSumMicrokernelTester() + .batch_size(3) + .scale(scale) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U2, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(64) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U4, batch_eq_4) { + RSumMicrokernelTester() + .batch_size(4) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + RSumMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + RSumMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + RSumMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U4, scale) { + for (float scale = 0.3f; scale < 5.0f; scale *= 3.0f) { + RSumMicrokernelTester() + .batch_size(5) + .scale(scale) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); + } +} + +TEST(QS8_RSUM_MINMAX_FP32__SCALAR_IMAGIC_U4, overflow_accumulator) { + RSumMicrokernelTester() + .batch_size(128) + .Test(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); +} \ No newline at end of file diff --git a/test/qs8-rsum-minmax-fp32.yaml b/test/qs8-rsum-minmax-fp32.yaml new file mode 100644 index 000000000000..3fb498975e22 --- /dev/null +++ b/test/qs8-rsum-minmax-fp32.yaml @@ -0,0 +1,12 @@ +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
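+# +# (Descriptive note: each entry names a microkernel and its params-init function; tools/generate-reduce-test.py derives the batch tile from the _uN suffix of the kernel name when sizing the generated batch tests.)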
+ +# Scalar +- name: xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1 + init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params +- name: xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2 + init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params +- name: xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4 + init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params diff --git a/test/rsum-microkernel-tester.h b/test/rsum-microkernel-tester.h index 2ad0730b0daf..9bb5fc9d95f2 100644 --- a/test/rsum-microkernel-tester.h +++ b/test/rsum-microkernel-tester.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +55,93 @@ class RSumMicrokernelTester { return this->iterations_; } + + RSumMicrokernelTester& input_scale(float input_scale) { + assert(input_scale > 0.0f); + assert(std::isnormal(input_scale)); + this->input_scale_ = input_scale; + return *this; + } + + float input_scale() const { + return this->input_scale_; + } + + RSumMicrokernelTester& output_scale(float output_scale) { + assert(output_scale > 0.0f); + assert(std::isnormal(output_scale)); + this->output_scale_ = output_scale; + return *this; + } + + float output_scale() const { + return this->output_scale_; + } + + RSumMicrokernelTester& input_zero_point(uint8_t input_zero_point) { + this->input_zero_point_ = input_zero_point; + return *this; + } + + uint8_t input_zero_point() const { + return this->input_zero_point_; + } + + RSumMicrokernelTester& output_zero_point(uint8_t output_zero_point) { + this->output_zero_point_ = output_zero_point; + return *this; + } + + uint8_t output_zero_point() const { + return this->output_zero_point_; + } + + uint8_t qmin() const { + return this->qmin_; + } + + uint8_t qmax() const { + return this->qmax_; + } + + void Test(xnn_qs8_rsum_ukernel_fn rsum, + xnn_init_qs8_avgpool_minmax_params_fn init_params, + xnn_qs8_requantize_fn requantize) const { + xnnpack::ReplicableRandomDevice rng; + std::uniform_int_distribution<int32_t> i8dist( + std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()); + + std::vector<int8_t> input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); + + // Compute reference results. + int8_t output_init = i8dist(rng); + int8_t output_ref = 0; + int32_t acc = 0; + for (size_t i = 0; i < batch_size(); i++) { + acc += int32_t(input[i]) - int32_t(input_zero_point() - 0x80); + } + output_ref = requantize( + acc, input_scale() / (output_scale() * float(batch_size())), int8_t(output_zero_point() - 0x80), std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()) + output_init; + + // Prepare parameters + union xnn_qs8_avgpool_minmax_params params; + init_params( + &params, + -int32_t(input_zero_point() - 0x80) * int32_t(batch_size()), + input_scale() / (output_scale() * float(batch_size())), + int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80)); + + // Call optimized micro-kernel. + int8_t output = output_init; + rsum(batch_size() * sizeof(int8_t), input.data(), &output, &params); + + // Verify results. 
+ EXPECT_EQ(int32_t(output_ref), int32_t(output)); + } + } + void Test(xnn_f16_rsum_ukernel_fn rsum, xnn_init_f16_scale_params_fn init_params) const { xnnpack::ReplicableRandomDevice rng; std::uniform_real_distribution<float> f32dist(0.01f, 1.0f); @@ -140,4 +229,10 @@ class RSumMicrokernelTester { size_t batch_size_{1}; float scale_{1.0f}; size_t iterations_{15}; + float input_scale_{1.25f}; + float output_scale_{0.75f}; + uint8_t input_zero_point_{121}; + uint8_t output_zero_point_{133}; + uint8_t qmin_{0}; + uint8_t qmax_{255}; }; diff --git a/tools/generate-reduce-test.py b/tools/generate-reduce-test.py index e41218d1c247..ec7409a82032 100755 --- a/tools/generate-reduce-test.py +++ b/tools/generate-reduce-test.py @@ -30,7 +30,7 @@ def split_ukernel_name(name): - match = re.fullmatch(r"xnn_(f16|f16_f32acc|f32|u8)_r(minmax|max|min|sum)_ukernel__(.+)_u(\d+)(v)?(_acc\d+)?", name) + match = re.fullmatch(r"xnn_(f16|f16_f32acc|f32|qs8|u8)_r(minmax|max|min|sum)(_minmax_(fp32))?_ukernel__(.+)_u(\d+)(v)?(_acc\d+)?", name) if match is None: raise ValueError("Unexpected microkernel name: " + name) op_type = { @@ -39,10 +39,12 @@ def split_ukernel_name(name): "min": "Min", "sum": "Sum", }[match.group(2)] - batch_tile = int(match.group(4)) - vector_tile = bool(match.group(5)) - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(3)) - return op_type, batch_tile, vector_tile, arch, isa + requantization_type = match.group(4) + batch_tile = int(match.group(6)) + vector_tile = bool(match.group(7)) + target_name = match.group(5) + arch, isa, assembly = xnncommon.parse_target_name(target_name=target_name) + return requantization_type, op_type, batch_tile, vector_tile, arch, isa REDUCE_TEST_TEMPLATE = """\ @@ -126,16 +128,25 @@ def split_ukernel_name(name): .Test(${", ".join(TEST_ARGS)}); } } + + TEST(${TEST_NAME}, overflow_accumulator) { + $if ISA_CHECK: + ${ISA_CHECK}; + ${TESTER}() + .batch_size(${32 * BATCH_TILE}) + .Test(${", ".join(TEST_ARGS)}); + } """ -def generate_test_cases(ukernel, op_type, init_fn, tester, batch_tile, vector_tile, isa): +def generate_test_cases(ukernel, op_type, init_fn, requantization_type, tester, batch_tile, vector_tile, isa): """Generates all tests cases for a Vector Binary Operation micro-kernel. Args: ukernel: C name of the micro-kernel function. op_type: Operation type (MAX/MIN/SUM/etc). init_fn: C name of the function to initialize microkernel parameters. + requantization_type: Requantization type (FP32/RNDNU). tester: C++ name of the tester class. batch_tile: Number of batch elements processed per one iteration of the inner loop of the micro-kernel. 
@@ -155,6 +166,9 @@ def generate_test_cases(ukernel, op_type, init_fn, tester, batch_tile, vector_tile, isa): if init_fn: test_args.append(init_fn) batch_scale = "" + if requantization_type: + test_args.append("xnn_qs8_requantize_%s" % \ + requantization_type.lower()) if vector_tile: ctype = {"u8": "uint8_t", "f16": "uint16_t", "f32": "float"}[datatype] batch_scale = { @@ -212,10 +226,10 @@ def main(args): for ukernel_spec in spec_yaml: name = ukernel_spec["name"] init_fn = ukernel_spec.get("init") - op_type, batch_tile, vector_tile, arch, isa = split_ukernel_name(name) + requantization_type, op_type, batch_tile, vector_tile, arch, isa = split_ukernel_name(name) - test_case = generate_test_cases(name, op_type, init_fn, options.tester, - batch_tile, vector_tile, isa) + test_case = generate_test_cases(name, op_type, init_fn, requantization_type, + options.tester, batch_tile, vector_tile, isa) tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) xnncommon.overwrite_if_changed(options.output, tests)
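For reference, a minimal standalone sketch of the fp32 "imagic" requantization step the new kernels end with. This is illustrative only (not part of the patch): the helper name requantize_imagic is hypothetical, float_as_uint32() is assumed to be a plain bit-cast as in xnnpack/math.h, and the magic constants are assumed to arrive precomputed in the params struct, as they do in the kernels above.

    #include <stdint.h>
    #include <string.h>

    // Bit-cast a float to its uint32 representation (stand-in for XNNPACK's float_as_uint32).
    static uint32_t float_as_uint32(float f) {
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));
      return bits;
    }

    // Mirror of the kernels' tail: scale the int32 sum, add a magic bias so the
    // rounded integer lands in the low mantissa bits of the float, clamp in the
    // integer domain, then remove the bias net of the output zero point.
    static int8_t requantize_imagic(int32_t acc, float scale, float magic_bias,
                                    int32_t magic_min, int32_t magic_max,
                                    int32_t magic_bias_less_zero_point) {
      float fpacc = (float) acc * scale + magic_bias;
      int32_t out = (int32_t) float_as_uint32(fpacc);
      out = out < magic_min ? magic_min : out;
      out = out > magic_max ? magic_max : out;
      return (int8_t) (out - magic_bias_less_zero_point);
    }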