Skip to content

Commit

Permalink
Add xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c4__neondot (ARMv8.2+dotpr…
Browse files Browse the repository at this point in the history
…od).

PiperOrigin-RevId: 326503942
  • Loading branch information
bjacob authored and xnnpack-bot committed Aug 13, 2020
1 parent 0af63ab commit a964473
Show file tree
Hide file tree
Showing 15 changed files with 2,735 additions and 1 deletion.
46 changes: 46 additions & 0 deletions BUILD.bazel
Expand Up @@ -1618,6 +1618,12 @@ AARCH64_NEONFP16ARITH_UKERNELS = [
"src/f16-spmm/gen/32x1-minmax-neonfp16arith-unroll2.c",
]

# QS8 GEMM micro-kernel sources generated for the ARM NEON dot-product
# extension. The list is shared by the aarch32 and aarch64 source attributes
# of the ":neondot_ukernels" and ":neondot_ukernels_test_mode" libraries.
NEONDOT_UKERNELS = [
"src/qs8-gemm/gen/8x8c4-minmax-neondot.c",
"src/qs8-gemm/gen/12x8c4-minmax-neondot.c",
"src/qs8-gemm/gen/4x16c4-minmax-neondot.c",
]

SSE_UKERNELS = [
"src/f32-avgpool/9p8x-minmax-sse-c4.c",
"src/f32-avgpool/9x-minmax-sse-c4.c",
Expand Down Expand Up @@ -3004,6 +3010,42 @@ xnnpack_cc_library(
],
)

# Micro-kernels that use the ARMv8.2 dot-product instructions. The same
# source list (NEONDOT_UKERNELS) is compiled for 32-bit and 64-bit ARM.
xnnpack_cc_library(
    name = "neondot_ukernels",
    hdrs = INTERNAL_HDRS,
    # On AArch32 "-march" alone does not select a NEON-capable FPU when the
    # toolchain defaults to a non-NEON one, so the FPU is requested explicitly
    # (the CMake build passes the same -mfpu value for the NEONv8 sources).
    aarch32_copts = [
        "-march=armv8.2-a+dotprod",
        "-mfpu=neon-fp-armv8",
    ],
    aarch32_srcs = NEONDOT_UKERNELS,
    # AArch64 always has Advanced SIMD; only the dotprod extension is enabled.
    aarch64_copts = ["-march=armv8.2-a+dotprod"],
    aarch64_srcs = NEONDOT_UKERNELS,
    gcc_copts = xnnpack_gcc_std_copts(),
    msvc_copts = xnnpack_msvc_std_copts(),
    deps = [
        ":tables",
        "@FP16",
        "@pthreadpool",
    ],
)

# Test-mode build of the dot-product micro-kernels: same sources as
# ":neondot_ukernels", but with NDEBUG undefined and XNN_TEST_MODE=1 defined.
xnnpack_cc_library(
    name = "neondot_ukernels_test_mode",
    hdrs = INTERNAL_HDRS,
    # On AArch32 "-march" alone does not select a NEON-capable FPU when the
    # toolchain defaults to a non-NEON one, so the FPU is requested explicitly
    # (the CMake build passes the same -mfpu value for the NEONv8 sources).
    aarch32_copts = [
        "-march=armv8.2-a+dotprod",
        "-mfpu=neon-fp-armv8",
    ],
    aarch32_srcs = NEONDOT_UKERNELS,
    # AArch64 always has Advanced SIMD; only the dotprod extension is enabled.
    aarch64_copts = ["-march=armv8.2-a+dotprod"],
    aarch64_srcs = NEONDOT_UKERNELS,
    copts = [
        "-UNDEBUG",
        "-DXNN_TEST_MODE=1",
    ],
    gcc_copts = xnnpack_gcc_std_copts(),
    msvc_copts = xnnpack_msvc_std_copts(),
    deps = [
        ":tables",
        "@FP16",
        "@pthreadpool",
    ],
)

xnnpack_cc_library(
name = "sse2_ukernels",
hdrs = INTERNAL_HDRS,
Expand Down Expand Up @@ -3383,13 +3425,15 @@ xnnpack_aggregate_library(
":neon_ukernels",
":neonfma_ukernels",
":neonv8_ukernels",
":neondot_ukernels",
":asm_ukernels",
],
aarch64_deps = [
":neon_ukernels",
":neonfma_ukernels",
":neonv8_ukernels",
":neonfp16arith_ukernels",
":neondot_ukernels",
":asm_ukernels",
],
generic_deps = [
Expand Down Expand Up @@ -3424,13 +3468,15 @@ xnnpack_aggregate_library(
":neon_ukernels_test_mode",
":neonfma_ukernels_test_mode",
":neonv8_ukernels_test_mode",
":neondot_ukernels_test_mode",
":asm_ukernels",
],
aarch64_deps = [
":neon_ukernels_test_mode",
":neonfma_ukernels_test_mode",
":neonv8_ukernels_test_mode",
":neonfp16arith_ukernels_test_mode",
":neondot_ukernels_test_mode",
":asm_ukernels",
],
generic_deps = [
Expand Down
10 changes: 10 additions & 0 deletions CMakeLists.txt
Expand Up @@ -1187,6 +1187,12 @@ SET(XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
src/f16-spmm/gen/32x1-minmax-neonfp16arith.c
src/f16-spmm/gen/32x1-minmax-neonfp16arith-unroll2.c)

# QS8 GEMM micro-kernel sources that use the ARM dot-product extension.
# They are appended to XNNPACK_MICROKERNEL_SRCS in both the 32-bit and 64-bit
# ARM branches below and compiled with -march=armv8.2-a+dotprod.
SET(XNNPACK_NEONDOT_MICROKERNEL_SRCS
src/qs8-gemm/gen/8x8c4-minmax-neondot.c
src/qs8-gemm/gen/12x8c4-minmax-neondot.c
src/qs8-gemm/gen/4x16c4-minmax-neondot.c
)

SET(XNNPACK_SSE_MICROKERNEL_SRCS
src/f32-avgpool/9p8x-minmax-sse-c4.c
src/f32-avgpool/9x-minmax-sse-c4.c
Expand Down Expand Up @@ -2116,6 +2122,7 @@ IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7")
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEON_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEONFMA_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEONV8_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEONDOT_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_AARCH32_ASM_MICROKERNEL_SRCS})
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR IOS_ARCH MATCHES "^arm64.*")
Expand All @@ -2124,6 +2131,7 @@ IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR IOS_ARCH MATCHES "^arm64.*")
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEONV8_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_AARCH64_NEONFMA_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEONDOT_MICROKERNEL_SRCS})
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_AARCH64_ASM_MICROKERNEL_SRCS})
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|x86_64|AMD64)$" OR IOS_ARCH MATCHES "^(i386|x86_64|AMD64)$")
Expand Down Expand Up @@ -2157,12 +2165,14 @@ IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7")
SET_PROPERTY(SOURCE ${XNNPACK_NEON_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=neon ")
SET_PROPERTY(SOURCE ${XNNPACK_NEONFMA_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=neon-vfpv4 ")
SET_PROPERTY(SOURCE ${XNNPACK_NEONV8_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv8-a -mfpu=neon-fp-armv8 ")
# 32-bit ARM: -march does not pick a NEON-capable FPU when the toolchain
# defaults to a non-NEON one, so request it explicitly, as the NEON, NEONFMA
# and NEONV8 source groups in this branch already do.
SET_PROPERTY(SOURCE ${XNNPACK_NEONDOT_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv8.2-a+dotprod -mfpu=neon-fp-armv8 ")
IF(IOS)
SET_PROPERTY(SOURCE ${XNNPACK_AARCH32_ASM_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -arch ${IOS_ARCH} ")
ENDIF()
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR IOS_ARCH MATCHES "^arm64.*")
SET_PROPERTY(SOURCE ${XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
SET_PROPERTY(SOURCE ${XNNPACK_NEONDOT_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv8.2-a+dotprod ")
SET_PROPERTY(SOURCE ${XNNPACK_AARCH64_ASM_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
IF(IOS)
SET_PROPERTY(SOURCE ${XNNPACK_AARCH64_ASM_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -arch ${IOS_ARCH} ")
Expand Down
12 changes: 12 additions & 0 deletions bench/qs8-gemm.cc
Expand Up @@ -188,9 +188,21 @@ static void GEMMBenchmark(benchmark::State& state,
static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1, benchmark::utils::CheckNEON);
}
// Benchmark entry for the 8x8c4 NEON dot-product QS8 GEMM micro-kernel.
// The run is gated by CheckNEONDOT; `net` is not used in this wrapper.
static void qs8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
  GEMMBenchmark(
    state,
    xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot,
    8, 8, 4, 1,
    benchmark::utils::CheckNEONDOT);
}
// Benchmark entry for the 12x8c4 NEON dot-product QS8 GEMM micro-kernel.
// The run is gated by CheckNEONDOT; `net` is not used in this wrapper.
static void qs8_gemm_12x8c4__neondot(benchmark::State& state, const char* net) {
  GEMMBenchmark(
    state,
    xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot,
    12, 8, 4, 1,
    benchmark::utils::CheckNEONDOT);
}
// Benchmark entry for the 4x16c4 NEON dot-product QS8 GEMM micro-kernel.
// The run is gated by CheckNEONDOT; `net` is not used in this wrapper.
static void qs8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
  GEMMBenchmark(
    state,
    xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot,
    4, 16, 4, 1,
    benchmark::utils::CheckNEONDOT);
}

// Apply BENCHMARK_GEMM to every QS8 GEMM wrapper of this ARM/ARM64-only
// section: the two NEON MLAL-lane variants, then the three NEON dot-product
// variants.
BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_12x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Expand Down
8 changes: 8 additions & 0 deletions bench/utils.cc
Expand Up @@ -197,6 +197,14 @@ bool CheckNEONFMA(benchmark::State& state) {
return true;
}

// Returns true when cpuinfo initializes successfully and reports ARM NEON
// dot-product support. Otherwise marks the benchmark as skipped with an
// error message and returns false.
bool CheckNEONDOT(benchmark::State& state) {
  // && short-circuits, so the feature query only runs after a successful init.
  const bool dot_available = cpuinfo_initialize() && cpuinfo_has_arm_neon_dot();
  if (!dot_available) {
    state.SkipWithError("no NEON-DOT extension");
  }
  return dot_available;
}

bool CheckSSSE3(benchmark::State& state) {
if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
state.SkipWithError("no SSSE3 extension");
Expand Down
4 changes: 4 additions & 0 deletions bench/utils.h
Expand Up @@ -47,6 +47,10 @@ bool CheckNEON(benchmark::State& state);
// If NEON-FMA is unsupported, report error in benchmark state, and return false.
bool CheckNEONFMA(benchmark::State& state);

// Check if the ARM NEON dot-product (DOT) extension is supported.
// If it is unsupported, report an error in the benchmark state and return false.
bool CheckNEONDOT(benchmark::State& state);

// Check if x86 SSSE3 extension is supported.
// If SSSE3 is unsupported, report error in benchmark state, and return false.
bool CheckSSSE3(benchmark::State& state);
Expand Down
5 changes: 5 additions & 0 deletions scripts/generate-qs8-gemm.sh
Expand Up @@ -30,6 +30,11 @@ tools/xngen src/qs8-gemm/minmax-neon-mlal-lane.c.in -D MR=4 -D NR=8 -o src/qs8-g
tools/xngen src/qs8-gemm/minmax-neon-mlal-lane.c.in -D MR=1 -D NR=16 -o src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c
tools/xngen src/qs8-gemm/minmax-neon-mlal-lane.c.in -D MR=2 -D NR=16 -o src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c

### C4 micro-kernels
# NEON dot-product kernels: one template (MRxNRc4-minmax-neondot.c.in) is
# expanded once per MR/NR pair into src/qs8-gemm/gen/.
tools/xngen src/qs8-gemm/MRxNRc4-minmax-neondot.c.in -D MR=8 -D NR=8 -o src/qs8-gemm/gen/8x8c4-minmax-neondot.c
tools/xngen src/qs8-gemm/MRxNRc4-minmax-neondot.c.in -D MR=12 -D NR=8 -o src/qs8-gemm/gen/12x8c4-minmax-neondot.c
tools/xngen src/qs8-gemm/MRxNRc4-minmax-neondot.c.in -D MR=4 -D NR=16 -o src/qs8-gemm/gen/4x16c4-minmax-neondot.c

################################### x86 SSE ###################################
### C2 micro-kernels
tools/xngen src/qs8-gemm/MRx4c2-minmax-sse.c.in -D MR=1 -D SSE=2 -D VARIANT=LD64 -o src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
Expand Down

0 comments on commit a964473

Please sign in to comment.