// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a standalone testbed and benchmark for gemmlowp-style GEMM kernels,
// doing either integer or float arithmetic.
// It verifies that a kernel produces correct results, then benchmarks it.
//
// Some benchmark results are recorded in this spreadsheet:
//
// https://docs.google.com/spreadsheets/d/1UPbzbp9rdsD6RXxOr5q6AZ0n1omgEknLYO2ogiw6Kqk/edit?usp=sharing
//
// This program is entirely self-contained, and can be compiled manually,
// for example with the command lines suggested below.
// It currently supports Android/ARM and MIPS (with MSA), but would trivially
// generalize to other OSes (it's mostly standard POSIX) or architectures
// (each kernel targets a specific architecture; one may simply add more).
/*
Build and run this benchmark on Android/ARM/32bit:
~/android/toolchains/arm-linux-androideabi/bin/arm-linux-androideabi-clang++ \
-fPIE -pie -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \
/tmp/benchmark -mfloat-abi=softfp -mfpu=neon-vfpv4 && adb push /tmp/benchmark \
/data/local/tmp && adb shell /data/local/tmp/benchmark
Build and run this benchmark on Android/ARM/64bit:
~/android/toolchains/aarch64-linux-android/bin/aarch64-linux-android-clang++ \
-fPIE -static -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \
/tmp/benchmark && adb push /tmp/benchmark /data/local/tmp && adb shell \
/data/local/tmp/benchmark
*/
// For big.LITTLE devices, use 'taskset' to select which cores to benchmark.
//
// The syntax is: taskset <mask> <commandline>
// where <mask> is a hexadecimal bitmask in which each bit corresponds to a
// core, and the low bits are the little cores.
//
// Examples:
// Nexus 5X big cores: taskset 30
// Nexus 5X little cores: taskset 0f
// Pixel XL big cores: taskset 0c
// Pixel XL little cores: taskset 03
//
// Full example:
// adb shell taskset 0c /data/local/tmp/benchmark
#include <sched.h>
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <random>
#include <type_traits>
#if !defined(__arm__) && !defined(__aarch64__) && \
!(defined(__mips) && (__mips_isa_rev >= 5) && defined(__mips_msa))
#error This benchmark assumes ARM or MIPS (for intrinsics and inline assembly sections).
#endif
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#endif
#if defined(__mips)
#include <msa.h>
// Some convenience macros to hide differences between MIPS32 and MIPS64.
#ifdef __LP64__
#define GEMMLOWP_MIPS_XADDIU "daddiu"
#else
#define GEMMLOWP_MIPS_XADDIU "addiu"
#endif
#endif
// Typically one wants to fit in L1 cache, and GEMM implementations
// are carefully optimized to tune their access patterns to that effect.
// Most devices have at least 16k of L1 cache. The Kraits have exactly 16k.
const int kDefaultCacheSizeK = 16;
const int kCacheLineSize = 64;
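// For a rough sense of scale (illustrative arithmetic only): a uint8 kernel
// with a 12-row LHS block and a 4-column RHS block, like the first kernel
// below, reads 12 + 4 = 16 operand bytes per level of depth, so a 16k L1
// can hold both blocks up to a depth of about 16384 / 16 = 1024.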
// These definitions are used for labels within assembly code. Required for
// iOS toolchain compatibility.
#define GEMMLOWP_LABEL_AFTER_LOOP "1"
#define GEMMLOWP_LABEL_LOOP "2"
#define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
#define GEMMLOWP_LABEL_STORE "4"
// BEGIN code copied from gemmlowp/internal/kernel.h
// Explanation of general gemmlowp terminology
// ===========================================
//
// We use the following abbreviations:
// LHS = "left-hand side"
// RHS = "right-hand side"
// Sometimes when referring to either LHS or RHS, we just say a "Side".
//
// In a matrix product of a MxK matrix times a KxN matrix,
// we call K the 'depth'. Note that M is the number of rows
// of the result (and of the LHS), and N is the number of columns
// of the result (and of the RHS).
//
// In each of the LHS and RHS matrices, we call 'width' the
// other dimension, besides the depth. So in the LHS, 'width'
// is the number of rows, while in the RHS, 'width' is the number
// of columns.
//
// So in the LHS MxK matrix, the depth is K and the width is M.
// And in the RHS KxN matrix, the depth is K and the width in N.
//
// This is illustrated in this picture:
//
// RHS width
// <----------------->
// +-----------------+ ^
// | RHS | | Depth
// +-----------------+ v
// ^ +--+ +-----------------+
// | |L | | |
// LHS width | |H | | Result |
// | |S | | |
// v +--+ +-----------------+
// <-->
// Depth
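//
// For example, multiplying a 100x50 LHS by a 50x30 RHS: the depth is 50,
// the LHS width is 100, the RHS width is 30, and the result is 100x30.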
// Explanation of gemmlowp kernel formats and "cells"
// ==================================================
//
// Kernels operate on small LHS and RHS blocks that fit in registers.
// These blocks are stored contiguously in memory, but not always
// in a traditional column-major or row-major order; instead,
// they consist of a number of sub-blocks, which we call "cells",
// that are stored in column-major or row-major order. However,
// what really matters to us is not so much rows vs columns, but
// rather width vs depth. So we refer to "width-major" and "depth-major"
// storage orders. In the LHS, width-major means row-major,
// while in the RHS, width-major means column-major.
// There is also a third possibility, "diagonal order",
// which is unused at the moment.
//
// We aim to treat both sides, LHS and RHS, on an equal footing,
// so we call them both 'sides'. A KernelFormat thus is just a pair
// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat
// contains a CellFormat and a number of cells; cells are only ever
// stacked in the width dimension, which means stacked vertically in the
// LHS and stacked horizontally in the RHS.
//
// Example
// =======
//
// Let's work out the data layout expected by a kernel having the
// following format (the struct names here are defined below in this file):
//
// KernelFormat<
// KernelSideFormat<CellFormat<3, 4>, 3>,
// KernelSideFormat<CellFormat<5, 4>, 2>
// >
//
// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means:
// 3 cells, each cell having dimensions (width=3, depth=4), laid out in
// DepthMajor order (the default value, see CellFormat). In the LHS,
// DepthMajor means column-major, so the LHS cells are of size 3x4 in
// column-major order, so the LHS layout is:
//
// 0 3 6 9
// 1 4 7 10
// 2 5 8 11
// 12 15 18 21
// 13 16 19 22
// 14 17 20 23
// 24 27 30 33
// 25 28 31 34
// 26 29 32 35
//
// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means:
// 2 cells each having dimensions (width=5, depth=4), laid out in
// DepthMajor order (the default value, see CellFormat). In the RHS,
// DepthMajor means row-major, so the RHS cells are of size 4x5 in
// row-major order, so the RHS layout is:
//
// 0 1 2 3 4 20 21 22 23 24
// 5 6 7 8 9 25 26 27 28 29
// 10 11 12 13 14 30 31 32 33 34
// 15 16 17 18 19 35 36 37 38 39
// CellOrder enumerates the possible storage orders (=layouts) for
// a cell (see explanation above).
enum class CellOrder { DepthMajor, WidthMajor, Diagonal };
// CellFormat describes how data is laid
// out in a cell. That is, a CellOrder together with actual dimensions.
template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
struct CellFormat {
static const int kWidth = tWidth;
static const int kDepth = tDepth;
static const CellOrder kOrder = tOrder;
static const int kSize = kWidth * kDepth;
};
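// For instance, the LHS cell from the example above could be spelled as
// follows (a hypothetical alias, shown here only for illustration):
//
//   using ExampleLhsCell = CellFormat<3, 4, CellOrder::DepthMajor>;
//   static_assert(ExampleLhsCell::kSize == 12,
//                 "width (3) * depth (4) coefficients per cell");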
// KernelSideFormat describes how data is laid out in a kernel side
// (i.e. LHS or RHS). That is, a CellFormat together with a number of
// cells. These cells are always stacked in the Width dimension.
// For example, in the LHS case, the Width dimension is the rows dimension,
// so we're saying that in the LHS, cells are stacked vertically.
// We never stack cells in the Depth dimension.
template <typename tCellFormat, int tCells>
struct KernelSideFormat {
typedef tCellFormat Cell;
static const int kCells = tCells;
static const int kWidth = kCells * Cell::kWidth;
static const int kDepth = Cell::kDepth;
};
// KernelFormat describes fully the input data layout that a kernel expects.
// It consists of two KernelSideFormat's, one for LHS and one for RHS.
template <typename tLhs, typename tRhs>
struct KernelFormat {
typedef tLhs Lhs;
typedef tRhs Rhs;
static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, "");
static const int kDepth = Lhs::Cell::kDepth;
static const int kRows = Lhs::Cell::kWidth * Lhs::kCells;
static const int kCols = Rhs::Cell::kWidth * Rhs::kCells;
};
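// Plugging in the example format from the top of this file (again a
// hypothetical alias, for illustration only):
//
//   using ExampleFormat = KernelFormat<
//       KernelSideFormat<CellFormat<3, 4>, 3>,
//       KernelSideFormat<CellFormat<5, 4>, 2> >;
//
// gives kDepth == 4, kRows == 3*3 == 9 and kCols == 5*2 == 10, matching
// the 9x4 LHS and 4x10 RHS layouts drawn in the example above.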
// KernelOperandRanges specifies the minimum and maximum values an operand can
// take. It consists of two ranges: one for the LHS and one for the RHS. The
// default values are the minimum and maximum values of the operand data type.
template <typename Kernel, typename OperandType = typename Kernel::OperandType>
struct KernelOperandRanges {
static OperandType LhsMin() {
return std::numeric_limits<OperandType>::lowest();
}
static OperandType LhsMax() {
return std::numeric_limits<OperandType>::max();
}
static OperandType RhsMin() {
return std::numeric_limits<OperandType>::lowest();
}
static OperandType RhsMax() {
return std::numeric_limits<OperandType>::max();
}
};
template <typename Kernel>
struct KernelOperandRanges<Kernel, float> {
static float LhsMin() { return -100.f; }
static float LhsMax() { return 100.f; }
static float RhsMin() { return -100.f; }
static float RhsMax() { return 100.f; }
};
#define SET_7BIT_RANGES(kernel) \
template <> \
struct KernelOperandRanges<kernel, std::int8_t> { \
static std::int8_t LhsMin() { return -63; } \
static std::int8_t LhsMax() { return 63; } \
static std::int8_t RhsMin() { return -64; } \
static std::int8_t RhsMax() { return 63; } \
};
#define SET_425BIT_RANGES(kernel) \
template <> \
struct KernelOperandRanges<kernel, std::int8_t> { \
static std::int8_t LhsMin() { return -7; } \
static std::int8_t LhsMax() { return 7; } \
static std::int8_t RhsMin() { return -9; } \
static std::int8_t RhsMax() { return 9; } \
};
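// Why these reduced ranges exist: some kernels later in this file accumulate
// several 8-bit products into int16 lanes before widening, so their operands
// must be small enough that the partial sums cannot overflow. A sketch of
// the arithmetic: with the 7-bit ranges, |lhs*rhs| <= 63*64 = 4032, so eight
// products sum to at most 32256 < 32767; with the 4.25-bit ranges,
// |lhs*rhs| <= 7*9 = 63, so even 512 products (512*63 = 32256) still fit
// in int16.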
inline const char* CellOrderName(CellOrder o) {
switch (o) {
case CellOrder::DepthMajor:
return "DepthMajor";
case CellOrder::WidthMajor:
return "WidthMajor";
case CellOrder::Diagonal:
return "Diagonal";
default:
assert(false);
return nullptr;
}
}
// Returns the offset into a cell at which a given coefficient is stored.
template <typename CellFormat>
inline int OffsetIntoCell(int w, int d) {
switch (CellFormat::kOrder) {
case CellOrder::DepthMajor:
return w + d * CellFormat::kWidth;
case CellOrder::WidthMajor:
return d + w * CellFormat::kDepth;
case CellOrder::Diagonal:
assert(CellFormat::kWidth == CellFormat::kDepth);
static const int size = CellFormat::kWidth;
return ((size + w - d) * size + d) % (size * size);
default:
assert(false);
return 0;
}
}
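// Worked example, tying this back to the LHS layout drawn earlier: for
// CellFormat<3, 4, CellOrder::DepthMajor>, the coefficient at (w=1, d=2)
// is stored at offset w + d * kWidth == 1 + 2*3 == 7, which is exactly
// where the '7' sits in that picture.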
// END code copied from gemmlowp/internal/kernel.h
#ifdef __arm__
// This is the current standard kernel in gemmlowp, see:
// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L33
struct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators {
typedef std::uint8_t OperandType;
typedef std::uint32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
Format;
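// Note: this kernel consumes two levels of depth per loop iteration
// (Format::kDepth == 2), so the caller must pass a depth that is a
// positive multiple of 2; the 'subs %[depth], #2' / branch sequence
// below would not terminate otherwise.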
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load 1 Rhs cell of size 2x4
"vld1.8 {d0}, [%[rhs_ptr]]!\n"
// Load 3 Lhs cells of size 4x2 each
"vld1.8 {d2}, [%[lhs_ptr]]!\n"
"vld1.8 {d4}, [%[lhs_ptr]]!\n"
"vld1.8 {d6}, [%[lhs_ptr]]!\n"
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
"subs %[depth], #2\n"
"beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Overview of register layout:
//
// A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
// A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
// (q1--q3).
// A 12x4 block of accumulators is stored in 32bit in q4--q15.
//
// +-----+-----+-----+-----+
// |d0[0]|d0[1]|d0[2]|d0[3]|
// Rhs +-----+-----+-----+-----+
// |d1[0]|d1[1]|d1[2]|d1[3]|
// +-----+-----+-----+-----+
//
// | | | | |
//
// Lhs | | | | |
//
// +--+--+ - - - - +-----+-----+-----+-----+
// |d2|d3| | q4 | q5 | q6 | q7 |
// |d2|d3| | q4 | q5 | q6 | q7 |
// |d2|d3| | q4 | q5 | q6 | q7 |
// |d2|d3| | q4 | q5 | q6 | q7 |
// +--+--+ - - - - +-----+-----+-----+-----+
// |d4|d5| | q8 | q9 | q10 | q11 |
// |d4|d5| | q8 | q9 | q10 | q11 |
// |d4|d5| | q8 | q9 | q10 | q11 |
// |d4|d5| | q8 | q9 | q10 | q11 |
// +--+--+ - - - - +-----+-----+-----+-----+
// |d6|d7| | q12 | q13 | q14 | q15 |
// |d6|d7| | q12 | q13 | q14 | q15 |
// |d6|d7| | q12 | q13 | q14 | q15 |
// |d6|d7| | q12 | q13 | q14 | q15 |
// +--+--+ - - - - +-----+-----+-----+-----+
//
// Accumulator
// Expand Lhs/Rhs cells to 16 bit.
// Note: moving these vmovl's further down to allow for
// longer data pipelining helps a little on A57 but is
// harmful on A53; it looks as if A53 doesn't like
// interleaving vmovl's into the vmlal's.
"vmovl.u8 q0, d0\n"
"vmovl.u8 q1, d2\n"
"vmovl.u8 q2, d4\n"
"vmovl.u8 q3, d6\n"
// Multiply-accumulate, level of depth 0
"vmlal.u16 q4, d2, d0[0]\n"
"vmlal.u16 q5, d2, d0[1]\n"
"vmlal.u16 q6, d2, d0[2]\n"
"vmlal.u16 q7, d2, d0[3]\n"
"vldr d2, [%[lhs_ptr]]\n"
"vmlal.u16 q8, d4, d0[0]\n"
"vmlal.u16 q9, d4, d0[1]\n"
"vmlal.u16 q10, d4, d0[2]\n"
"vmlal.u16 q11, d4, d0[3]\n"
"vldr d4, [%[lhs_ptr], #8]\n"
"vmlal.u16 q12, d6, d0[0]\n"
"vmlal.u16 q13, d6, d0[1]\n"
"vmlal.u16 q14, d6, d0[2]\n"
"vmlal.u16 q15, d6, d0[3]\n"
"vldr d6, [%[lhs_ptr], #16]\n"
"vldr d0, [%[rhs_ptr]]\n"
// Multiply-accumulate, level of depth 1
"vmlal.u16 q4, d3, d1[0]\n"
"vmlal.u16 q5, d3, d1[1]\n"
"add %[lhs_ptr], #24\n"
"vmlal.u16 q6, d3, d1[2]\n"
"vmlal.u16 q7, d3, d1[3]\n"
"add %[rhs_ptr], #8\n"
"vmlal.u16 q8, d5, d1[0]\n"
"vmlal.u16 q9, d5, d1[1]\n"
"subs %[depth], #2\n"
"vmlal.u16 q10, d5, d1[2]\n"
"vmlal.u16 q11, d5, d1[3]\n"
"vmlal.u16 q12, d7, d1[0]\n"
"vmlal.u16 q13, d7, d1[1]\n"
"vmlal.u16 q14, d7, d1[2]\n"
"vmlal.u16 q15, d7, d1[3]\n"
"bne " GEMMLOWP_LABEL_LOOP "b\n"
GEMMLOWP_LABEL_AFTER_LOOP
":\n"
// Expand Lhs/Rhs cells to 16 bit.
"vmovl.u8 q0, d0\n"
"vmovl.u8 q1, d2\n"
"vmovl.u8 q2, d4\n"
"vmovl.u8 q3, d6\n"
// Multiply-accumulate, level of depth 0
"vmlal.u16 q4, d2, d0[0]\n"
"vmlal.u16 q5, d2, d0[1]\n"
"vmlal.u16 q6, d2, d0[2]\n"
"vmlal.u16 q7, d2, d0[3]\n"
"vmlal.u16 q8, d4, d0[0]\n"
"vmlal.u16 q9, d4, d0[1]\n"
"vmlal.u16 q10, d4, d0[2]\n"
"vmlal.u16 q11, d4, d0[3]\n"
"vmlal.u16 q12, d6, d0[0]\n"
"vmlal.u16 q13, d6, d0[1]\n"
"vmlal.u16 q14, d6, d0[2]\n"
"vmlal.u16 q15, d6, d0[3]\n"
// Multiply-accumulate, level of depth 1
"vmlal.u16 q4, d3, d1[0]\n"
"vmlal.u16 q5, d3, d1[1]\n"
"vmlal.u16 q6, d3, d1[2]\n"
"vmlal.u16 q7, d3, d1[3]\n"
"vmlal.u16 q8, d5, d1[0]\n"
"vmlal.u16 q9, d5, d1[1]\n"
"vmlal.u16 q10, d5, d1[2]\n"
"vmlal.u16 q11, d5, d1[3]\n"
"vmlal.u16 q12, d7, d1[0]\n"
"vmlal.u16 q13, d7, d1[1]\n"
"vmlal.u16 q14, d7, d1[2]\n"
"vmlal.u16 q15, d7, d1[3]\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// This is Maciek Chociej's fast kernel, which does not expand operands
// to 16 bit, from gemmlowp/meta/. Search for
// mul_3x8_3x8_int32_lhsadd_rhsadd
// in this file:
// https://raw.githubusercontent.com/google/gemmlowp/e4b9d858b6637d5d0058bfa3d869d2b95864251b/meta/single_thread_gemm.h
struct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand {
typedef std::uint8_t OperandType;
typedef std::uint32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1>,
KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Clear aggregators.
"vmov.i32 q0, #0\n"
"vmov.i32 q1, #0\n"
"vmov.i32 q2, #0\n"
"vmov.i32 q3, q0\n"
"vmov.i32 q4, q1\n"
"vmov.i32 q5, q2\n"
"vmov.i32 q6, q3\n"
"vmov.i32 q7, q4\n"
"vmov.i32 q8, q5\n"
// Loop head
GEMMLOWP_LABEL_LOOP
":\n"
// Subtract counter.
"subs %[depth], %[depth], #8\n"
"vld1.8 {d18, d19, d20}, [%[rhs_ptr]]!\n"
"vld1.8 {d21, d22, d23}, [%[lhs_ptr]]!\n"
"vmull.u8 q12, d18, d21\n"
"vmull.u8 q13, d18, d22\n"
"vmull.u8 q14, d18, d23\n"
"vmull.u8 q15, d19, d21\n"
"vpadal.u16 q0, q12\n"
"vpadal.u16 q1, q13\n"
"vpadal.u16 q2, q14\n"
"vpadal.u16 q3, q15\n"
"vmull.u8 q12, d19, d22\n"
"vmull.u8 q13, d19, d23\n"
"vmull.u8 q14, d20, d21\n"
"vmull.u8 q15, d20, d22\n"
"vmull.u8 q9, d20, d23\n"
"vpadal.u16 q4, q12\n"
"vpadal.u16 q5, q13\n"
"vpadal.u16 q6, q14\n"
"vpadal.u16 q7, q15\n"
"vpadal.u16 q8, q9\n"
// Loop branch
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Horizontal reduce aggregators, step 1
"vpadd.u32 d0, d0, d1\n"
"vpadd.u32 d2, d2, d3\n"
"vpadd.u32 d4, d4, d5\n"
"vpadd.u32 d6, d6, d7\n"
"vpadd.u32 d8, d8, d9\n"
"vpadd.u32 d10, d10, d11\n"
"vpadd.u32 d12, d12, d13\n"
"vpadd.u32 d14, d14, d15\n"
"vpadd.u32 d16, d16, d17\n"
// Horizontal reduce aggregators, step 2
"vpadd.u32 d0, d0, d2\n"
"vpadd.u32 d1, d4, d4\n"
"vpadd.u32 d6, d6, d8\n"
"vpadd.u32 d7, d10, d10\n"
"vpadd.u32 d12, d12, d14\n"
"vpadd.u32 d13, d16, d16\n"
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d2}, [r0]!\n"
"vld1.32 {d3[0]}, [r0]!\n"
"vld1.32 {d8}, [r0]!\n"
"vld1.32 {d9[0]}, [r0]!\n"
"vld1.32 {d14}, [r0]!\n"
"vld1.32 {d15[0]}, [r0]!\n"
// Accumulate
"vadd.s32 q0, q0, q1\n"
"vadd.s32 q3, q3, q4\n"
"vadd.s32 q6, q6, q7\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d0}, [r0]!\n"
"vst1.32 {d1[0]}, [r0]!\n"
"vst1.32 {d6}, [r0]!\n"
"vst1.32 {d7[0]}, [r0]!\n"
"vst1.32 {d12}, [r0]!\n"
"vst1.32 {d13[0]}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// Fast kernel operating on int8 operands.
// It is assumed that one of the two int8 operands only takes values
// in [-127, 127], while the other may freely range in [-128, 127].
// The issue with both operands taking the value -128 is that:
// -128*-128 + -128*-128 == 32768, which overflows int16.
// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16
// range. That is the basic idea of this kernel.
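// Spelled out: with a, c in [-127, 127] and b, d in [-128, 127],
// |a*b + c*d| <= 127*128 + 127*128 = 32512 < 32767, so every such sum
// fits in int16.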
struct NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits {
typedef std::int8_t OperandType;
typedef std::int32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
KernelSideFormat<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
std::size_t start_depth = 123;  // Nonzero, so the kernel takes the
                                // accumulate-into-existing-dst path below.
std::size_t run_depth = depth;
std::size_t dst_col_stride = 4;  // Not referenced by this standalone copy.
AccumulatorType* dst_ptr = accum_ptr;
asm volatile(
// Overview of register layout:
//
// A 2x16 block of Rhs is stored in 8 bit in d0--d3.
// A 4x16 block of Lhs is loaded in 8 bit through d4--d7, but that is
// only half of the register space it requires, so we cycle through
// these registers twice per iteration: only half of it, a 2x16 block,
// is held in d4--d7 at any given time.
//
// A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit
// components which need to be horizontally-added at the end)
//
// The Lhs vectors are multiplied by the Rhs vectors with a widening
// multiply over the 8 first levels of depth, producing int16x8
// vectors of products for each position in the accumulator matrix.
// Here comes the special trick: one operand is confined to [-127, 127]
// (see the comment above this struct), so each product is bounded by
// 127*128 = 16256 in absolute value, meaning that we can add two such
// values without any risk of overflowing int16.
// We thus proceed with the 8 next levels of depth, multiplying
// again Lhs by Rhs, accumulating into this existing int16x8 vector.
//
// Only then, having processed 16 levels of depth, do we need to
// horizontally add these int16x8 accumulators into the final
// int32x4 accumulators.
//
// As we do not have enough registers to store all 16 int16x8
// temporary-16bit-accumulators, we have them cycle through q4--q7.
//
//
// Register layout (ignoring the q4--q7 temporary 16bit accumulators):
//
// +----+----+
// | d0 | d2 |
// | . | . |
// | . | . |
// | . | . |
// Rhs +----+----+
// | d1 | d3 |
// | . | . |
// | . | . |
// | . | . |
// +----+----+
//
// | | |
//
// Lhs | | |
//
// +--------+--------+ - - - - +----+----+
// | d4 ... | d5 ... | | q8 | q9 |
// | d6 ... | d7 ... | | q10| q11|
// | d4 ... | d5 ... | | q12| q13|
// | d6 ... | d7 ... | | q14| q15|
// +--------+--------+ - - - - +----+----+
//
// Accumulator
//
// Clear accumulators, and, interleaved with it,
// initial loads of the first loop iteration,
// taken out of the loop so that in the loop itself we have
// optimal streaming of data from memory.
"vldr d0, [%[rhs_ptr], #0]\n"
"vmov.i32 q8, #0\n"
"vldr d4, [%[lhs_ptr], #0]\n"
"vmov.i32 q9, #0\n"
"vldr d2, [%[rhs_ptr], #16]\n"
"vmov.i32 q10, q8\n"
"vldr d6, [%[lhs_ptr], #16]\n"
"vmov.i32 q11, q8\n"
"vldr d1, [%[rhs_ptr], #8]\n"
"vmov.i32 q12, q8\n"
"vldr d5, [%[lhs_ptr], #8]\n"
"vmov.i32 q13, q8\n"
"vldr d3, [%[rhs_ptr], #24]\n"
"vmov.i32 q14, q8\n"
"vldr d7, [%[lhs_ptr], #24]\n"
"vmov.i32 q15, q8\n"
// General loop.
GEMMLOWP_LABEL_LOOP
":\n"
// Multiply 8 first levels of depth.
"vmull.s8 q4, d0, d4\n"
"add %[rhs_ptr], %[rhs_ptr], #32\n"
"vmull.s8 q5, d2, d4\n"
"vldr d4, [%[lhs_ptr], #32]\n"
"vmull.s8 q6, d0, d6\n"
"vmull.s8 q7, d2, d6\n"
"vldr d6, [%[lhs_ptr], #48]\n"
// Multiply-accumulate second-half, again into the same
// 16bit local accumulator registers. This is where we
// take advantage of having int8 instead of uint8 and therefore
// being able to accumulate two products into int16.
"vmlal.s8 q4, d1, d5\n"
"vmlal.s8 q5, d3, d5\n"
"vldr d5, [%[lhs_ptr], #40]\n"
"vmlal.s8 q6, d1, d7\n"
"vmlal.s8 q7, d3, d7\n"
"vldr d7, [%[lhs_ptr], #56]\n"
// Add pairwise, accumulate into 32-bit accumulators.
"vpadal.s16 q8, q4\n"
"add %[lhs_ptr], %[lhs_ptr], #64\n"
"vpadal.s16 q9, q5\n"
"subs %[run_depth], %[run_depth], #16\n"
"vpadal.s16 q10, q6\n"
"vpadal.s16 q11, q7\n"
"beq " GEMMLOWP_LABEL_AFTER_LOOP
"f\n"
// Multiply first half.
"vmull.s8 q4, d0, d4\n"
"vmull.s8 q5, d2, d4\n"
"vldr d4, [%[lhs_ptr], #0]\n"
"vmull.s8 q6, d0, d6\n"
"vldr d0, [%[rhs_ptr], #0]\n"
"vmull.s8 q7, d2, d6\n"
"vldr d2, [%[rhs_ptr], #16]\n"
// Multiply-accumulate second-half, again into the same
// 16bit local accumulator registers. This is where we
// take advantage of having int8 instead of uint8 and therefore
// being able to accumulate two products into int16.
"vmlal.s8 q4, d1, d5\n"
"vldr d6, [%[lhs_ptr], #16]\n"
"vmlal.s8 q5, d3, d5\n"
"vldr d5, [%[lhs_ptr], #8]\n"
"vmlal.s8 q6, d1, d7\n"
"vldr d1, [%[rhs_ptr], #8]\n"
"vmlal.s8 q7, d3, d7\n"
"vldr d3, [%[rhs_ptr], #24]\n"
// Add pairwise, accumulate into 32-bit accumulators.
"vpadal.s16 q12, q4\n"
"vldr d7, [%[lhs_ptr], #24]\n"
"vpadal.s16 q13, q5\n"
"vpadal.s16 q14, q6\n"
"vpadal.s16 q15, q7\n"
"b " GEMMLOWP_LABEL_LOOP "b\n"
GEMMLOWP_LABEL_AFTER_LOOP
":\n"
// Multiply first half.
"vmull.s8 q4, d0, d4\n"
"vmull.s8 q5, d2, d4\n"
"vmull.s8 q6, d0, d6\n"
"vmull.s8 q7, d2, d6\n"
// Multiply-accumulate second-half, again into the same
// 16bit local accumulator registers. This is where we
// take advantage of having int8 instead of uint8 and therefore
// being able to accumulate two products into int16.
"vmlal.s8 q4, d1, d5\n"
"vmlal.s8 q5, d3, d5\n"
"vmlal.s8 q6, d1, d7\n"
"vmlal.s8 q7, d3, d7\n"
// Add pairwise, accumulate into 32-bit accumulators.
"vpadal.s16 q12, q4\n"
"vpadal.s16 q13, q5\n"
"vpadal.s16 q14, q6\n"
"vpadal.s16 q15, q7\n"
"cmp %[start_depth], #0\n"
// Reduce 32bit accumulators horizontally.
"vpadd.s32 d0, d16, d17\n"
"vpadd.s32 d1, d18, d19\n"
"vpadd.s32 d2, d20, d21\n"
"vpadd.s32 d3, d22, d23\n"
"vpadd.s32 d4, d24, d25\n"
"vpadd.s32 d5, d26, d27\n"
"vpadd.s32 d6, d28, d29\n"
"vpadd.s32 d7, d30, d31\n"
"bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
"f\n"
// Reduce 32bit accumulators horizontally, second pass
// (each pass adds pairwise. we need to add 4-wise).
"vpadd.s32 d8, d0, d2\n"
"vpadd.s32 d9, d4, d6\n"
"vpadd.s32 d10, d1, d3\n"
"vpadd.s32 d11, d5, d7\n"
"b " GEMMLOWP_LABEL_STORE "f\n"
GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
":\n"
// Reduce 32bit accumulators horizontally, second pass
// (each pass adds pairwise. we need to add 4-wise),
// and load destination values from memory.
"mov r0, %[dst_ptr]\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vpadd.s32 d8, d0, d2\n"
"vpadd.s32 d9, d4, d6\n"
"vld1.32 {d18, d19}, [r0]\n"
"vpadd.s32 d10, d1, d3\n"
"vpadd.s32 d11, d5, d7\n"
// Add horizontally-reduced accumulators into
// the values loaded from memory
"vadd.s32 q4, q8, q4\n"
"vadd.s32 q5, q9, q5\n"
GEMMLOWP_LABEL_STORE
":\n"
// Store back into memory
"mov r0, %[dst_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth)
: // inputs
[start_depth] "r"(start_depth)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// We don't actually use int32*int32 in production. This is just an
// experiment to help dissociate the effect of integer-vs-float from the
// effect of operand width.
struct NEON_32bit_GEMM_Int32_WithScalar {
typedef std::int32_t OperandType;
typedef std::int32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Load 1 Rhs cell of size 1x4
"vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
// Load 3 Lhs cells of size 4x1 each
"vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
"vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
// Multiply-accumulate
"vmla.s32 q4, q1, d0[0]\n"
"vmla.s32 q5, q1, d0[1]\n"
"vmla.s32 q6, q1, d1[0]\n"
"vmla.s32 q7, q1, d1[1]\n"
"vmla.s32 q8, q2, d0[0]\n"
"vmla.s32 q9, q2, d0[1]\n"
"vmla.s32 q10, q2, d1[0]\n"
"vmla.s32 q11, q2, d1[1]\n"
"vmla.s32 q12, q3, d0[0]\n"
"vmla.s32 q13, q3, d0[1]\n"
"vmla.s32 q14, q3, d1[0]\n"
"vmla.s32 q15, q3, d1[1]\n"
// Loop. Decrement loop index (depth) by 1, since we just handled 1
// level of depth.
"subs %[depth], #1\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// Not a very efficient kernel, just an experiment to see what we can do
// without using NEON multiply-with-scalar instructions.
struct NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Load 3 Lhs cells of size 4x1 each
"vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
"vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
// Multiply-accumulate
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vmla.f32 q4, q1, q0\n"
"vmla.f32 q8, q2, q0\n"
"vmla.f32 q12, q3, q0\n"
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vmla.f32 q5, q1, q0\n"
"vmla.f32 q9, q2, q0\n"
"vmla.f32 q13, q3, q0\n"
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vmla.f32 q6, q1, q0\n"
"vmla.f32 q10, q2, q0\n"
"vmla.f32 q14, q3, q0\n"
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vmla.f32 q7, q1, q0\n"
"vmla.f32 q11, q2, q0\n"
"vmla.f32 q15, q3, q0\n"
// Loop. Decrement loop index (depth) by 1, since we just handled 1
// level of depth.
"subs %[depth], #1\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// Not a very efficient kernel, just an experiment to see what we can do
// without using NEON multiply-with-scalar instructions.
// This variant is relevant because ARMv7 FMA does not have a with-scalar
// variant.
struct NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Load 3 Lhs cells of size 4x1 each
"vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
"vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
// Multiply-accumulate
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vfma.f32 q4, q1, q0\n"
"vfma.f32 q8, q2, q0\n"
"vfma.f32 q12, q3, q0\n"
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vfma.f32 q5, q1, q0\n"
"vfma.f32 q9, q2, q0\n"
"vfma.f32 q13, q3, q0\n"
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vfma.f32 q6, q1, q0\n"
"vfma.f32 q10, q2, q0\n"
"vfma.f32 q14, q3, q0\n"
"vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
"vfma.f32 q7, q1, q0\n"
"vfma.f32 q11, q2, q0\n"
"vfma.f32 q15, q3, q0\n"
// Loop. Decrement loop index (depth) by 1, since we just handled 1
// level of depth.
"subs %[depth], #1\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// This is the "most natural" kernel, using NEON multiply-with-scalar
// instructions.
struct NEON_32bit_GEMM_Float32_MLA_WithScalar {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Load 1 Rhs cell of size 1x4
"vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
// Load 3 Lhs cells of size 4x1 each
"vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
"vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
// Multiply-accumulate
"vmla.f32 q4, q1, d0[0]\n"
"vmla.f32 q5, q1, d0[1]\n"
"vmla.f32 q6, q1, d1[0]\n"
"vmla.f32 q7, q1, d1[1]\n"
"vmla.f32 q8, q2, d0[0]\n"
"vmla.f32 q9, q2, d0[1]\n"
"vmla.f32 q10, q2, d1[0]\n"
"vmla.f32 q11, q2, d1[1]\n"
"vmla.f32 q12, q3, d0[0]\n"
"vmla.f32 q13, q3, d0[1]\n"
"vmla.f32 q14, q3, d1[0]\n"
"vmla.f32 q15, q3, d1[1]\n"
// Loop. Decrement loop index (depth) by 1, since we just handled 1
// level of depth.
"subs %[depth], #1\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// Faster kernel contributed by ARM in 64bit form
// (see NEON_64bit_GEMM_Float32_WithScalar_A53) then ported to 32bit code.
// Tuned for A53.
struct NEON_32bit_GEMM_Float32_WithScalar_A53 {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
// Overview of register layout:
//
// A 1x4 cell of Rhs is stored in d0--d1 (q0).
// A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
// (q1--q3).
// A 12x4 block of accumulators is stored in q4--q15.
//
// +-----+-----+-----+-----+
// Rhs |d0[0]|d0[1]|d1[0]|d1[1]|
// +-----+-----+-----+-----+
//
// | | | | |
//
// Lhs | | | | |
//
// +--+- - - - - - +-----+-----+-----+-----+
// |d2| | q4 | q5 | q6 | q7 |
// |d2| | q4 | q5 | q6 | q7 |
// |d3| | q4 | q5 | q6 | q7 |
// |d3| | q4 | q5 | q6 | q7 |
// +--+- - - - - - +-----+-----+-----+-----+
// |d4| | q8 | q9 | q10 | q11 |
// |d4| | q8 | q9 | q10 | q11 |
// |d5| | q8 | q9 | q10 | q11 |
// |d5| | q8 | q9 | q10 | q11 |
// +--+ - - - - - - +-----+-----+-----+-----+
// |d6| | q12 | q13 | q14 | q15 |
// |d6| | q12 | q13 | q14 | q15 |
// |d7| | q12 | q13 | q14 | q15 |
// |d7| | q12 | q13 | q14 | q15 |
// +--+- - - - - - +-----+-----+-----+-----+
//
// Accumulator
// Load Rhs cell
"vldr d0, [%[rhs_ptr]]\n"
"ldr r2, [%[rhs_ptr], #8]\n"
"ldr r3, [%[rhs_ptr], #12]\n"
// Load 1st Lhs Cell
"vld1.32 {d2, d3}, [%[lhs_ptr]]\n"
GEMMLOWP_LABEL_LOOP
":\n"
"vldr d4, [%[lhs_ptr], #16]\n" // Load 1st half of 2nd Lhs cell
"vmov d1, r2, r3\n" // Prepare 2nd half of Rhs cell
"vmla.f32 q4, q1, d0[0]\n" // Multiply 1st Lhs cell with column 0
"ldr r2, [%[lhs_ptr], #24]\n" // Load 2nd half of 2nd Lhs cell, part 1
"vmla.f32 q5, q1, d0[1]\n" // Multiply 1st Lhs cell with column 1
"ldr r3, [%[lhs_ptr], #28]\n" // Load 2nd half of 2nd Lhs cell, part 2
"vmla.f32 q6, q1, d1[0]\n" // Multiply 1st Lhs cell with column 2
"subs %[depth], #1\n"
"vldr d6, [%[lhs_ptr], #32]\n" // Load 1st half of 3rd Lhs cell
"vmov d5, r2, r3\n" // Prepare 2nd half of 2nd Lhs cell
"vmla.f32 q7, q1, d1[1]\n" // Multiply 1st Lhs cell with column 3
"ldr r2, [%[lhs_ptr], #40]\n" // Load 2nd half of 3rd Lhs cell, part 1
"vmla.f32 q8, q2, d0[0]\n" // Multiply 2nd Lhs cell with column 0
"ldr r3, [%[lhs_ptr], #44]\n" // Load 2nd half of 3rd Lhs cell, part 2
"vmla.f32 q9, q2, d0[1]\n" // Multiply 2nd Lhs cell with column 1
"add %[rhs_ptr], %[rhs_ptr], #16\n" // Move forward by 1 Rhs cell
"vldr d2, [%[lhs_ptr], #48]\n" // Load 1st half of 1st Lhs cell of next
// iteration
"vmov d7, r2, r3\n" // Prepare 2nd half of 3rd Lhs cell
"vmla.f32 q10, q2, d1[0]\n" // Multiply 2nd Lhs cell with column 2
"ldr r2, [%[lhs_ptr], #56]\n" // Load 2nd half of 1st Lhs cell of next
// iter, part 1
"vmla.f32 q12, q3, d0[0]\n" // Multiply 3rd Lhs cell with column 0
"ldr r3, [%[lhs_ptr], #60]\n" // Load 2nd half of 1st Lhs cell of next
// iter, part 2
"vmla.f32 q13, q3, d0[1]\n" // Multiply 3rd Lhs cell with column 1
"add %[lhs_ptr], %[lhs_ptr], #48\n" // Move forward by 3 Lhs cells
"vldr d0, [%[rhs_ptr]]\n" // Load 1st half of Rhs cell of next
// iteration
"vmov d3, r2, r3\n" // Prepare 2nd half of 1st Lhs cell of next
// iteration
"vmla.f32 q11, q2, d1[1]\n" // Multiply 2nd Lhs cell with column 3
"ldr r2, [%[rhs_ptr], #8]\n" // Load 2nd half of Rhs cell of next
// iteration, part 1
"vmla.f32 q14, q3, d1[0]\n" // Multiply 3rd Lhs cell with column 2
"ldr r3, [%[rhs_ptr], #12]\n" // Load 2nd half of Rhs cell of next
// iteration, part 2
"vmla.f32 q15, q3, d1[1]\n" // Multiply 3rd Lhs cell with column 3
// Loop branch. This will dual issue in fmla cycle 3 of the 4th block.
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
"d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
"d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
"d27", "d28", "d29", "d30", "d31");
}
};
struct NEON_32bit_GEMM_Float32_WithScalar_A53_depth2 {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
Format;
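// As the _depth2 suffix indicates, this variant consumes two levels of
// depth per loop iteration, so here too depth must be a positive
// multiple of 2.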
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
// Overview of register layout:
//
// A 1x4 cell of Rhs is stored in d0--d1 (q0).
// A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
// (q1--q3).
// A 12x4 block of accumulators is stored in q4--q15.
//
// +-----+-----+-----+-----+
// Rhs |d0[0]|d0[1]|d1[0]|d1[1]|
// +-----+-----+-----+-----+
//
// | | | | |
//
// Lhs | | | | |
//
// +--+- - - - - - +-----+-----+-----+-----+
// |d2| | q4 | q5 | q6 | q7 |
// |d2| | q4 | q5 | q6 | q7 |
// |d3| | q4 | q5 | q6 | q7 |
// |d3| | q4 | q5 | q6 | q7 |
// +--+- - - - - - +-----+-----+-----+-----+
// |d4| | q8 | q9 | q10 | q11 |
// |d4| | q8 | q9 | q10 | q11 |
// |d5| | q8 | q9 | q10 | q11 |
// |d5| | q8 | q9 | q10 | q11 |
// +--+ - - - - - - +-----+-----+-----+-----+
// |d6| | q12 | q13 | q14 | q15 |
// |d6| | q12 | q13 | q14 | q15 |
// |d7| | q12 | q13 | q14 | q15 |
// |d7| | q12 | q13 | q14 | q15 |
// +--+- - - - - - +-----+-----+-----+-----+
//
// Accumulator
// Load Rhs cell
"vldr d0, [%[rhs_ptr]]\n"
"ldr r2, [%[rhs_ptr], #8]\n"
"ldr r3, [%[rhs_ptr], #12]\n"
// Load 1st Lhs Cell
"vld1.32 {d2, d3}, [%[lhs_ptr]]\n"
// Loop head - handling 2 levels of depth at once
GEMMLOWP_LABEL_LOOP
":\n"
// Level of depth 1
"vldr d4, [%[lhs_ptr], #32]\n" // Load 1st half of 2nd Lhs cell
"vmov d1, r2, r3\n" // Prepare 2nd half of Rhs cell
"vmla.f32 q4, q1, d0[0]\n" // Multiply 1st Lhs cell with column 0
"ldr r2, [%[lhs_ptr], #40]\n" // Load 2nd half of 2nd Lhs cell, part 1
"vmla.f32 q5, q1, d0[1]\n" // Multiply 1st Lhs cell with column 1
"ldr r3, [%[lhs_ptr], #44]\n" // Load 2nd half of 2nd Lhs cell, part 2
"vmla.f32 q6, q1, d1[0]\n" // Multiply 1st Lhs cell with column 2
"vldr d6, [%[lhs_ptr], #64]\n" // Load 1st half of 3rd Lhs cell
"vmov d5, r2, r3\n" // Prepare 2nd half of 2nd Lhs cell
"vmla.f32 q7, q1, d1[1]\n" // Multiply 1st Lhs cell with column 3
"ldr r2, [%[lhs_ptr], #72]\n" // Load 2nd half of 3rd Lhs cell, part 1
"vmla.f32 q8, q2, d0[0]\n" // Multiply 2nd Lhs cell with column 0
"ldr r3, [%[lhs_ptr], #76]\n" // Load 2nd half of 3rd Lhs cell, part 2
"vmla.f32 q9, q2, d0[1]\n" // Multiply 2nd Lhs cell with column 1
"vldr d2, [%[lhs_ptr], #16]\n" // Load 1st half of 1st Lhs cell of next
// iteration
"vmov d7, r2, r3\n" // Prepare 2nd half of 3rd Lhs cell
"vmla.f32 q10, q2, d1[0]\n" // Multiply 2nd Lhs cell with column 2
"ldr r2, [%[lhs_ptr], #24]\n" // Load 2nd half of 1st Lhs cell of next
// iter, part 1
"vmla.f32 q12, q3, d0[0]\n" // Multiply 3rd Lhs cell with column 0
"ldr r3, [%[lhs_ptr], #28]\n" // Load 2nd half of 1st Lhs cell of next
// iter, part 2
"vmla.f32 q13, q3, d0[1]\n" // Multiply 3rd Lhs cell with column 1
"vldr d0, [%[rhs_ptr], #16]\n" // Load 1st half of Rhs cell of next
// iteration
"vmov d3, r2, r3\n" // Prepare 2nd half of 1st Lhs cell of next
// iteration
"vmla.f32 q11, q2, d1[1]\n" // Multiply 2nd Lhs cell with column 3
"ldr r2, [%[rhs_ptr], #24]\n" // Load 2nd half of Rhs cell of next
// iteration, part 1
"vmla.f32 q14, q3, d1[0]\n" // Multiply 3rd Lhs cell with column 2
"ldr r3, [%[rhs_ptr], #28]\n" // Load 2nd half of Rhs cell of next
// iteration, part 2
"vmla.f32 q15, q3, d1[1]\n" // Multiply 3rd Lhs cell with column 3
// Level of depth 2
"vldr d4, [%[lhs_ptr], #48]\n" // Load 1st half of 2nd Lhs cell
"vmov d1, r2, r3\n" // Prepare 2nd half of Rhs cell
"vmla.f32 q4, q1, d0[0]\n" // Multiply 1st Lhs cell with column 0
"ldr r2, [%[lhs_ptr], #56]\n" // Load 2nd half of 2nd Lhs cell, part 1
"vmla.f32 q5, q1, d0[1]\n" // Multiply 1st Lhs cell with column 1
"ldr r3, [%[lhs_ptr], #60]\n" // Load 2nd half of 2nd Lhs cell, part 2
"vmla.f32 q6, q1, d1[0]\n" // Multiply 1st Lhs cell with column 2
"subs %[depth], #2\n" // Decrement depth counter
"vldr d6, [%[lhs_ptr], #80]\n" // Load 1st half of 3rd Lhs cell
"vmov d5, r2, r3\n" // Prepare 2nd half of 2nd Lhs cell
"vmla.f32 q7, q1, d1[1]\n" // Multiply 1st Lhs cell with column 3
"ldr r2, [%[lhs_ptr], #88]\n" // Load 2nd half of 3rd Lhs cell, part 1
"vmla.f32 q8, q2, d0[0]\n" // Multiply 2nd Lhs cell with column 0
"ldr r3, [%[lhs_ptr], #92]\n" // Load 2nd half of 3rd Lhs cell, part 2
"vmla.f32 q9, q2, d0[1]\n" // Multiply 2nd Lhs cell with column 1
"add %[rhs_ptr], %[rhs_ptr], #32\n" // Move forward by 1 Rhs cell
"vldr d2, [%[lhs_ptr], #96]\n" // Load 1st half of 1st Lhs cell of next
// iteration
"vmov d7, r2, r3\n" // Prepare 2nd half of 3rd Lhs cell
"vmla.f32 q10, q2, d1[0]\n" // Multiply 2nd Lhs cell with column 2
"ldr r2, [%[lhs_ptr], #104]\n" // Load 2nd half of 1st Lhs cell of next
// iter, part 1
"vmla.f32 q12, q3, d0[0]\n" // Multiply 3rd Lhs cell with column 0
"ldr r3, [%[lhs_ptr], #108]\n" // Load 2nd half of 1st Lhs cell of next
// iter, part 2
"vmla.f32 q13, q3, d0[1]\n" // Multiply 3rd Lhs cell with column 1
"add %[lhs_ptr], %[lhs_ptr], #96\n" // Move forward by 3 Lhs cells
"vldr d0, [%[rhs_ptr]]\n" // Load 1st half of Rhs cell of next
// iteration
"vmov d3, r2, r3\n" // Prepare 2nd half of 1st Lhs cell of next
// iteration
"vmla.f32 q11, q2, d1[1]\n" // Multiply 2nd Lhs cell with column 3
"ldr r2, [%[rhs_ptr], #8]\n" // Load 2nd half of Rhs cell of next
// iteration, part 1
"vmla.f32 q14, q3, d1[0]\n" // Multiply 3rd Lhs cell with column 2
"ldr r3, [%[rhs_ptr], #12]\n" // Load 2nd half of Rhs cell of next
// iteration, part 2
"vmla.f32 q15, q3, d1[1]\n" // Multiply 3rd Lhs cell with column 3
// Loop branch. This will dual issue in fmla cycle 3 of the 4th block.
//"bne loop_%=\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
"d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
"d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
"d27", "d28", "d29", "d30", "d31");
}
};
// This rotating variant performs well when permutations (vext) can be
// dual-issued with arithmetic instructions.
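// The idea, in outline: the accumulator cells are put into a rotated
// layout (ROTATE_ACCUMULATOR_CELLS(1, 2, 3) before the loop) so that the
// inner loop can use plain q-by-q vmla against an Rhs vector that is
// itself rotated one lane at a time (vext) between multiply groups;
// ROTATE_ACCUMULATOR_CELLS(3, 2, 1) after the loop undoes the rotation
// before the accumulators are stored back.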
struct NEON_32bit_GEMM_Float32_MLA_Rotating {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
#define NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS \
"vtrn.32 q4, q5\n" \
"vtrn.32 q6, q7\n" \
"vswp d9, d12\n" \
"vswp d11, d14\n" \
"vtrn.32 q8, q9\n" \
"vtrn.32 q10, q11\n" \
"vswp d17, d20\n" \
"vswp d19, d22\n" \
"vtrn.32 q12, q13\n" \
"vtrn.32 q14, q15\n" \
"vswp d25, d28\n" \
"vswp d27, d30\n"
#define NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(a, b, c) \
NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS \
"vext.32 q5, q5, q5, #" #a \
"\n" \
"vext.32 q6, q6, q6, #" #b \
"\n" \
"vext.32 q7, q7, q7, #" #c \
"\n" \
"vext.32 q9, q9, q9, #" #a \
"\n" \
"vext.32 q10, q10, q10, #" #b \
"\n" \
"vext.32 q11, q11, q11, #" #c \
"\n" \
"vext.32 q13, q13, q13, #" #a \
"\n" \
"vext.32 q14, q14, q14, #" #b \
"\n" \
"vext.32 q15, q15, q15, #" #c \
"\n" NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS
NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3)
//"loop_%=:\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Load 1 Rhs cell of size 1x4
"vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
// Load 3 Lhs cells of size 4x1 each
"vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
"vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
// Multiply-accumulate
"vmla.f32 q4, q1, q0\n"
"vmla.f32 q8, q2, q0\n"
"vmla.f32 q12, q3, q0\n"
"vext.f32 q0, q0, q0, #1\n"
"vmla.f32 q5, q1, q0\n"
"vmla.f32 q9, q2, q0\n"
"vmla.f32 q13, q3, q0\n"
"vext.f32 q0, q0, q0, #1\n"
"vmla.f32 q6, q1, q0\n"
"vmla.f32 q10, q2, q0\n"
"vmla.f32 q14, q3, q0\n"
"vext.f32 q0, q0, q0, #1\n"
"vmla.f32 q7, q1, q0\n"
"vmla.f32 q11, q2, q0\n"
"vmla.f32 q15, q3, q0\n"
// Loop. Decrement loop index (depth) by 1, since we just handled 1
// level of depth.
"subs %[depth], #1\n"
//"bne loop_%=\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Store accumulators
"mov r0, %[accum_ptr]\n"
NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1)
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
// This rotating variant performs well when permutations (vext) can be
// dual-issued with arithmetic instructions. It is relevant because the
// rotating approach removes the need for multiply-with-scalar instructions,
// and ARMv7 FMA does not have a with-scalar variant.
struct NEON_32bit_GEMM_Float32_FMA_Rotating {
typedef float OperandType;
typedef float AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load accumulators
"mov r0, %[accum_ptr]\n"
"vld1.32 {d8, d9}, [r0]!\n"
"vld1.32 {d16, d17}, [r0]!\n"
"vld1.32 {d24, d25}, [r0]!\n"
"vld1.32 {d10, d11}, [r0]!\n"
"vld1.32 {d18, d19}, [r0]!\n"
"vld1.32 {d26, d27}, [r0]!\n"
"vld1.32 {d12, d13}, [r0]!\n"
"vld1.32 {d20, d21}, [r0]!\n"
"vld1.32 {d28, d29}, [r0]!\n"
"vld1.32 {d14, d15}, [r0]!\n"
"vld1.32 {d22, d23}, [r0]!\n"
"vld1.32 {d30, d31}, [r0]!\n"
NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3)
//"loop_%=:\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Load 1 Rhs cell of size 1x4
"vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
// Load 3 Lhs cells of size 4x1 each
"vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
"vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
// Multiply-accumulate
"vfma.f32 q4, q1, q0\n"
"vfma.f32 q8, q2, q0\n"
"vfma.f32 q12, q3, q0\n"
"vext.f32 q0, q0, q0, #1\n"
"vfma.f32 q5, q1, q0\n"
"vfma.f32 q9, q2, q0\n"
"vfma.f32 q13, q3, q0\n"
"vext.f32 q0, q0, q0, #1\n"
"vfma.f32 q6, q1, q0\n"
"vfma.f32 q10, q2, q0\n"
"vfma.f32 q14, q3, q0\n"
"vext.f32 q0, q0, q0, #1\n"
"vfma.f32 q7, q1, q0\n"
"vfma.f32 q11, q2, q0\n"
"vfma.f32 q15, q3, q0\n"
// Loop. Decrement loop index (depth) by 1, since we just handled 1
// level of depth.
"subs %[depth], #1\n"
//"bne loop_%=\n"
"bne " GEMMLOWP_LABEL_LOOP "b\n"
NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1)
// Store accumulators
"mov r0, %[accum_ptr]\n"
"vst1.32 {d8, d9}, [r0]!\n"
"vst1.32 {d16, d17}, [r0]!\n"
"vst1.32 {d24, d25}, [r0]!\n"
"vst1.32 {d10, d11}, [r0]!\n"
"vst1.32 {d18, d19}, [r0]!\n"
"vst1.32 {d26, d27}, [r0]!\n"
"vst1.32 {d12, d13}, [r0]!\n"
"vst1.32 {d20, d21}, [r0]!\n"
"vst1.32 {d28, d29}, [r0]!\n"
"vst1.32 {d14, d15}, [r0]!\n"
"vst1.32 {d22, d23}, [r0]!\n"
"vst1.32 {d30, d31}, [r0]!\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31");
}
};
#endif // __arm__
#ifdef __aarch64__
// This is the current standard kernel in gemmlowp, see:
// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L646
struct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators {
typedef std::uint8_t OperandType;
typedef std::uint32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
asm volatile(
// Load 1 Rhs cell of size 2x8
"ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
"ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
// Load 3 Lhs cells of size 4x2 each
"ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
"ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
"ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
"subs %w[depth], %w[depth], #2\n"
// Load accumulators
"mov x0, %[accum_ptr]\n"
"ld1 {v8.16b}, [x0], #16\n"
"ld1 {v16.16b}, [x0], #16\n"
"ld1 {v24.16b}, [x0], #16\n"
"ld1 {v9.16b}, [x0], #16\n"
"ld1 {v17.16b}, [x0], #16\n"
"ld1 {v25.16b}, [x0], #16\n"
"ld1 {v10.16b}, [x0], #16\n"
"ld1 {v18.16b}, [x0], #16\n"
"ld1 {v26.16b}, [x0], #16\n"
"ld1 {v11.16b}, [x0], #16\n"
"ld1 {v19.16b}, [x0], #16\n"
"ld1 {v27.16b}, [x0], #16\n"
"ld1 {v12.16b}, [x0], #16\n"
"ld1 {v20.16b}, [x0], #16\n"
"ld1 {v28.16b}, [x0], #16\n"
"ld1 {v13.16b}, [x0], #16\n"
"ld1 {v21.16b}, [x0], #16\n"
"ld1 {v29.16b}, [x0], #16\n"
"ld1 {v14.16b}, [x0], #16\n"
"ld1 {v22.16b}, [x0], #16\n"
"ld1 {v30.16b}, [x0], #16\n"
"ld1 {v15.16b}, [x0], #16\n"
"ld1 {v23.16b}, [x0], #16\n"
"ld1 {v31.16b}, [x0], #16\n"
"beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
//"loop_%=:\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Overview of register layout:
//
// A 2x8 block of 2 2x4 cells of Rhs is stored in 16 bit in v0--v1.
// A 12x2 block of 3 4x2 cells of Lhs is stored in 16 bit in v2--v4.
// A 12x8 block of accumulators is stored in 32 bit in v8--v31.
//
// +--------+--------+-----+--------+--------+
// |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] |
// Rhs +--------+--------+-----+--------+--------+
// |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] |
// +--------+--------+-----+--------+--------+
//
// | | | | | |
//
// Lhs | | | | | |
//
// +-------+-------+ - - +--------+--------+-----+--------+--------+
// |v2.h[0]|v2.h[4]| |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]|
// |v2.h[1]|v2.h[5]| |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]|
// |v2.h[2]|v2.h[6]| |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]|
// |v2.h[3]|v2.h[7]| |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]|
// +-------+-------+ - - +--------+--------+-----+--------+--------+
// |v3.h[0]|v3.h[4]| |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]|
// |v3.h[1]|v3.h[5]| |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]|
// |v3.h[2]|v3.h[6]| |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]|
// |v3.h[3]|v3.h[7]| |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]|
// +-------+-------+ - - +--------+--------+-----+--------+--------+
// |v4.h[0]|v4.h[4]| |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]|
// |v4.h[1]|v4.h[5]| |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]|
// |v4.h[2]|v4.h[6]| |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]|
// |v4.h[3]|v4.h[7]| |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]|
// +-------+-------+ - - +--------+--------+-----+--------+--------+
//
// Accumulator
// Expand Lhs/Rhs cells to 16 bit.
"uxtl v0.8h, v5.8b\n"
"ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
"uxtl v1.8h, v6.8b\n"
"ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
"uxtl v2.8h, v2.8b\n"
"uxtl v3.8h, v3.8b\n"
"uxtl v4.8h, v4.8b\n"
// Multiply-accumulate, top third
"umlal v8.4s, v2.4h, v0.h[0]\n"
"umlal v9.4s, v2.4h, v0.h[1]\n"
"umlal v10.4s, v2.4h, v0.h[2]\n"
"umlal v11.4s, v2.4h, v0.h[3]\n"
"umlal v12.4s, v2.4h, v1.h[0]\n"
"umlal v13.4s, v2.4h, v1.h[1]\n"
"umlal v14.4s, v2.4h, v1.h[2]\n"
"umlal v15.4s, v2.4h, v1.h[3]\n"
"umlal2 v8.4s, v2.8h, v0.h[4]\n"
"umlal2 v9.4s, v2.8h, v0.h[5]\n"
"umlal2 v10.4s, v2.8h, v0.h[6]\n"
"umlal2 v11.4s, v2.8h, v0.h[7]\n"
"umlal2 v12.4s, v2.8h, v1.h[4]\n"
"umlal2 v13.4s, v2.8h, v1.h[5]\n"
"umlal2 v14.4s, v2.8h, v1.h[6]\n"
"umlal2 v15.4s, v2.8h, v1.h[7]\n"
"ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
// Multiply-accumulate, middle third
"umlal v16.4s, v3.4h, v0.h[0]\n"
"umlal v17.4s, v3.4h, v0.h[1]\n"
"umlal v18.4s, v3.4h, v0.h[2]\n"
"umlal v19.4s, v3.4h, v0.h[3]\n"
"umlal v20.4s, v3.4h, v1.h[0]\n"
"umlal v21.4s, v3.4h, v1.h[1]\n"
"umlal v22.4s, v3.4h, v1.h[2]\n"
"umlal v23.4s, v3.4h, v1.h[3]\n"
"umlal2 v16.4s, v3.8h, v0.h[4]\n"
"umlal2 v17.4s, v3.8h, v0.h[5]\n"
"umlal2 v18.4s, v3.8h, v0.h[6]\n"
"umlal2 v19.4s, v3.8h, v0.h[7]\n"
"umlal2 v20.4s, v3.8h, v1.h[4]\n"
"umlal2 v21.4s, v3.8h, v1.h[5]\n"
"umlal2 v22.4s, v3.8h, v1.h[6]\n"
"umlal2 v23.4s, v3.8h, v1.h[7]\n"
"ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
"subs %w[depth], %w[depth], #2\n"
// Multiply-accumulate, bottom third
"umlal v24.4s, v4.4h, v0.h[0]\n"
"umlal v25.4s, v4.4h, v0.h[1]\n"
"umlal v26.4s, v4.4h, v0.h[2]\n"
"umlal v27.4s, v4.4h, v0.h[3]\n"
"umlal v28.4s, v4.4h, v1.h[0]\n"
"umlal v29.4s, v4.4h, v1.h[1]\n"
"umlal v30.4s, v4.4h, v1.h[2]\n"
"umlal v31.4s, v4.4h, v1.h[3]\n"
"umlal2 v24.4s, v4.8h, v0.h[4]\n"
"umlal2 v25.4s, v4.8h, v0.h[5]\n"
"umlal2 v26.4s, v4.8h, v0.h[6]\n"
"umlal2 v27.4s, v4.8h, v0.h[7]\n"
"umlal2 v28.4s, v4.8h, v1.h[4]\n"
"umlal2 v29.4s, v4.8h, v1.h[5]\n"
"umlal2 v30.4s, v4.8h, v1.h[6]\n"
"umlal2 v31.4s, v4.8h, v1.h[7]\n"
"ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
"bne " GEMMLOWP_LABEL_LOOP "b\n"
GEMMLOWP_LABEL_AFTER_LOOP
":\n"
// Expand Lhs/Rhs cells to 16 bit.
"uxtl v0.8h, v5.8b\n"
"uxtl v1.8h, v6.8b\n"
"uxtl v2.8h, v2.8b\n"
"uxtl v3.8h, v3.8b\n"
"uxtl v4.8h, v4.8b\n"
// Multiply-accumulate, level of depth 0
"umlal v8.4s, v2.4h, v0.h[0]\n"
"umlal v9.4s, v2.4h, v0.h[1]\n"
"umlal v10.4s, v2.4h, v0.h[2]\n"
"umlal v11.4s, v2.4h, v0.h[3]\n"
"umlal v12.4s, v2.4h, v1.h[0]\n"
"umlal v13.4s, v2.4h, v1.h[1]\n"
"umlal v14.4s, v2.4h, v1.h[2]\n"
"umlal v15.4s, v2.4h, v1.h[3]\n"
"umlal v16.4s, v3.4h, v0.h[0]\n"
"umlal v17.4s, v3.4h, v0.h[1]\n"
"umlal v18.4s, v3.4h, v0.h[2]\n"
"umlal v19.4s, v3.4h, v0.h[3]\n"
"umlal v20.4s, v3.4h, v1.h[0]\n"
"umlal v21.4s, v3.4h, v1.h[1]\n"
"umlal v22.4s, v3.4h, v1.h[2]\n"
"umlal v23.4s, v3.4h, v1.h[3]\n"
"umlal v24.4s, v4.4h, v0.h[0]\n"
"umlal v25.4s, v4.4h, v0.h[1]\n"
"umlal v26.4s, v4.4h, v0.h[2]\n"
"umlal v27.4s, v4.4h, v0.h[3]\n"
"umlal v28.4s, v4.4h, v1.h[0]\n"
"umlal v29.4s, v4.4h, v1.h[1]\n"
"umlal v30.4s, v4.4h, v1.h[2]\n"
"umlal v31.4s, v4.4h, v1.h[3]\n"
// Multiply-accumulate, level of depth 1
"umlal2 v8.4s, v2.8h, v0.h[4]\n"
"umlal2 v9.4s, v2.8h, v0.h[5]\n"
"umlal2 v10.4s, v2.8h, v0.h[6]\n"
"umlal2 v11.4s, v2.8h, v0.h[7]\n"
"umlal2 v12.4s, v2.8h, v1.h[4]\n"
"umlal2 v13.4s, v2.8h, v1.h[5]\n"
"umlal2 v14.4s, v2.8h, v1.h[6]\n"
"umlal2 v15.4s, v2.8h, v1.h[7]\n"
"umlal2 v16.4s, v3.8h, v0.h[4]\n"
"umlal2 v17.4s, v3.8h, v0.h[5]\n"
"umlal2 v18.4s, v3.8h, v0.h[6]\n"
"umlal2 v19.4s, v3.8h, v0.h[7]\n"
"umlal2 v20.4s, v3.8h, v1.h[4]\n"
"umlal2 v21.4s, v3.8h, v1.h[5]\n"
"umlal2 v22.4s, v3.8h, v1.h[6]\n"
"umlal2 v23.4s, v3.8h, v1.h[7]\n"
"umlal2 v24.4s, v4.8h, v0.h[4]\n"
"umlal2 v25.4s, v4.8h, v0.h[5]\n"
"umlal2 v26.4s, v4.8h, v0.h[6]\n"
"umlal2 v27.4s, v4.8h, v0.h[7]\n"
"umlal2 v28.4s, v4.8h, v1.h[4]\n"
"umlal2 v29.4s, v4.8h, v1.h[5]\n"
"umlal2 v30.4s, v4.8h, v1.h[6]\n"
"umlal2 v31.4s, v4.8h, v1.h[7]\n"
// Store accumulators
"mov x0, %[accum_ptr]\n"
"st1 {v8.16b}, [x0], #16\n"
"st1 {v16.16b}, [x0], #16\n"
"st1 {v24.16b}, [x0], #16\n"
"st1 {v9.16b}, [x0], #16\n"
"st1 {v17.16b}, [x0], #16\n"
"st1 {v25.16b}, [x0], #16\n"
"st1 {v10.16b}, [x0], #16\n"
"st1 {v18.16b}, [x0], #16\n"
"st1 {v26.16b}, [x0], #16\n"
"st1 {v11.16b}, [x0], #16\n"
"st1 {v19.16b}, [x0], #16\n"
"st1 {v27.16b}, [x0], #16\n"
"st1 {v12.16b}, [x0], #16\n"
"st1 {v20.16b}, [x0], #16\n"
"st1 {v28.16b}, [x0], #16\n"
"st1 {v13.16b}, [x0], #16\n"
"st1 {v21.16b}, [x0], #16\n"
"st1 {v29.16b}, [x0], #16\n"
"st1 {v14.16b}, [x0], #16\n"
"st1 {v22.16b}, [x0], #16\n"
"st1 {v30.16b}, [x0], #16\n"
"st1 {v15.16b}, [x0], #16\n"
"st1 {v23.16b}, [x0], #16\n"
"st1 {v31.16b}, [x0], #16\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[accum_ptr] "r"(accum_ptr)
: // clobbers
"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28", "v29", "v30", "v31");
}
};
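// For reference, a plain scalar sketch (ours; the benchmark harness has its
// own generic reference computation) of what the kernel above computes on its
// 12x8 block: two levels of depth per iteration, uint8 operands widened and
// multiply-accumulated into uint32. The packed layouts match the Format
// declared above: per 2 levels of depth, the Lhs stream holds 3 cells of
// 4 rows x 2 depths and the Rhs stream holds 2 cells of 4 columns x 2 depths.
inline void ReferenceUint8Kernel(const std::uint8_t* lhs_ptr,
                                 const std::uint8_t* rhs_ptr,
                                 std::uint32_t* accum_ptr, int depth) {
  for (int d = 0; d < depth; d += 2) {
    for (int dd = 0; dd < 2; dd++) {  // the two depth levels within the slice
      for (int c = 0; c < 8; c++) {
        const std::uint32_t rhs =
            rhs_ptr[(d / 2) * 16 + (c / 4) * 8 + dd * 4 + (c % 4)];
        for (int r = 0; r < 12; r++) {
          const std::uint32_t lhs =
              lhs_ptr[(d / 2) * 24 + (r / 4) * 8 + dd * 4 + (r % 4)];
          // Column-major 12x8 accumulator block, as stored by the kernel.
          accum_ptr[r + 12 * c] += lhs * rhs;
        }
      }
    }
  }
}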
// Faster kernel contributed by ARM. It does not expand operands to 16 bit
// before multiplication. Tuned for the Cortex-A57. Compare to
// NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand.
// A scalar sketch of the accumulation scheme follows this kernel.
struct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57 {
typedef std::uint8_t OperandType;
typedef std::uint32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<5, 16, CellOrder::WidthMajor>, 1>,
KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
static const int kLhsWidth = Format::Lhs::kWidth;
static const int kRhsWidth = Format::Rhs::kWidth;
AccumulatorType rowmajor_accumulator_buffer[kLhsWidth * kRhsWidth];
asm volatile(
// Clear aggregators
"dup v12.4s, wzr\n"
"dup v13.4s, wzr\n"
"dup v14.4s, wzr\n"
"dup v15.4s, wzr\n"
"dup v16.4s, wzr\n"
"dup v17.4s, wzr\n"
"dup v18.4s, wzr\n"
"dup v19.4s, wzr\n"
"dup v20.4s, wzr\n"
"dup v21.4s, wzr\n"
"dup v22.4s, wzr\n"
"dup v23.4s, wzr\n"
"dup v24.4s, wzr\n"
"dup v25.4s, wzr\n"
"dup v26.4s, wzr\n"
"dup v27.4s, wzr\n"
"dup v28.4s, wzr\n"
"dup v29.4s, wzr\n"
"dup v30.4s, wzr\n"
"dup v31.4s, wzr\n"
GEMMLOWP_LABEL_LOOP
":\n"
// Overview of register layout:
//
// A 4x16 block of Rhs is stored in 8 bit in v0--v3.
// A 5x16 block of Lhs is cycled through v4 and v5 in 8 bit.
//
// A 4x5 block of aggregators is stored in v12-v31 (as 4x32 bit
// components which need to be horizontally-added at the end).
//
// The Lhs vectors are multiplied by the Rhs vectors with a widening
// multiply to produce an intermediate result which is stored in
// v6-v11. Each intermediate result is 8x16 bits so this happens
// twice for each Lhs/Rhs combination (once with UMULL for elements
// 0-7 and once with UMULL2 for elements 8-15).
//
// UADALP is used to accumulate these intermediate results into the
// result aggregators.
//
//
//
// +--------+--------+--------+--------+
// |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
// Rhs +--------+--------+--------+--------+
// | ... | ... | ... | ... |
// +--------+--------+--------+--------|
// |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
// +--------+--------+--------+--------+
//
// | | | | |
//
// Lhs | | | | |
//
// +-------+-----+--------+ - - +--------+--------+--------+--------+
// |v4.b[0]| ... |v4.b[15]| | v12.4s | v13.4s | v14.4s | v15.4s |
// |v5.b[0]| ... |v5.b[15]| | v16.4s | v17.4s | v18.4s | v19.4s |
// |v4.b[0]| ... |v4.b[15]| | v20.4s | v21.4s | v22.4s | v23.4s |
// |v5.b[0]| ... |v5.b[15]| | v24.4s | v25.4s | v26.4s | v27.4s |
// |v4.b[0]| ... |v4.b[15]| | v28.4s | v29.4s | v30.4s | v31.4s |
// +-------+--------------+ - - +--------+--------+--------+--------+
//
// Accumulator
//
//
// Further possible optimisations (not tried):
// - Move early loads into the previous iteration (see Float32_WithScalar
//   for an example).
// - Unroll the loop 2x to alternate more smoothly between v4 and v5.
// - A different number of temporary registers might work better.
// - Pairing each umull with its corresponding umull2 might allow better
//   register loading (e.g. at the start of the loop).
// - Interleaving umull{2} and uadalp even more aggressively might help
//   (not sure about latency vs. dispatch rate).
//
//
// Start loading Rhs - further loads are interleaved amongst the
// multiplies for better dispatch on A57.
"ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
// Load first Lhs vector - further loads are interleaved amongst the
// multiplies
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"umull v6.8h, v0.8b, v4.8b\n"
"ld1 {v1.16b}, [%[rhs_ptr]], #16\n" // 2nd RHS element
"umull v7.8h, v1.8b, v4.8b\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n" // 3rd RHS element
"umull v8.8h, v2.8b, v4.8b\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n" // 4th RHS element
"umull v9.8h, v3.8b, v4.8b\n"
"umull2 v10.8h, v0.16b, v4.16b\n"
"umull2 v11.8h, v1.16b, v4.16b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n" // 2nd LHS element
"uadalp v12.4s, v6.8h\n"
"umull2 v6.8h, v2.16b, v4.16b\n"
"uadalp v13.4s, v7.8h\n"
"umull2 v7.8h, v3.16b, v4.16b\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // 1st LHS element done - Reuse v4
// for 3rd LHS element
"uadalp v14.4s, v8.8h\n"
"umull v8.8h, v0.8b, v5.8b\n"
"uadalp v15.4s, v9.8h\n"
"umull v9.8h, v1.8b, v5.8b\n"
"uadalp v12.4s, v10.8h\n"
"umull v10.8h, v2.8b, v5.8b\n"
"uadalp v13.4s, v11.8h\n"
"umull v11.8h, v3.8b, v5.8b\n"
"uadalp v14.4s, v6.8h\n"
"umull2 v6.8h, v0.16b, v5.16b\n"
"uadalp v15.4s, v7.8h\n"
"umull2 v7.8h, v1.16b, v5.16b\n"
"uadalp v16.4s, v8.8h\n"
"umull2 v8.8h, v2.16b, v5.16b\n"
"uadalp v17.4s, v9.8h\n"
"umull2 v9.8h, v3.16b, v5.16b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n" // 2nd LHS element done - Reuse v5
// for 4th LHS element
"uadalp v18.4s, v10.8h\n"
"umull v10.8h, v0.8b, v4.8b\n"
"uadalp v19.4s, v11.8h\n"
"umull v11.8h, v1.8b, v4.8b\n"
"uadalp v16.4s, v6.8h\n"
"umull v6.8h, v2.8b, v4.8b\n"
"uadalp v17.4s, v7.8h\n"
"umull v7.8h, v3.8b, v4.8b\n"
"uadalp v18.4s, v8.8h\n"
"umull2 v8.8h, v0.16b, v4.16b\n"
"uadalp v19.4s, v9.8h\n"
"umull2 v9.8h, v1.16b, v4.16b\n"
"uadalp v20.4s, v10.8h\n"
"umull2 v10.8h, v2.16b, v4.16b\n"
"uadalp v21.4s, v11.8h\n"
"umull2 v11.8h, v3.16b, v4.16b\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // 3rd LHS element done - Reuse v4
// for 5th LHS element
"uadalp v22.4s, v6.8h\n"
"umull v6.8h, v0.8b, v5.8b\n"
"uadalp v23.4s, v7.8h\n"
"umull v7.8h, v1.8b, v5.8b\n"
"uadalp v20.4s, v8.8h\n"
"umull v8.8h, v2.8b, v5.8b\n"
"uadalp v21.4s, v9.8h\n"
"umull v9.8h, v3.8b, v5.8b\n"
"uadalp v22.4s, v10.8h\n"
"umull2 v10.8h, v0.16b, v5.16b\n"
"uadalp v23.4s, v11.8h\n"
"umull2 v11.8h, v1.16b, v5.16b\n"
"uadalp v24.4s, v6.8h\n"
"umull2 v6.8h, v2.16b, v5.16b\n"
"uadalp v25.4s, v7.8h\n"
"umull2 v7.8h, v3.16b, v5.16b\n"
"uadalp v26.4s, v8.8h\n"
"umull v8.8h, v0.8b, v4.8b\n"
"uadalp v27.4s, v9.8h\n"
"umull v9.8h, v1.8b, v4.8b\n"
"uadalp v24.4s, v10.8h\n"
"umull v10.8h, v2.8b, v4.8b\n"
"uadalp v25.4s, v11.8h\n"
"umull v11.8h, v3.8b, v4.8b\n"
"uadalp v26.4s, v6.8h\n"
"umull2 v6.8h, v0.16b, v4.16b\n"
"uadalp v27.4s, v7.8h\n"
"umull2 v7.8h, v1.16b, v4.16b\n"
"uadalp v28.4s, v8.8h\n"
"umull2 v8.8h, v2.16b, v4.16b\n"
"uadalp v29.4s, v9.8h\n"
"umull2 v9.8h, v3.16b, v4.16b\n"
"uadalp v30.4s, v10.8h\n"
"uadalp v31.4s, v11.8h\n"
"uadalp v28.4s, v6.8h\n"
"uadalp v29.4s, v7.8h\n"
// Loop. Decrement loop index (depth) by 16, since we just handled
// 16 levels of depth. Do this subs a bit before the end of the loop
// for better dispatch on A57.
"subs %w[depth], %w[depth], #16\n"
"uadalp v30.4s, v8.8h\n"
"uadalp v31.4s, v9.8h\n"
"bne " GEMMLOWP_LABEL_LOOP
"b\n"
// Reduce aggregators horizontally
"addp v0.4s, v12.4s, v13.4s\n"
"addp v1.4s, v14.4s, v15.4s\n"
"addp v2.4s, v16.4s, v17.4s\n"
"addp v3.4s, v18.4s, v19.4s\n"
"addp v4.4s, v20.4s, v21.4s\n"
"addp v5.4s, v22.4s, v23.4s\n"
"addp v6.4s, v24.4s, v25.4s\n"
"addp v7.4s, v26.4s, v27.4s\n"
"addp v8.4s, v28.4s, v29.4s\n"
"addp v9.4s, v30.4s, v31.4s\n"
"addp v10.4s, v0.4s, v1.4s\n"
"addp v11.4s, v2.4s, v3.4s\n"
"addp v12.4s, v4.4s, v5.4s\n"
"addp v13.4s, v6.4s, v7.4s\n"
"addp v14.4s, v8.4s, v9.4s\n"
"mov x0, %[rowmajor_accumulator_buffer]\n"
"st1 {v10.16b}, [x0], #16\n"
"st1 {v11.16b}, [x0], #16\n"
"st1 {v12.16b}, [x0], #16\n"
"st1 {v13.16b}, [x0], #16\n"
"st1 {v14.16b}, [x0], #16\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[depth] "+r"(depth)
: // inputs
[rowmajor_accumulator_buffer] "r"(rowmajor_accumulator_buffer)
: // clobbers
"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28", "v29", "v30", "v31");
// accumulate row-major accumulators into global (column-major) accumulators
for (int l = 0; l < kLhsWidth; l++) {
for (int r = 0; r < kRhsWidth; r++) {
accum_ptr[l + kLhsWidth * r] +=
rowmajor_accumulator_buffer[r + l * kRhsWidth];
}
}
}
};
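// A scalar sketch (ours, for illustration) of the umull/uadalp scheme above,
// for one Lhs row against one Rhs column over one 16-depth slice: the 8-bit
// operands are multiplied directly with a widening multiply (umull for
// depths 0-7, umull2 for depths 8-15) and uadalp pairwise-adds the 16-bit
// products into a 32-bit aggregator lane; the operands themselves are never
// expanded to 16 bit.
inline std::uint32_t ReferenceNoExpandDotProduct(const std::uint8_t* lhs_row,
                                                 const std::uint8_t* rhs_col) {
  std::uint32_t aggregator = 0;  // stands for one lane of a vN.4s aggregator
  for (int d = 0; d < 16; d += 2) {
    // umull{,2}: 8 bit x 8 bit -> 16 bit, exact since 255*255 fits uint16.
    const std::uint16_t p0 = std::uint16_t(lhs_row[d] * rhs_col[d]);
    const std::uint16_t p1 = std::uint16_t(lhs_row[d + 1] * rhs_col[d + 1]);
    // uadalp: add an adjacent pair of 16-bit products into the 32-bit lane.
    aggregator += std::uint32_t(p0) + std::uint32_t(p1);
  }
  return aggregator;
}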
// Fast kernel operating on int8 operands.
// It is assumed that one of the two int8 operands only takes values
// in [-127, 127], while the other may freely range in [-128, 127].
// The issue if both operands could take the value -128 is that
// -128*-128 + -128*-128 == 32768 overflows int16 (wrapping to -32768).
// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16
// range. That is the basic idea of this kernel; a scalar sketch follows
// this struct.
struct NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits {
typedef std::int8_t OperandType;
typedef std::int32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
std::size_t start_depth = 123;
std::size_t run_depth = depth;
std::size_t dst_col_stride = 4;
AccumulatorType* dst_ptr = accum_ptr;
asm volatile(
// Overview of register layout:
//
// A 4x16 block of Rhs is stored in 8 bit in v0--v3.
// A 4x16 block of Lhs is stored in 8 bit in v4--v7.
//
// A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit
// components which need to be horizontally-added at the end)
//
// The Lhs vectors are multiplied by the Rhs vectors with a widening
// multiply over the 8 first levels of depth, producing int16x8
// vectors of products for each position in the accumulator matrix.
// Here comes the special trick: since one operand is restricted to
// [-127, 127] while the other ranges in [-128, 127], each product has
// magnitude at most 128*127 = 2^14 - 2^7, meaning that we can add two
// such values without any risk of overflowing int16.
// We thus proceed with the 8 next levels of depth, multiplying
// again Lhs by Rhs, accumulating into this existing int16x8 vector.
//
// Only then, having processed 16 levels of depth, do we need to
// horizontally add these int16x8 accumulators into the final
// int32x4 accumulators.
//
// As we do not have enough registers to store all 16 int16x8
// temporary-16bit-accumulators, we have them cycle through v8--v15.
//
//
// Register layout (ignoring the v8--v15 temporary 16bit accumulators):
//
// +--------+--------+--------+--------+
// |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
// Rhs +--------+--------+--------+--------+
// | ... | ... | ... | ... |
// +--------+--------+--------+--------|
// |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
// +--------+--------+--------+--------+
//
// | | | | |
//
// Lhs | | | | |
//
// +-------+-----+--------+ - - +--------+--------+--------+--------+
// |v4.b[0]| ... |v4.b[15]| | v16.4s | v17.4s | v18.4s | v19.4s |
// |v5.b[0]| ... |v5.b[15]| | v20.4s | v21.4s | v22.4s | v23.4s |
// |v6.b[0]| ... |v6.b[15]| | v24.4s | v25.4s | v26.4s | v27.4s |
// |v7.b[0]| ... |v7.b[15]| | v28.4s | v29.4s | v30.4s | v31.4s |
// +-------+--------------+ - - +--------+--------+--------+--------+
//
// Accumulator
//
// Clear accumulators
"ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
"dup v16.4s, wzr\n"
"ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
"dup v17.4s, wzr\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"dup v18.4s, wzr\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"dup v19.4s, wzr\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"dup v20.4s, wzr\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
"dup v21.4s, wzr\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
"dup v22.4s, wzr\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
"dup v23.4s, wzr\n"
"subs %[run_depth], %[run_depth], #16\n"
"dup v24.4s, wzr\n"
"mov x0, %[dst_ptr]\n"
"dup v25.4s, wzr\n"
"dup v26.4s, wzr\n"
"dup v27.4s, wzr\n"
"dup v28.4s, wzr\n"
"dup v29.4s, wzr\n"
"dup v30.4s, wzr\n"
"dup v31.4s, wzr\n"
"smull v12.8h, v0.8b, v4.8b\n"
"smull v13.8h, v1.8b, v4.8b\n"
"smull v14.8h, v0.8b, v5.8b\n"
"smull v15.8h, v1.8b, v5.8b\n"
"smlal2 v12.8h, v0.16b, v4.16b\n"
"smlal2 v13.8h, v1.16b, v4.16b\n"
"smlal2 v14.8h, v0.16b, v5.16b\n"
"smlal2 v15.8h, v1.16b, v5.16b\n"
"beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
GEMMLOWP_LABEL_LOOP
":\n"
"subs %[run_depth], %[run_depth], #16\n"
"sadalp v16.4s, v12.8h\n"
"smull v12.8h, v0.8b, v6.8b\n"
"sadalp v17.4s, v13.8h\n"
"smull v13.8h, v0.8b, v7.8b\n"
"sadalp v20.4s, v14.8h\n"
"smull v14.8h, v1.8b, v6.8b\n"
"sadalp v21.4s, v15.8h\n"
"smull v15.8h, v1.8b, v7.8b\n"
"smlal2 v12.8h, v0.16b, v6.16b\n"
"smlal2 v13.8h, v0.16b, v7.16b\n"
"ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v14.8h, v1.16b, v6.16b\n"
"smlal2 v15.8h, v1.16b, v7.16b\n"
"ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
"sadalp v24.4s, v12.8h\n"
"smull v12.8h, v2.8b, v4.8b\n"
"sadalp v28.4s, v13.8h\n"
"smull v13.8h, v3.8b, v4.8b\n"
"sadalp v25.4s, v14.8h\n"
"smull v14.8h, v2.8b, v5.8b\n"
"sadalp v29.4s, v15.8h\n"
"smull v15.8h, v3.8b, v5.8b\n"
"smlal2 v12.8h, v2.16b, v4.16b\n"
"smlal2 v13.8h, v3.16b, v4.16b\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v14.8h, v2.16b, v5.16b\n"
"smlal2 v15.8h, v3.16b, v5.16b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"sadalp v18.4s, v12.8h\n"
"smull v12.8h, v2.8b, v6.8b\n"
"sadalp v19.4s, v13.8h\n"
"smull v13.8h, v2.8b, v7.8b\n"
"sadalp v22.4s, v14.8h\n"
"smull v14.8h, v3.8b, v6.8b\n"
"sadalp v23.4s, v15.8h\n"
"smull v15.8h, v3.8b, v7.8b\n"
"smlal2 v12.8h, v2.16b, v6.16b\n"
"smlal2 v13.8h, v2.16b, v7.16b\n"
"ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v14.8h, v3.16b, v6.16b\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v15.8h, v3.16b, v7.16b\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
"sadalp v26.4s, v12.8h\n"
"ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
"smull v12.8h, v0.8b, v4.8b\n"
"sadalp v30.4s, v13.8h\n"
"smull v13.8h, v1.8b, v4.8b\n"
"sadalp v27.4s, v14.8h\n"
"smull v14.8h, v0.8b, v5.8b\n"
"sadalp v31.4s, v15.8h\n"
"smull v15.8h, v1.8b, v5.8b\n"
"smlal2 v12.8h, v0.16b, v4.16b\n"
"smlal2 v13.8h, v1.16b, v4.16b\n"
"smlal2 v14.8h, v0.16b, v5.16b\n"
"smlal2 v15.8h, v1.16b, v5.16b\n"
"bne " GEMMLOWP_LABEL_LOOP "b\n"
GEMMLOWP_LABEL_AFTER_LOOP
":\n"
// Load accumulators from memory
"ld1 {v8.16b}, [x0], #16\n"
"ld1 {v9.16b}, [x0], #16\n"
"ld1 {v10.16b}, [x0], #16\n"
"ld1 {v11.16b}, [x0], #16\n"
"mov x0, %[dst_ptr]\n"
// Do the remaining arithmetic for the last 16 levels of depth.
// All the operands are already loaded.
"sadalp v16.4s, v12.8h\n"
"smull v12.8h, v0.8b, v6.8b\n"
"sadalp v17.4s, v13.8h\n"
"smull v13.8h, v0.8b, v7.8b\n"
"sadalp v20.4s, v14.8h\n"
"smull v14.8h, v1.8b, v6.8b\n"
"sadalp v21.4s, v15.8h\n"
"smull v15.8h, v1.8b, v7.8b\n"
"smlal2 v12.8h, v0.16b, v6.16b\n"
"smlal2 v13.8h, v0.16b, v7.16b\n"
"smlal2 v14.8h, v1.16b, v6.16b\n"
"smlal2 v15.8h, v1.16b, v7.16b\n"
"sadalp v24.4s, v12.8h\n"
"smull v12.8h, v2.8b, v4.8b\n"
"sadalp v28.4s, v13.8h\n"
"smull v13.8h, v3.8b, v4.8b\n"
"sadalp v25.4s, v14.8h\n"
"smull v14.8h, v2.8b, v5.8b\n"
"sadalp v29.4s, v15.8h\n"
"smull v15.8h, v3.8b, v5.8b\n"
"smlal2 v12.8h, v2.16b, v4.16b\n"
"smlal2 v13.8h, v3.16b, v4.16b\n"
"smlal2 v14.8h, v2.16b, v5.16b\n"
"smlal2 v15.8h, v3.16b, v5.16b\n"
"sadalp v18.4s, v12.8h\n"
"smull v12.8h, v2.8b, v6.8b\n"
"sadalp v19.4s, v13.8h\n"
"smull v13.8h, v2.8b, v7.8b\n"
"sadalp v22.4s, v14.8h\n"
"smull v14.8h, v3.8b, v6.8b\n"
"sadalp v23.4s, v15.8h\n"
"smull v15.8h, v3.8b, v7.8b\n"
"smlal2 v12.8h, v2.16b, v6.16b\n"
"smlal2 v13.8h, v2.16b, v7.16b\n"
"smlal2 v14.8h, v3.16b, v6.16b\n"
"smlal2 v15.8h, v3.16b, v7.16b\n"
"sadalp v26.4s, v12.8h\n"
"sadalp v30.4s, v13.8h\n"
"sadalp v27.4s, v14.8h\n"
"sadalp v31.4s, v15.8h\n"
// Reduce aggregators horizontally
"addp v0.4s, v16.4s, v20.4s\n"
"addp v1.4s, v17.4s, v21.4s\n"
"addp v2.4s, v18.4s, v22.4s\n"
"addp v3.4s, v19.4s, v23.4s\n"
"addp v4.4s, v24.4s, v28.4s\n"
"addp v5.4s, v25.4s, v29.4s\n"
"addp v6.4s, v26.4s, v30.4s\n"
"addp v7.4s, v27.4s, v31.4s\n"
"addp v12.4s, v0.4s, v4.4s\n"
"addp v13.4s, v1.4s, v5.4s\n"
"addp v14.4s, v2.4s, v6.4s\n"
"addp v15.4s, v3.4s, v7.4s\n"
// Add to the accumulators loaded from memory
"add v8.4s, v8.4s, v12.4s\n"
"add v9.4s, v9.4s, v13.4s\n"
"add v10.4s, v10.4s, v14.4s\n"
"add v11.4s, v11.4s, v15.4s\n"
// Store accumulators back to memory
"st1 {v8.16b}, [x0], #16\n"
"st1 {v9.16b}, [x0], #16\n"
"st1 {v10.16b}, [x0], #16\n"
"st1 {v11.16b}, [x0], #16\n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth),
[dst_col_stride] "+r"(dst_col_stride)
: // inputs
[start_depth] "r"(start_depth)
: // clobbers
"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28", "v29", "v30", "v31");
}
};
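// A scalar sketch (ours, for illustration) of the trick above, for one Lhs
// row and one Rhs column over one 16-depth slice: smull/smlal2 sum exactly
// two products in each int16 lane, pairing depth d with depth d+8. Under the
// stated operand restriction each product has magnitude at most
// 128*127 = 16256, so the int16 pair-sum peaks at 32512 in magnitude and
// cannot overflow; sadalp then folds the int16 lanes into int32.
inline std::int32_t ReferenceAccumTwoWithin16Bits(const std::int8_t* lhs16,
                                                  const std::int8_t* rhs16) {
  std::int32_t accum = 0;
  for (int i = 0; i < 8; i++) {
    // smull + smlal2: lane i = product at depth i plus product at depth i+8,
    // provably within int16 range.
    const std::int16_t lane = std::int16_t(lhs16[i] * rhs16[i] +
                                           lhs16[i + 8] * rhs16[i + 8]);
    accum += lane;  // sadalp: widen the 16-bit pair-sum into 32 bits.
  }
  return accum;
}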
// Fast kernel operating on int8 operands with 7-bit range.
// It is assumed that one of the two operands only takes values in [-63, 63],
// while the other takes values in [-64, 63].
// With this restriction, it is possible to multiply-accumulate operands into
// a 16-bit integer eight times without overflow (a scalar sketch of this
// budget follows the kernel below).
struct NEON_64bit_GEMM_Int7Operands_AccumEightWithin16Bits {
typedef std::int8_t OperandType;
typedef std::int32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
KernelSideFormat<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
#define GEMMLOWP_LABEL_64_DEPTH_LOOP "1"
#define GEMMLOWP_LABEL_64_DEPTH_AFTER_LOOP "2"
#define GEMMLOWP_LABEL_16_DEPTH_LOOP "3"
#define GEMMLOWP_LABEL_16_DEPTH_AFTER_LOOP "4"
AccumulatorType* dst_ptr = accum_ptr;
asm volatile(
// Overview of register layout:
//
// A 4x16 block of Lhs is stored in 8 bit in v0--v7.
// A 2x16 block of Rhs is stored in 8 bit in v8--v15.
//
// A 4x2 block of global accumulators is stored in v24-v31 (as 4x32 bit
// components which need to be horizontally-added at the end).
//
// A 4x2 block of local accumulators is stored in v16-v23 (as 8x16 bit
// components which are added to the global accumulators every 64
// levels of depth).
//
// The Lhs vectors are multiplied by the Rhs vectors with a widening
// multiply over the 8 first levels of depth, producing int16x8
// vectors of products for each position in the accumulator matrix.
//
// Like the trick used in the fast 8-bit kernel, the operands are
// restricted to a 7-bit range, so each product has magnitude at most
// 64*63 = 2^12 - 2^6. This enables adding eight such products without
// any risk of overflowing int16, equating to 64 levels of depth before
// horizontally adding these int16x8 accumulators into the final int32x4
// accumulators.
//
// Register layout including both local and global accumulators.
// Since we do not have enough registers to store all Lhs values, we
// reuse the same registers v0--v7 to load the rest of the Lhs values.
//
// +-----+-----+
// | v8 | v9 |
// Rhs +-----+-----+
// | v10 | v11 |
// +-----+-----+
// | v12 | v13 |
// +-----+-----+
// | v14 | v15 |
// Lhs +-----+-----+
// +----+----+----+----+ - - +-----+-----+ +--------+--------+
// | v0 | v4 | v0 | v4 | | v16 | v20 | | v24.4s | v28.4s |
// | v1 | v5 | v1 | v5 | | v17 | v21 | -> | v25.4s | v29.4s |
// | v2 | v6 | v2 | v6 | | v18 | v22 | | v26.4s | v30.4s |
// | v3 | v7 | v3 | v7 | | v19 | v23 | | v27.4s | v31.4s |
// +----+----+----+----+ - - +-----+-----+ +--------+--------+
//
// Local Accumulator Global Accumulator
//
// Clear accumulators.
"dup v16.4s, wzr\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"dup v24.4s, wzr\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"dup v17.4s, wzr\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"dup v25.4s, wzr\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"dup v18.4s, wzr\n"
"ld1 {v8.16b}, [%[rhs_ptr]], #16\n"
"dup v26.4s, wzr\n"
"ld1 {v9.16b}, [%[rhs_ptr]], #16\n"
"dup v19.4s, wzr\n"
"dup v27.4s, wzr\n"
"dup v20.4s, wzr\n"
"dup v28.4s, wzr\n"
"dup v21.4s, wzr\n"
"dup v29.4s, wzr\n"
"dup v22.4s, wzr\n"
"dup v30.4s, wzr\n"
"dup v23.4s, wzr\n"
"dup v31.4s, wzr\n"
"cmp %w[depth], #64\n"
"blt " GEMMLOWP_LABEL_64_DEPTH_AFTER_LOOP "f\n"
//"loop_%=:\n"
GEMMLOWP_LABEL_64_DEPTH_LOOP
":\n"
"subs %w[depth], %w[depth], #64\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"sadalp v24.4s, v16.8h\n"
"smull v16.8h, v0.8b, v8.8b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"sadalp v25.4s, v17.8h\n"
"smull v17.8h, v1.8b, v8.8b\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"sadalp v26.4s, v18.8h\n"
"smull v18.8h, v2.8b, v8.8b\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
"sadalp v27.4s, v19.8h\n"
"smull v19.8h, v3.8b, v8.8b\n"
"ld1 {v10.16b}, [%[rhs_ptr]], #16\n"
"sadalp v28.4s, v20.8h\n"
"smull v20.8h, v0.8b, v9.8b\n"
"ld1 {v11.16b}, [%[rhs_ptr]], #16\n"
"sadalp v29.4s, v21.8h\n"
"smull v21.8h, v1.8b, v9.8b\n"
"ld1 {v12.16b}, [%[rhs_ptr]], #16\n"
"sadalp v30.4s, v22.8h\n"
"smull v22.8h, v2.8b, v9.8b\n"
"ld1 {v13.16b}, [%[rhs_ptr]], #16\n"
"sadalp v31.4s, v23.8h\n"
"smull v23.8h, v3.8b, v9.8b\n"
"cmp %w[depth], #64\n"
"smlal2 v16.8h, v0.16b, v8.16b\n"
"ld1 {v14.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v17.8h, v1.16b, v8.16b\n"
"ld1 {v15.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v18.8h, v2.16b, v8.16b\n"
"smlal2 v19.8h, v3.16b, v8.16b\n"
"smlal2 v20.8h, v0.16b, v9.16b\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v21.8h, v1.16b, v9.16b\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v22.8h, v2.16b, v9.16b\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v23.8h, v3.16b, v9.16b\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"smlal v16.8h, v4.8b, v10.8b\n"
"smlal v17.8h, v5.8b, v10.8b\n"
"smlal v18.8h, v6.8b, v10.8b\n"
"smlal v19.8h, v7.8b, v10.8b\n"
"smlal v20.8h, v4.8b, v11.8b\n"
"smlal v21.8h, v5.8b, v11.8b\n"
"smlal v22.8h, v6.8b, v11.8b\n"
"smlal v23.8h, v7.8b, v11.8b\n"
"smlal2 v16.8h, v4.16b, v10.16b\n"
"ld1 {v8.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v17.8h, v5.16b, v10.16b\n"
"ld1 {v9.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v18.8h, v6.16b, v10.16b\n"
"smlal2 v19.8h, v7.16b, v10.16b\n"
"smlal2 v20.8h, v4.16b, v11.16b\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v21.8h, v5.16b, v11.16b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v22.8h, v6.16b, v11.16b\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v23.8h, v7.16b, v11.16b\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
"smlal v16.8h, v0.8b, v12.8b\n"
"smlal v17.8h, v1.8b, v12.8b\n"
"smlal v18.8h, v2.8b, v12.8b\n"
"smlal v19.8h, v3.8b, v12.8b\n"
"smlal v20.8h, v0.8b, v13.8b\n"
"smlal v21.8h, v1.8b, v13.8b\n"
"smlal v22.8h, v2.8b, v13.8b\n"
"smlal v23.8h, v3.8b, v13.8b\n"
"smlal2 v16.8h, v0.16b, v12.16b\n"
"smlal2 v17.8h, v1.16b, v12.16b\n"
"smlal2 v18.8h, v2.16b, v12.16b\n"
"smlal2 v19.8h, v3.16b, v12.16b\n"
"smlal2 v20.8h, v0.16b, v13.16b\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v21.8h, v1.16b, v13.16b\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v22.8h, v2.16b, v13.16b\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v23.8h, v3.16b, v13.16b\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"smlal v16.8h, v4.8b, v14.8b\n"
"smlal v17.8h, v5.8b, v14.8b\n"
"smlal v18.8h, v6.8b, v14.8b\n"
"smlal v19.8h, v7.8b, v14.8b\n"
"smlal v20.8h, v4.8b, v15.8b\n"
"smlal v21.8h, v5.8b, v15.8b\n"
"smlal v22.8h, v6.8b, v15.8b\n"
"smlal v23.8h, v7.8b, v15.8b\n"
"smlal2 v16.8h, v4.16b, v14.16b\n"
"smlal2 v17.8h, v5.16b, v14.16b\n"
"smlal2 v18.8h, v6.16b, v14.16b\n"
"smlal2 v19.8h, v7.16b, v14.16b\n"
"smlal2 v20.8h, v4.16b, v15.16b\n"
"smlal2 v21.8h, v5.16b, v15.16b\n"
"smlal2 v22.8h, v6.16b, v15.16b\n"
"smlal2 v23.8h, v7.16b, v15.16b\n"
"bge " GEMMLOWP_LABEL_64_DEPTH_LOOP "b\n"
GEMMLOWP_LABEL_64_DEPTH_AFTER_LOOP
":\n"
"cmp %w[depth], #16\n"
"blt " GEMMLOWP_LABEL_16_DEPTH_AFTER_LOOP "f\n"
//"loop_%=:\n"
GEMMLOWP_LABEL_16_DEPTH_LOOP
":\n"
"sadalp v24.4s, v16.8h\n"
"smull v16.8h, v0.8b, v8.8b\n"
"subs %w[depth], %w[depth], #16\n"
"sadalp v25.4s, v17.8h\n"
"smull v17.8h, v1.8b, v8.8b\n"
"sadalp v26.4s, v18.8h\n"
"smull v18.8h, v2.8b, v8.8b\n"
"sadalp v27.4s, v19.8h\n"
"smull v19.8h, v3.8b, v8.8b\n"
"sadalp v28.4s, v20.8h\n"
"smull v20.8h, v0.8b, v9.8b\n"
"sadalp v29.4s, v21.8h\n"
"smull v21.8h, v1.8b, v9.8b\n"
"sadalp v30.4s, v22.8h\n"
"smull v22.8h, v2.8b, v9.8b\n"
"sadalp v31.4s, v23.8h\n"
"smull v23.8h, v3.8b, v9.8b\n"
"cmp %w[depth], #16\n"
"smlal2 v16.8h, v0.16b, v8.16b\n"
"smlal2 v17.8h, v1.16b, v8.16b\n"
"smlal2 v18.8h, v2.16b, v8.16b\n"
"smlal2 v19.8h, v3.16b, v8.16b\n"
"ld1 {v8.16b}, [%[rhs_ptr]], #16\n"
"smlal2 v20.8h, v0.16b, v9.16b\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v21.8h, v1.16b, v9.16b\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v22.8h, v2.16b, v9.16b\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"smlal2 v23.8h, v3.16b, v9.16b\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v9.16b}, [%[rhs_ptr]], #16\n"
"bge " GEMMLOWP_LABEL_16_DEPTH_LOOP "b\n"
GEMMLOWP_LABEL_16_DEPTH_AFTER_LOOP
":\n"
"sadalp v24.4s, v16.8h\n"
"sadalp v25.4s, v17.8h\n"
"sadalp v26.4s, v18.8h\n"
"sadalp v27.4s, v19.8h\n"
"sadalp v28.4s, v20.8h\n"
"sadalp v29.4s, v21.8h\n"
"sadalp v30.4s, v22.8h\n"
"sadalp v31.4s, v23.8h\n"
// Reduce aggregators horizontally.
"addp v0.4s, v24.4s, v25.4s\n"
"addp v1.4s, v26.4s, v27.4s\n"
"addp v2.4s, v28.4s, v29.4s\n"
"addp v3.4s, v30.4s, v31.4s\n"
"addp v4.4s, v0.4s, v1.4s\n"
"addp v5.4s, v2.4s, v3.4s\n"
// Load accumulators from memory.
"mov x0, %[dst_ptr]\n"
"ld1 {v6.16b}, [x0], #16\n"
"ld1 {v7.16b}, [x0], #16\n"
// Add to the accumulators loaded from memory.
"add v6.4s, v6.4s, v4.4s\n"
"add v7.4s, v7.4s, v5.4s\n"
// Store accumulators back to memory.
"mov x0, %[dst_ptr]\n"
"st1 {v6.16b}, [x0], #16\n"
"st1 {v7.16b}, [x0], #16\n"
:
// Outputs.
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_ptr] "+r"(dst_ptr), [depth] "+r"(depth)
:
// Inputs.
:
// Clobbers.
"cc", "memory",
// We use these NEON registers
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31", "x0");
}
};
SET_7BIT_RANGES(NEON_64bit_GEMM_Int7Operands_AccumEightWithin16Bits);
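// A scalar sketch (ours, for illustration) of the 7-bit budget above, for one
// Lhs row and one Rhs column over one 64-depth slice: each int16 lane i
// accumulates the eight products at depths i, i+8, ..., i+56 via one smull
// plus seven smlal/smlal2. With products of magnitude at most 64*63 = 4032,
// the lane peaks at 8*4032 = 32256 < 32767, so no overflow is possible;
// sadalp then folds the lane into int32.
inline std::int32_t ReferenceAccumEightWithin16Bits(const std::int8_t* lhs64,
                                                    const std::int8_t* rhs64) {
  std::int32_t accum = 0;
  for (int i = 0; i < 8; i++) {
    std::int16_t lane = 0;
    for (int k = 0; k < 8; k++) {
      // Eight products per int16 lane, overflow-free under the 7-bit bound.
      lane = std::int16_t(lane + lhs64[i + 8 * k] * rhs64[i + 8 * k]);
    }
    accum += lane;  // sadalp at the end of the 64-depth block.
  }
  return accum;
}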
// Kernel operating on int8 operands with 4.25-bit range.
// It is assumed that one of the two operands only takes values in [-7, 7],
// while the other takes values in [-9, 9].
// With this restriction, it is possible to multiply-accumulate operands into
// a 16-bit integer thirty-two times without overflow (a scalar sketch of
// this budget follows the kernel below).
struct NEON_64bit_GEMM_Int425Operands {
typedef std::int8_t OperandType;
typedef std::int32_t AccumulatorType;
typedef KernelFormat<
KernelSideFormat<CellFormat<4, 32, CellOrder::WidthMajor>, 1>,
KernelSideFormat<CellFormat<2, 32, CellOrder::WidthMajor>, 1> >
Format;
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
AccumulatorType* accum_ptr, int depth) {
#define GEMMLOWP_LABEL_512_DEPTH_LOOP "1"
#define GEMMLOWP_LABEL_32_DEPTH_LOOP "2"
#define GEMMLOWP_LABEL_32_DEPTH_AFTER_LOOP "3"
AccumulatorType* dst_ptr = accum_ptr;
int outer_depth = depth / 512 + 1;
asm volatile(
// Overview of register layout:
//
// A 4x32 block of Lhs is stored in 8 bit in v0--v7.
// A 2x32 block of Rhs is stored in 8 bit in v8--v11.
//
// A 4x2 block of global accumulators is stored in v24-v31 (as 4x32 bit
// components which need to be horizontally-added at the end).
//
// A 4x2 block of local accumulators is stored in v16-v23 (as 8x16 bit
// components which are horizontally-added into the global accumulators
// every 512 levels of depth).
//
// The Lhs vectors are multiplied by the Rhs vectors with a multiply
// over the 16 first levels of depth, producing int8x16 vectors of
// products for each position in the accumulator matrix.
//
// Like the trick used in the fast 8-bit and 7-bit kernels, the operands
// are restricted to a 4.25-bit range, [-7, 7] for one operand and
// [-9, 9] for the other, so each product has magnitude at most 63. This
// enables adding two such products without any risk of overflowing int8,
// and then performing thirty-two pairwise accumulations of such bytes
// into int16 without overflow. This equates to 512 levels of depth
// before horizontally adding these int16x8 accumulators into the final
// int32x4 accumulators.
//
// Register layout (ignoring the v12--v15 temporary 8-bit accumulators).
// Since we do not have enough registers to store all the Lhs and Rhs
// values, we reuse the same registers v0--v7 to load subsequent Lhs
// values and v8--v11 to load subsequent Rhs values.
//
// +-----+-----+
// | v8 | v9 |
// Rhs +-----+-----+
// | v10 | v11 |
// +-----+-----+
// | v8 | v9 |
// +-----+-----+
// | v10 | v11 |
// Lhs +-----+-----+
// +----+----+----+----+ - - +-----+-----+ +--------+--------+
// | v0 | v4 | v0 | v4 | | v16 | v17 | | v24.4s | v25.4s |
// | v1 | v5 | v1 | v5 | | v18 | v19 | -> | v26.4s | v27.4s |
// | v2 | v6 | v2 | v6 | | v20 | v21 | | v28.4s | v29.4s |
// | v3 | v7 | v3 | v7 | | v22 | v23 | | v30.4s | v31.4s |
// +----+----+----+----+ - - +-----+-----+ +--------+--------+
//
// Local Accumulator Global Accumulator
//
// Clear global accumulators.
"dup v24.4s, wzr\n"
"ld1 {v8.16b}, [%[rhs_ptr]], #16\n"
"dup v25.4s, wzr\n"
"ld1 {v9.16b}, [%[rhs_ptr]], #16\n"
"dup v26.4s, wzr\n"
"ld1 {v10.16b}, [%[rhs_ptr]], #16\n"
"dup v27.4s, wzr\n"
"ld1 {v11.16b}, [%[rhs_ptr]], #16\n"
"dup v28.4s, wzr\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"dup v29.4s, wzr\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"dup v30.4s, wzr\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"dup v31.4s, wzr\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
//"loop_%=:\n"
GEMMLOWP_LABEL_512_DEPTH_LOOP
":\n"
// Clear local accumulators.
"dup v16.8h, wzr\n"
"dup v17.8h, wzr\n"
"dup v18.8h, wzr\n"
"mov x1, #512\n"
"dup v19.8h, wzr\n"
"dup v20.8h, wzr\n"
"dup v21.8h, wzr\n"
"dup v22.8h, wzr\n"
"dup v23.8h, wzr\n"
//"loop_%=:\n"
GEMMLOWP_LABEL_32_DEPTH_LOOP
":\n"
"mul v12.16b, v0.16b, v8.16b\n"
"mul v13.16b, v0.16b, v10.16b\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"mul v14.16b, v2.16b, v8.16b\n"
"mul v15.16b, v2.16b, v10.16b\n"
"mla v12.16b, v1.16b, v9.16b\n"
"mla v13.16b, v1.16b, v11.16b\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"mla v14.16b, v3.16b, v9.16b\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"mla v15.16b, v3.16b, v11.16b\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"sadalp v16.8h, v12.16b\n"
"sadalp v17.8h, v13.16b\n"
"subs %w[depth], %w[depth], #32\n"
"sadalp v18.8h, v14.16b\n"
"sadalp v19.8h, v15.16b\n"
"subs x1, x1, #32\n"
"mul v12.16b, v4.16b, v8.16b\n"
"mul v13.16b, v4.16b, v10.16b\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"mul v14.16b, v6.16b, v8.16b\n"
"ld1 {v8.16b}, [%[rhs_ptr]], #16\n"
"mul v15.16b, v6.16b, v10.16b\n"
"mla v12.16b, v5.16b, v9.16b\n"
"mla v13.16b, v5.16b, v11.16b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"mla v14.16b, v7.16b, v9.16b\n"
"ld1 {v9.16b}, [%[rhs_ptr]], #16\n"
"mla v15.16b, v7.16b, v11.16b\n"
"ld1 {v10.16b}, [%[rhs_ptr]], #16\n"
"sadalp v20.8h, v12.16b\n"
"ld1 {v11.16b}, [%[rhs_ptr]], #16\n"
"sadalp v21.8h, v13.16b\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"sadalp v22.8h, v14.16b\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
"sadalp v23.8h, v15.16b\n"
"mul v12.16b, v0.16b, v8.16b\n"
"mul v13.16b, v0.16b, v10.16b\n"
"ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
"mul v14.16b, v2.16b, v8.16b\n"
"mul v15.16b, v2.16b, v10.16b\n"
"mla v12.16b, v1.16b, v9.16b\n"
"mla v13.16b, v1.16b, v11.16b\n"
"ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
"mla v14.16b, v3.16b, v9.16b\n"
"ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
"mla v15.16b, v3.16b, v11.16b\n"
"ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
"sadalp v16.8h, v12.16b\n"
"sadalp v17.8h, v13.16b\n"
"sadalp v18.8h, v14.16b\n"
"sadalp v19.8h, v15.16b\n"
"mul v12.16b, v4.16b, v8.16b\n"
"mul v13.16b, v4.16b, v10.16b\n"
"ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
"mul v14.16b, v6.16b, v8.16b\n"
"ld1 {v8.16b}, [%[rhs_ptr]], #16\n"
"mul v15.16b, v6.16b, v10.16b\n"
"mla v12.16b, v5.16b, v9.16b\n"
"mla v13.16b, v5.16b, v11.16b\n"
"ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
"mla v14.16b, v7.16b, v9.16b\n"
"ld1 {v9.16b}, [%[rhs_ptr]], #16\n"
"mla v15.16b, v7.16b, v11.16b\n"
"ld1 {v10.16b}, [%[rhs_ptr]], #16\n"
"sadalp v20.8h, v12.16b\n"
"ld1 {v11.16b}, [%[rhs_ptr]], #16\n"
"sadalp v21.8h, v13.16b\n"
"ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
"sadalp v22.8h, v14.16b\n"
"ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
"sadalp v23.8h, v15.16b\n"
"beq " GEMMLOWP_LABEL_32_DEPTH_AFTER_LOOP
"f\n"
"cmp %w[depth], #0\n"
"bne " GEMMLOWP_LABEL_32_DEPTH_LOOP "b\n"
GEMMLOWP_LABEL_32_DEPTH_AFTER_LOOP
":\n"
// Pairwise add 16-bit local accums to 32-bit global accums.
"sadalp v24.4s, v16.8h\n"
"sadalp v25.4s, v17.8h\n"
"sadalp v26.4s, v18.8h\n"
"sadalp v27.4s, v19.8h\n"
"sadalp v28.4s, v20.8h\n"
"sadalp v29.4s, v21.8h\n"
"sadalp v30.4s, v22.8h\n"
"sadalp v31.4s, v23.8h\n"
"bne " GEMMLOWP_LABEL_512_DEPTH_LOOP
"b\n"
// Reduce aggregators horizontally.
"addp v0.4s, v24.4s, v26.4s\n"
"addp v1.4s, v28.4s, v30.4s\n"
"addp v2.4s, v25.4s, v27.4s\n"
"addp v3.4s, v29.4s, v31.4s\n"
"addp v4.4s, v0.4s, v1.4s\n"
"addp v5.4s, v2.4s, v3.4s\n"
// Load accumulators from memory.
"mov x0, %[dst_ptr]\n"
"ld1 {v6.16b}, [x0], #16\n"
"ld1 {v7.16b}, [x0], #16\n"
// Add to the accumulators loaded from memory.
"add v6.4s, v6.4s, v4.4s\n"
"add v7.4s, v7.4s, v5.4s\n"
// Store accumulators back to memory.
"mov x0, %[dst_ptr]\n"
"st1 {v6.16b}, [x0], #16\n"
"st1 {v7.16b}, [x0], #16\n"
:
// Outputs.
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[dst_ptr] "+r"(dst_ptr), [depth] "+r"(depth),
[outer_depth] "+r"(outer_depth)
:
// Inputs.
:
// Clobbers.
"cc", "memory",
// We use these NEON registers
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31", "x0", "x1");
}
};
SET_425BIT_RANGES(NEON_64bit_GEMM_Int425Operands);
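// A scalar sketch (ours, for illustration) of the 4.25-bit budget above:
// with |lhs| <= 7 and |rhs| <= 9, each product is at most 63 in magnitude,
// so mul+mla can hold two products in one int8 byte (126 <= 127), and an
// int16 lane can then absorb thirty-two pairwise accumulations of two such
// bytes (32 * 2 * 126 = 8064 <= 32767) before being folded into int32. This
// models one int16 lane's share of the depth, not the full vector kernel.
inline std::int32_t ReferenceAccumInt425(const std::int8_t* lhs,
                                         const std::int8_t* rhs) {
  std::int32_t global = 0;
  std::int16_t lane = 0;                   // one int16 lane's budget
  for (int step = 0; step < 32; step++) {  // 32 sadalp events per lane
    // mul + mla: two products per byte, |sum| <= 126, no int8 overflow.
    const std::int8_t byte0 =
        std::int8_t(lhs[4 * step + 0] * rhs[4 * step + 0] +
                    lhs[4 * step + 1] * rhs[4 * step + 1]);
    const std::int8_t byte1 =
        std::int8_t(lhs[4 * step + 2] * rhs[4 * step + 2] +
                    lhs[4 * step + 3] * rhs[4 * step + 3]);
    // sadalp: an adjacent byte pair, widened and added into the int16 lane.
    lane = std::int16_t(lane + byte0 + byte1);
  }
  global += lane;  // sadalp into the 32-bit global accumulator
  return global;
}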
#ifdef __ARM_FEATURE_DOTPROD
// Kernels utilizing the Armv8.2 Dot Product extension.
//
// The dot product instructions work by taking 4 consecutive 8-bit depth
// values from each operand, multiplying the 4 pairs together and
// accumulating all the results into the corresponding 32-bit accumulator
// lane. As such, the operation is identical to a 32-bit instruction (like
// FMLA used in SGEMM), except that 4 depth values are processed at a time
// instead of 1.
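// As a scalar model (ours, purely illustrative) of one dot product
// instruction lane:
inline std::uint32_t UdotLaneModel(std::uint32_t acc, const std::uint8_t* a,
                                   const std::uint8_t* b) {
  // udot: four 8-bit products summed into one 32-bit lane in one operation.
  for (int k = 0; k < 4; k++) {
    acc += std::uint32_t(a[k]) * std::uint32_t(b[k]);
  }
  return acc;
}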
// Thus, this first kernel is a carbon copy of