Skip to content

Commit

Permalink
Scalar RR1-P6-DIV evaluation stub for F32 TanH
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 497033716
  • Loading branch information
Maratyszcza authored and xnnpack-bot committed Dec 22, 2022
1 parent b8ba4d9 commit 51a9875
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 0 deletions.
1 change: 1 addition & 0 deletions cmake/microkernels.cmake
Expand Up @@ -3918,6 +3918,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/math/sqrt-u64-scalar-cvtu32-sqrt-cvtsatu32f64.c
src/math/sqrt-u64-scalar-cvtu32-sqrt-llrint.c
src/math/sqrt-u64-scalar-cvtu64-sqrt-llrint.c
src/math/tanh-f32-scalar-rr1-p6-div.c
src/math/tanh-f32-scalar-rr2-p6-div.c
src/qc8-dwconv/gen/qc8-dwconv-3p1c-minmax-fp32-scalar-fmagic.c
src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-imagic.c
Expand Down
4 changes: 4 additions & 0 deletions eval/f32-tanh-ulp.cc
Expand Up @@ -138,6 +138,10 @@ static void TanhError(benchmark::State& state,
->Iterations(1);
#endif // XNN_ARCH_ARM64

BENCHMARK_CAPTURE(TanhError, scalar_rr1_p6_div,
xnn_math_f32_tanh__scalar_rr1_p6_div)
->Unit(benchmark::kMillisecond)
->Iterations(1);
BENCHMARK_CAPTURE(TanhError, scalar_rr2_p6_div,
xnn_math_f32_tanh__scalar_rr2_p6_div)
->Unit(benchmark::kMillisecond)
Expand Down
1 change: 1 addition & 0 deletions google3/third_party/XNNPACK/microkernels.bzl
Expand Up @@ -3935,6 +3935,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/math/sqrt-u64-scalar-cvtu32-sqrt-cvtsatu32f64.c",
"src/math/sqrt-u64-scalar-cvtu32-sqrt-llrint.c",
"src/math/sqrt-u64-scalar-cvtu64-sqrt-llrint.c",
"src/math/tanh-f32-scalar-rr1-p6-div.c",
"src/math/tanh-f32-scalar-rr2-p6-div.c",
"src/qc8-dwconv/gen/qc8-dwconv-3p1c-minmax-fp32-scalar-fmagic.c",
"src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-imagic.c",
Expand Down
105 changes: 105 additions & 0 deletions src/math/tanh-f32-scalar-rr1-p6-div.c
@@ -0,0 +1,105 @@
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <math.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/math-stubs.h>


void xnn_math_f32_tanh__scalar_rr1_p6_div(
size_t n,
const float* input,
float* output)
{
assert(n % sizeof(float) == 0);

// Large number such that ulp(magic bias) == 0.5 and magic bias === 63.5 mod 2**21.
const float vmagic_bias = 0x1.8000FEp+22f;
const float vminus_log2e = -0x1.715476p+0f;
const float vln2 = 0x1.62E430p-1f;
// Coefficient of polynomial approximation
// exp(-2t) - 1 ~ -2 * (t * (1 + t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))))
// on [-log(2)/2, log(2)/2]
const float vc6 = -0x1.6b7338p-5f;
const float vc5 = 0x1.12278Ep-3f;
const float vc4 = -0x1.555716p-2f;
const float vc3 = 0x1.5554B0p-1f;
const float vc2 = -0x1.FFFFFEp-1f;
const float vone = 1.0f;
const float vtwo = 2.0f;
// The largest z for which tanhf(-z) is not saturated at -1.0f.
const float vsat_cutoff = 0x1.205966p+3f;

for (; n != 0; n -= sizeof(float)) {
const float vx = *input++;

// General structure of the algorithm:
//
// / expm1(2x) / (2 + expm1(2x)) if x <= 0
// f[x] :=
// \ -f[-x] if x >= 0
//
// First we compute f[-z] := expm1(-2z) / (2 + expm1(-2z)) where z = abs(x),
// then replace result with -f[-z] if x >= 0.
const float vz = fabsf(vx);

// Compute reduced argument n := round(-z / log(2), 1).
// We do it by adding a large number (magic bias), which cause rounding of the result to integer, then subtracing
// the large number back. The trick with adding large number is valid only within certain bounds
// (|-z / log(2)| <= 2**21, i.e. |z| <= 0x1.62E43p+20 = 1453635.0), but that is acceptable, because inputs x
// outside of [-9.010913, 9.010913] (i.e. z outsize [0, 9.010913]) saturate tanhf(x). We fixup the result for such
// inputs at the very end of the algorithm.
// Note that addition-subtraction of the large number doesn't cause overflow for inputs in this range.
float vn = vz * vminus_log2e + vmagic_bias;

// Create a floating-point number s (scale) such that s == 2**(2n) for inputs which don't cause underflow, i.e.
// 0 <= z <= 9.010913, and -13 <= n <= 0 accordingly.
const float vs = uint32_as_float(float_as_uint32(vn) << 23);

// Subtract the large number back to get final n := round(-z / log(2), 1) as a floating-point number.
vn -= vmagic_bias;

// Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
float vt = vn * vln2 + vz;

// Compute degree-6 polynomial approximation for exp(-2t) - 1 on [-log(2)/4, log(2)/4].
// P(-2t) = t * (1 + t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
// = t + t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
// = -2 * (t + t * p)
float vp = vc6 * vt + vc5;
vp = vp * vt + vc4;
vp = vp * vt + vc3;
vp = vp * vt + vc2;
vp *= vt;

// Reconstruct the exp(x) - 1 value:
// exp(x) - 1 = s * (1 - 2t * (1 + t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))) - 1
// = (s - 1) + s * (-2t) * (t + t * p)
// = (s - 1) - 2 * ((t * s) + (t * s) * p)
vt *= vs;
const float vsm1 = vs - vone;
vp = vp * vt + vt;
const float vem1 = vsm1 - vtwo * vp;

// Reconstruct tanh(-z) := expm1(-2z) / (2 + expm1(-2z))
const float vep1 = vtwo + vem1;
float vabsy = vem1 / vep1;

// The function saturates at +-1 for large inputs: tanhf(z) == +-1.0f for z > sat_cutoff ~= 9.010913.
// Note that we use 1.0f, because sign will be copied from the input right after.
if XNN_UNPREDICTABLE(vz > vsat_cutoff) {
vabsy = vone;
}

// Reconstruct tanh[x] = sign(x) * tanh[-abs(x)]
const float vy = copysignf(vabsy, vx);

*output++ = vy;
}
}
1 change: 1 addition & 0 deletions src/xnnpack/math-stubs.h
Expand Up @@ -376,6 +376,7 @@ DECLARE_U64_UNARY_MATH_FUNCTION(xnn_math_u64_sqrt__scalar_cvtu32_sqrt_llrint)
DECLARE_U64_UNARY_MATH_FUNCTION(xnn_math_u64_sqrt__scalar_cvtu64_sqrt_llrint)

DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_tanh__aarch64_neonfma_rr1_p6_div)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_tanh__scalar_rr1_p6_div)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_tanh__scalar_rr2_p6_div)

#ifdef __cplusplus
Expand Down

0 comments on commit 51a9875

Please sign in to comment.