Scalar RR1-P6-DIV evaluation stub for F32 TanH

PiperOrigin-RevId: 497033716
google · Dec 22, 2022 · 51a9875 · 51a9875
1 parent b8ba4d9
commit 51a9875
Show file tree

Hide file tree

Showing 5 changed files with 112 additions and 0 deletions.
diff --git a/cmake/microkernels.cmake b/cmake/microkernels.cmake
@@ -3918,6 +3918,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
   src/math/sqrt-u64-scalar-cvtu32-sqrt-cvtsatu32f64.c
   src/math/sqrt-u64-scalar-cvtu32-sqrt-llrint.c
   src/math/sqrt-u64-scalar-cvtu64-sqrt-llrint.c
+  src/math/tanh-f32-scalar-rr1-p6-div.c
   src/math/tanh-f32-scalar-rr2-p6-div.c
   src/qc8-dwconv/gen/qc8-dwconv-3p1c-minmax-fp32-scalar-fmagic.c
   src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-imagic.c

diff --git a/eval/f32-tanh-ulp.cc b/eval/f32-tanh-ulp.cc
@@ -138,6 +138,10 @@ static void TanhError(benchmark::State& state,
     ->Iterations(1);
 #endif  // XNN_ARCH_ARM64
 
+BENCHMARK_CAPTURE(TanhError, scalar_rr1_p6_div,
+                  xnn_math_f32_tanh__scalar_rr1_p6_div)
+  ->Unit(benchmark::kMillisecond)
+  ->Iterations(1);
 BENCHMARK_CAPTURE(TanhError, scalar_rr2_p6_div,
                   xnn_math_f32_tanh__scalar_rr2_p6_div)
   ->Unit(benchmark::kMillisecond)

diff --git a/google3/third_party/XNNPACK/microkernels.bzl b/google3/third_party/XNNPACK/microkernels.bzl
@@ -3935,6 +3935,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
     "src/math/sqrt-u64-scalar-cvtu32-sqrt-cvtsatu32f64.c",
     "src/math/sqrt-u64-scalar-cvtu32-sqrt-llrint.c",
     "src/math/sqrt-u64-scalar-cvtu64-sqrt-llrint.c",
+    "src/math/tanh-f32-scalar-rr1-p6-div.c",
     "src/math/tanh-f32-scalar-rr2-p6-div.c",
     "src/qc8-dwconv/gen/qc8-dwconv-3p1c-minmax-fp32-scalar-fmagic.c",
     "src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-imagic.c",

diff --git a/src/math/tanh-f32-scalar-rr1-p6-div.c b/src/math/tanh-f32-scalar-rr1-p6-div.c
@@ -0,0 +1,105 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <math.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_tanh__scalar_rr1_p6_div(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % sizeof(float) == 0);
+
+  // Large number such that ulp(magic bias) == 0.5 and magic bias === 63.5 mod 2**21.
+  const float vmagic_bias = 0x1.8000FEp+22f;
+  const float vminus_log2e = -0x1.715476p+0f;
+  const float vln2 = 0x1.62E430p-1f;
+  // Coefficient of polynomial approximation
+  //   exp(-2t) - 1 ~ -2 * (t * (1 + t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))))
+  // on [-log(2)/2, log(2)/2]
+  const float vc6 = -0x1.6b7338p-5f;
+  const float vc5 = 0x1.12278Ep-3f;
+  const float vc4 = -0x1.555716p-2f;
+  const float vc3 = 0x1.5554B0p-1f;
+  const float vc2 = -0x1.FFFFFEp-1f;
+  const float vone = 1.0f;
+  const float vtwo = 2.0f;
+  // The largest z for which tanhf(-z) is not saturated at -1.0f.
+  const float vsat_cutoff = 0x1.205966p+3f;
+
+  for (; n != 0; n -= sizeof(float)) {
+    const float vx = *input++;
+
+    // General structure of the algorithm:
+    //
+    //           / expm1(2x) / (2 + expm1(2x)) if x <= 0
+    //   f[x] :=
+    //           \ -f[-x] if x >= 0
+    //
+    // First we compute f[-z] := expm1(-2z) / (2 + expm1(-2z)) where z = abs(x),
+    // then replace result with -f[-z] if x >= 0.
+    const float vz = fabsf(vx);
+
+    // Compute reduced argument n := round(-z / log(2), 1).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to integer, then subtracing
+    // the large number back. The trick with adding large number is valid only within certain bounds
+    // (|-z / log(2)| <= 2**21, i.e. |z| <= 0x1.62E43p+20 = 1453635.0), but that is acceptable, because inputs x
+    // outside of [-9.010913, 9.010913] (i.e. z outsize [0, 9.010913]) saturate tanhf(x). We fixup the result for such
+    // inputs at the very end of the algorithm.
+    // Note that addition-subtraction of the large number doesn't cause overflow for inputs in this range.
+    float vn = vz * vminus_log2e + vmagic_bias;
+
+    // Create a floating-point number s (scale) such that s == 2**(2n) for inputs which don't cause underflow, i.e.
+    // 0 <= z <= 9.010913, and -13 <= n <= 0 accordingly.
+    const float vs = uint32_as_float(float_as_uint32(vn) << 23);
+
+    // Subtract the large number back to get final n := round(-z / log(2), 1) as a floating-point number.
+    vn -= vmagic_bias;
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    float vt = vn * vln2 + vz;
+
+    // Compute degree-6 polynomial approximation for exp(-2t) - 1 on [-log(2)/4, log(2)/4].
+    //   P(-2t) = t * (1 + t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    //          = t + t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    //          = -2 * (t + t * p)
+    float vp = vc6 * vt + vc5;
+    vp = vp * vt + vc4;
+    vp = vp * vt + vc3;
+    vp = vp * vt + vc2;
+    vp *= vt;
+
+    // Reconstruct the exp(x) - 1 value:
+    //   exp(x) - 1 = s * (1 - 2t * (1 + t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))) - 1
+    //              = (s - 1) + s * (-2t) * (t + t * p)
+    //              = (s - 1) - 2 * ((t * s) + (t * s) * p)
+    vt *= vs;
+    const float vsm1 = vs - vone;
+    vp = vp * vt + vt;
+    const float vem1 = vsm1 - vtwo * vp;
+
+    // Reconstruct tanh(-z) := expm1(-2z) / (2 + expm1(-2z))
+    const float vep1 = vtwo + vem1;
+    float vabsy = vem1 / vep1;
+
+    // The function saturates at +-1 for large inputs: tanhf(z) == +-1.0f for z > sat_cutoff ~= 9.010913.
+    // Note that we use 1.0f, because sign will be copied from the input right after.
+    if XNN_UNPREDICTABLE(vz > vsat_cutoff) {
+      vabsy = vone;
+    }
+
+    // Reconstruct tanh[x] = sign(x) * tanh[-abs(x)]
+    const float vy = copysignf(vabsy, vx);
+
+    *output++ = vy;
+  }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
@@ -376,6 +376,7 @@ DECLARE_U64_UNARY_MATH_FUNCTION(xnn_math_u64_sqrt__scalar_cvtu32_sqrt_llrint)
 DECLARE_U64_UNARY_MATH_FUNCTION(xnn_math_u64_sqrt__scalar_cvtu64_sqrt_llrint)
 
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_tanh__aarch64_neonfma_rr1_p6_div)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_tanh__scalar_rr1_p6_div)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_tanh__scalar_rr2_p6_div)
 
 #ifdef __cplusplus