Skip to content

Commit

Permalink
Add SSE4 specialization of output.h
Browse files Browse the repository at this point in the history
  • Loading branch information
mgouicem committed Oct 13, 2016
1 parent 9b3458b commit 3b7c743
Show file tree
Hide file tree
Showing 3 changed files with 307 additions and 6 deletions.
11 changes: 5 additions & 6 deletions internal/fixedpoint.h
Expand Up @@ -509,14 +509,14 @@ FixedPoint<tRawType, 0> one_minus_x_over_one_plus_x_for_x_in_0_1(
FixedPoint<tRawType, 0> a) {
typedef FixedPoint<tRawType, 0> F0;
typedef FixedPoint<tRawType, 2> F2;
/* Computes an approximation of the 1/(1+a) using Newton-Raphson division */
/* half_denominator= 1 + a */
/* Computes an approximation of the 2/(1+a) using Newton-Raphson division */
/* half_denominator= (1 + a)/2 */
F0 half_denominator = RoundingHalfSum(a, F0::One());
const F2 constant_48_over_17 =
GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F2, 1515870810, 48.0 / 17.0);
const F2 constant_neg_32_over_17 =
GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F2, -1010580540, -32.0 / 17.0);
/* First approx of 1/(1+a) */
/* First approx of 2/(1+a) */
F2 x = constant_48_over_17 + half_denominator * constant_neg_32_over_17;
/* Newton iteration x_{i+1} = x_i + x_i*(1 - half_denominator * x_i) */
for (int i = 0; i < 3; i++) {
Expand All @@ -525,9 +525,8 @@ FixedPoint<tRawType, 0> one_minus_x_over_one_plus_x_for_x_in_0_1(
F2::One() - half_denominator_times_x;
x = x + Rescale<2>(x * one_minus_half_denominator_times_x);
}
/* x ~= 1/(1+a) */
/* return x - 1 ~= - a/(1+a) */
/* problem: (1-a)/(1+a) = 1/(1+a) - a/(1+a) */
/* x ~= 2/(1+a) */
/* return x - 1 ~= (1-a)/(1+a) */
return Rescale<0>(x - F2::One());
}

Expand Down
5 changes: 5 additions & 0 deletions internal/fixedpoint_sse.h
Expand Up @@ -48,6 +48,11 @@ inline __m128i Add(__m128i a, __m128i b) {
return _mm_add_epi32(a, b);
}

template <>
inline __m128i Mul(__m128i lhs, __m128i rhs) {
  // Per-lane 32-bit multiply, keeping the low 32 bits of each 64-bit
  // product (SSE4.1 pmulld).
  return _mm_mullo_epi32(lhs, rhs);
}

template <>
inline __m128i Sub(__m128i a, __m128i b) {
return _mm_sub_epi32(a, b);
Expand Down
297 changes: 297 additions & 0 deletions internal/output_sse.h
@@ -0,0 +1,297 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_sse.h: optimized SSE4.1 specializations of the templates in output.h.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_SSE4_H_
#define GEMMLOWP_INTERNAL_OUTPUT_SSE4_H_

#include <smmintrin.h>
#include "output.h"
#include "fixedpoint.h"

namespace gemmlowp {

// Definitions of Fragment types wrapping SSE4.2 vector types.
typedef Fragment<__m128i, 4, 1, MapOrder::ColMajor> SSE4FragmentInt32x4x1;
// Bundle of four __m128i vectors exposing a .val member, mirroring NEON's
// int32x4x4_t: the 16-wide Eval/Store code below accesses these fragments
// as fragment.data.val[i]. A raw __m128i[4] array would have no .val
// member and could not be copied or returned by value.
struct SSE4Int32x4x4 {
  __m128i val[4];
};
typedef Fragment<SSE4Int32x4x4, 16, 1, MapOrder::ColMajor>
    SSE4FragmentInt32x16x1;
// 4 uint8 values packed into the low bytes of a 32-bit scalar.
typedef Fragment<uint32_t, 4, 1, MapOrder::ColMajor> SSE4FragmentUint8x4x1;
// 16 uint8 values occupy a full 128-bit vector: a 64-bit scalar cannot
// hold them, and StoreFinalOutput stores this fragment with
// _mm_storeu_si128.
typedef Fragment<__m128i, 16, 1, MapOrder::ColMajor> SSE4FragmentUint8x16x1;

// The unpack code will whenever possible process
// 16 entries at once (4 SIMD vectors of 4 entries each at once),
// to offer the compiler better optimization opportunities, reducing
// register dependencies. From the perspective of interfacing with the output
// pipeline, this takes the form of passing Fragment types wrapping four
// __m128i vectors. In most cases, such data is handled simply by handling
// separately its 4 __m128i components. This partial specialization handles
// that for arbitrary output stages implementing a __m128i path. Only some
// output stages below will override this to use custom code to handle the
// four vectors all at once (see OutputStageSaturatingCastToUint8 below).
template <typename OutputStageType>
struct OutputStageEvalImpl<OutputStageType, SSE4FragmentInt32x16x1> {
  typedef SSE4FragmentInt32x16x1 InputType;
  typedef SSE4FragmentInt32x16x1 OutputType;
  typedef OutputStageEvalImpl<OutputStageType, SSE4FragmentInt32x4x1>
      ImplInt32x4;

  OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

  // Delegates to the 4-lane implementation, processing the 16-wide
  // fragment one quarter at a time. Each quarter covers 4 consecutive
  // rows, hence the row offset of 4 * quarter.
  OutputType Eval(InputType input, int row, int col) const {
    OutputType result;
    for (int quarter = 0; quarter < 4; quarter++) {
      result.data.val[quarter] =
          impl_int32x4.Eval(input.data.val[quarter], row + 4 * quarter, col);
    }
    return result;
  }

  ImplInt32x4 impl_int32x4;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for
// SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  // Computes ((input + offset) * mult + rounding) >> shift on each lane.
  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    // Rounding term so the right shift rounds to nearest rather than
    // truncating (no term when there is no shift).
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    // All scalar parameters must be broadcast before use: Add/Mul operate
    // on __m128i vectors, and Dup's vector type is not deducible from a
    // scalar argument, so it is spelled out explicitly.
    const __m128i a = Add(Mul(Add(input, Dup<__m128i>(result_offset)),
                              Dup<__m128i>(result_mult_int)),
                          Dup<__m128i>(kRoundingTerm));
    return ShiftRight(a, result_shift);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  // Per-row quantize-down: ((input + offset) * mult + rounding) >> shift,
  // where offset and mult are per-row vectors of 4 consecutive entries
  // loaded at `row`.
  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    // These are 4-lane vectors, not scalars: the original declared them
    // std::int32_t, truncating the __m128i load. Load via .data(row)
    // (pointer to the underlying scalars), as the bias-addition stage does.
    const __m128i result_mult_int =
        _mm_lddqu_si128(reinterpret_cast<const __m128i*>(
            output_stage.result_mult_int.data(row)));
    const __m128i result_offset =
        _mm_lddqu_si128(reinterpret_cast<const __m128i*>(
            output_stage.result_offset.data(row)));
    // Rounding term so the right shift rounds to nearest.
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const __m128i a = Add(Mul(Add(input, result_offset), result_mult_int),
                          Dup<__m128i>(kRoundingTerm));
    return ShiftRight(a, result_shift);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  // Per-column quantize-down: ((input + offset) * mult + rounding) >> shift,
  // where offset and mult are vectors of 4 consecutive entries loaded
  // at `col`.
  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    // These are 4-lane vectors, not scalars: the original declared them
    // std::int32_t, truncating the __m128i load. Load via .data(col)
    // (pointer to the underlying scalars), as the bias-addition stage does.
    const __m128i result_mult_int =
        _mm_lddqu_si128(reinterpret_cast<const __m128i*>(
            output_stage.result_mult_int.data(col)));
    const __m128i result_offset =
        _mm_lddqu_si128(reinterpret_cast<const __m128i*>(
            output_stage.result_offset.data(col)));
    // Rounding term so the right shift rounds to nearest.
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const __m128i a = Add(Mul(Add(input, result_offset), result_mult_int),
                          Dup<__m128i>(kRoundingTerm));
    return ShiftRight(a, result_shift);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint for
// SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
                           SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint OutputStage;

  // Note: the original constructor initialized a `preshift_offset` member
  // that was never declared (a compile error) and never read — Eval
  // recomputes the rounding term itself — so it is dropped here.
  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  // Computes RoundingShiftRight(input * multiplier, shift) + offset,
  // where the multiply is the fixed-point saturating rounding doubling
  // high-half multiply.
  OutputType Eval(InputType input, int, int) const {
    // Broadcast the scalar multiplier: this SSE path has no scalar
    // overload of SaturatingRoundingDoublingHighMul, and Dup's vector
    // type is not deducible from a scalar argument.
    const __m128i mulhigh_val = SaturatingRoundingDoublingHighMul(
        input.data, Dup<__m128i>(output_stage.result_fixedpoint_multiplier));
    const std::int32_t result_shift = output_stage.result_shift;
    // Rounding term so the right shift rounds to nearest.
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    // Fixed typo: ShifRight -> ShiftRight.
    const __m128i shifted_val = ShiftRight(
        Add(mulhigh_val, Dup<__m128i>(kRoundingTerm)), result_shift);
    return Add(shifted_val,
               Dup<__m128i>(output_stage.result_offset_after_shift));
  }

  const OutputStage& output_stage;
};


// Implementation of OutputStageSaturatingCastToUint8 for SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentUint8x4x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  // Saturate-narrows 4 int32 lanes to uint8 (clamped to [0, 255]) and
  // returns them packed in the low 32 bits of the result.
  OutputType Eval(InputType input, int, int) const {
    // _mm_setzero_si128 replaces Dup(0): Dup's vector type cannot be
    // deduced from a scalar argument.
    const __m128i zero = _mm_setzero_si128();
    __m128i res_16 = _mm_packus_epi32(input, zero);
    __m128i res_8 = _mm_packus_epi16(res_16, zero);
    return _mm_cvtsi128_si32(res_8);
  }
};

// In the case of OutputStageSaturatingCastToUint8, the handling of
// SSE4FragmentInt32x16x1 data can be made much more efficient by handling
// it all at once, instead of as 4 separate int32x4 values as in the above
// generic partial specialization. This also avoids the poor (50%) register
// utilization of SSE4FragmentUint8x4x1: by handling 16 scalar values at
// once, we are able to fill a full 128-bit vector of uint8 values.
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           SSE4FragmentInt32x16x1> {
  typedef SSE4FragmentInt32x16x1 InputType;
  typedef SSE4FragmentUint8x16x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  // Saturate-narrows 16 int32 values to uint8 in two packing stages:
  // 32->16 bit on each pair of component vectors, then 16->8 bit on the
  // two intermediates. The _mm_packus_* intrinsics saturate with unsigned
  // bounds, so out-of-range values clamp to [0, 255].
  // NOTE(review): the __m128i result (16 bytes) is returned through
  // OutputType, whose typedef above wraps a 64-bit scalar — too small to
  // hold it. Confirm the fragment's underlying type is 128 bits wide.
  OutputType Eval(InputType input, int, int) const {
    __m128i q16[2];
    for (int i = 0; i < 2; i++) {
      q16[i] = _mm_packus_epi32(input.data.val[2 * i],
                                input.data.val[2 * i + 1]);
    }
    return _mm_packus_epi16(q16[0], q16[1]);
  }
};

// Implementation of OutputStageBiasAddition for SSE4FragmentInt32x4x1
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentInt32x4x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  // Adds the bias to each of the 4 lanes. A Row-shaped bias vector has
  // one entry per column, so the single entry at `col` is broadcast to
  // all lanes; a Col-shaped bias has one entry per row, so 4 consecutive
  // entries starting at `row` are loaded.
  OutputType Eval(InputType input, int row, int col) const {
    __m128i bias;
    if (VectorType::kShape == VectorShape::Row) {
      // Explicit vector type: Dup cannot deduce it from a scalar arg.
      bias = Dup<__m128i>(output_stage.bias_vector(col));
    } else {
      // _mm_lddqu_si128 requires a __m128i pointer; bias_vector.data(row)
      // yields a pointer to the underlying scalars, hence the cast.
      bias = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(
          output_stage.bias_vector.data(row)));
    }
    return Add(input, bias);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageClamp for SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageClamp, SSE4FragmentInt32x4x1> {
  typedef SSE4FragmentInt32x4x1 InputType;
  typedef SSE4FragmentInt32x4x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  // Clamps each lane into [output_stage.min, output_stage.max].
  OutputType Eval(InputType input, int, int) const {
    // Broadcast the scalar bounds; Dup's vector type is not deducible
    // from a scalar argument, so it is spelled out explicitly.
    const __m128i min = Dup<__m128i>(output_stage.min);
    const __m128i max = Dup<__m128i>(output_stage.max);
    return _mm_min_epi32(_mm_max_epi32(input, min), max);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageTanh for SSE4FragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageTanh, SSE4FragmentInt32x4x1>
    : OutputStageTanhEvalImpl<SSE4FragmentInt32x4x1> {
  // Thin adapter: all tanh evaluation logic lives in the shared
  // OutputStageTanhEvalImpl base; this only forwards the stage params.
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// Specialization of StoreFinalOutput for SSE4FragmentUint8x4x1.
// Specialization of StoreFinalOutput for SSE4FragmentUint8x4x1.
// Stores the 4 packed uint8 values (low 32 bits) at (row, col).
template <typename DstType>
inline void StoreFinalOutput(SSE4FragmentUint8x4x1 value, DstType* dst, int row,
                             int col) {
  // Widen the 32-bit scalar into the low lane of an XMM register, then
  // store just its low 4 bytes with a scalar-float store. The original
  // computed input_value but then passed the raw fragment (not a __m128)
  // to _mm_store_ss, leaving input_value unused.
  __m128i input_value = _mm_cvtsi32_si128(value);
  _mm_store_ss(reinterpret_cast<float*>(dst->data(row, col)),
               _mm_castsi128_ps(input_value));
}

// Specialization of StoreFinalOutput for SSE4FragmentUint8x16x1.
// Specialization of StoreFinalOutput for SSE4FragmentUint8x16x1.
// Unaligned 128-bit store of the 16 packed uint8 values at (row, col).
template <typename DstType>
inline void StoreFinalOutput(SSE4FragmentUint8x16x1 value, DstType* dst,
                             int row, int col) {
  // NOTE(review): `value` is passed where __m128i is expected, but the
  // typedef above wraps a 64-bit scalar — confirm the fragment's
  // underlying type is 128 bits wide.
  _mm_storeu_si128((__m128i*) (dst->data(row, col)), value);
}

// Specialization of StoreFinalOutput for SSE4FragmentInt32x4x1.
// Specialization of StoreFinalOutput for SSE4FragmentInt32x4x1.
// Unaligned 128-bit store of the 4 int32 lanes at (row, col).
template <typename DstType>
inline void StoreFinalOutput(SSE4FragmentInt32x4x1 value, DstType* dst, int row,
                             int col) {
  __m128i* const store_addr =
      reinterpret_cast<__m128i*>(dst->data(row, col));
  _mm_storeu_si128(store_addr, value);
}

// Specialization of StoreFinalOutput for SSE4FragmentInt32x16x1.
// Specialization of StoreFinalOutput for SSE4FragmentInt32x16x1.
// Stores the 16 int32 values as 4 unaligned vector stores, one per
// 4-row slice of the destination column.
template <typename DstType>
inline void StoreFinalOutput(SSE4FragmentInt32x16x1 value, DstType* dst,
                             int row, int col) {
  // The original loop stored the whole fragment to the SAME address on
  // every iteration; each component vector must go to its own 4-row
  // offset, mirroring the row + 4 * i addressing of the Eval delegator.
  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128(
        reinterpret_cast<__m128i*>(dst->data(row + 4 * i, col)),
        value.data.val[i]);
  }
}

} // namespace gemmlowp

#endif // GEMMLOWP_INTERNAL_OUTPUT_SSE4_H_

0 comments on commit 3b7c743

Please sign in to comment.