Skip to content

Commit

Permalink
Merge pull request #633 from Ka-zam/arctan
Browse files Browse the repository at this point in the history
New kernels for arctan
  • Loading branch information
jdemel committed Oct 3, 2023
2 parents 74b6c6a + 0343e3c commit a26a1b8
Show file tree
Hide file tree
Showing 7 changed files with 324 additions and 324 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -1,5 +1,6 @@
#
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
#
# This file is part of VOLK
#
Expand Down Expand Up @@ -249,6 +250,7 @@ install(FILES
${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand Down
50 changes: 50 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
@@ -0,0 +1,50 @@
/* -*- c++ -*- */
/*
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold helper functions built from AVX2 and FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x) for x in [-1, 1].
 *
 * Computes the odd polynomial
 *     x * (a1 + a3 * x^2 + a5 * x^4 + ... + a13 * x^12)
 * using Horner's method in x^2, with one fused multiply-add per
 * coefficient.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients, listed from the highest odd power down to x^1 */
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);

    const __m256 x_sq = _mm256_mul_ps(x, x);

    /* Horner recurrence: poly = poly * x^2 + next coefficient */
    __m256 poly = _mm256_fmadd_ps(x_sq, c13, c11);
    poly = _mm256_fmadd_ps(x_sq, poly, c9);
    poly = _mm256_fmadd_ps(x_sq, poly, c7);
    poly = _mm256_fmadd_ps(x_sq, poly, c5);
    poly = _mm256_fmadd_ps(x_sq, poly, c3);
    poly = _mm256_fmadd_ps(x_sq, poly, c1);

    /* The polynomial in x^2 is multiplied by x to obtain the odd powers */
    return _mm256_mul_ps(x, poly);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
38 changes: 38 additions & 0 deletions include/volk/volk_avx_intrinsics.h
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x) for x in [-1, 1].
 *
 * Computes the odd polynomial
 *     x * (a1 + a3 * x^2 + a5 * x^4 + ... + a13 * x^12)
 * using Horner's method in x^2. Each step is a separate multiply
 * followed by an add.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients, listed from the highest odd power down to x^1 */
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);

    const __m256 x_sq = _mm256_mul_ps(x, x);

    /* Horner recurrence: poly = poly * x^2 + next coefficient */
    __m256 poly = _mm256_add_ps(_mm256_mul_ps(x_sq, c13), c11);
    poly = _mm256_add_ps(_mm256_mul_ps(x_sq, poly), c9);
    poly = _mm256_add_ps(_mm256_mul_ps(x_sq, poly), c7);
    poly = _mm256_add_ps(_mm256_mul_ps(x_sq, poly), c5);
    poly = _mm256_add_ps(_mm256_mul_ps(x_sq, poly), c3);
    poly = _mm256_add_ps(_mm256_mul_ps(x_sq, poly), c1);

    /* The polynomial in x^2 is multiplied by x to obtain the odd powers */
    return _mm256_mul_ps(x, poly);
}

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
__m256 yl, yh, tmp1, tmp2;
Expand Down
47 changes: 46 additions & 1 deletion include/volk/volk_common.h
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
// Constant used to do log10 calculations as faster log2
////////////////////////////////////////////////////////////////////////
// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
#define volk_log2to10factor 3.01029995663981209120
#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120

////////////////////////////////////////////////////////////////////////
// arctan(x)
////////////////////////////////////////////////////////////////////////
static inline float volk_arctan_poly(const float x)
{
/*
* arctan(x) polynomial expansion on the interval [-1, 1]
* Maximum relative error < 6.6e-7
*/
const float a1 = +0x1.ffffeap-1f;
const float a3 = -0x1.55437p-2f;
const float a5 = +0x1.972be6p-3f;
const float a7 = -0x1.1436ap-3f;
const float a9 = +0x1.5785aap-4f;
const float a11 = -0x1.2f3004p-5f;
const float a13 = +0x1.01a37cp-7f;

const float x_times_x = x * x;
float arctan = a13;
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
arctan = fmaf(x_times_x, arctan, a3);
arctan = fmaf(x_times_x, arctan, a1);
arctan *= x;

return arctan;
}

static inline float volk_arctan(const float x)
{
    /*
     * Inputs with |x| < 1 go straight to the polynomial.
     * Everything else is evaluated on the reciprocal via the identity
     *     arctan(x) + arctan(1 / x) == sign(x) * pi / 2
     */
    const float half_pi = 0x1.921fb6p0f;

    if (fabs(x) < 1.f) {
        return volk_arctan_poly(x);
    }

    /* copysignf() gives the pi / 2 term the sign of x */
    const float reciprocal = 1.f / x;
    return copysignf(half_pi, x) - volk_arctan_poly(reciprocal);
}

#endif /*INCLUDED_LIBVOLK_COMMON_H*/
38 changes: 38 additions & 0 deletions include/volk/volk_sse_intrinsics.h
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#include <xmmintrin.h>

/*
 * Polynomial approximation of arctan(x) for x in [-1, 1].
 *
 * Computes the odd polynomial
 *     x * (a1 + a3 * x^2 + a5 * x^4 + ... + a13 * x^12)
 * using Horner's method in x^2. Each step is a separate multiply
 * followed by an add.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients, listed from the highest odd power down to x^1 */
    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);
    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);

    const __m128 x_sq = _mm_mul_ps(x, x);

    /* Horner recurrence: poly = poly * x^2 + next coefficient */
    __m128 poly = _mm_add_ps(_mm_mul_ps(x_sq, c13), c11);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c9);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c7);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c5);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c3);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c1);

    /* The polynomial in x^2 is multiplied by x to obtain the odd powers */
    return _mm_mul_ps(x, poly);
}

static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
__m128 iValue, qValue;
Expand Down

0 comments on commit a26a1b8

Please sign in to comment.