Permalink
Browse files

New optimized LOG10 function.

This check-in adds support for the following:

          *) Single/Double Precision log10 intrinsic function
          *) x86_64 Processor Targets:   AVX2, AVX512, KNL
          *) Enables support for Linux, OSX, Windows, and Flang
  • Loading branch information...
gklimowicz committed Feb 7, 2019
1 parent 73fc29b commit e8d06dfa6ced9e569c79569b9222dfc89ec7ea86
@@ -27,6 +27,8 @@ if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
# The cexp/cexpf/log10/log10f subdirectories are only added when the target
# processor matches x86_64 and LIBPGMATH_WITH_GENERIC is false.
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ${LIBPGMATH_WITH_GENERIC})
add_subdirectory("cexp")
add_subdirectory("cexpf")
add_subdirectory("log10")
add_subdirectory("log10f")
endif()
# These subdirectories are added outside the x86_64 check above.
add_subdirectory("powi")
add_subdirectory("sincos")
@@ -321,6 +321,11 @@ static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi
/* Lane-wise 32-bit equality (_mm256_cmpeq_epi32): every 32-bit lane of the
 * result is all ones where x == y and zero otherwise. */
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 equal_lanes = _mm256_cmpeq_epi32(x, y);
  return equal_lanes;
}

/* Lane-wise signed 32-bit greater-than (_mm256_cmpgt_epi32): every 32-bit
 * lane of the result is all ones where x > y and zero otherwise. */
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 greater_lanes = _mm256_cmpgt_epi32(x, y);
  return greater_lanes;
}

/*
 * 64-bit lane compares on 256-bit integer vectors.
 *
 * The *_vo_* variants return the comparison result typed as an operation
 * mask, the *_vi2_* variants return it typed as an integer vector; both wrap
 * the same intrinsic, which sets a 64-bit lane to all ones when the predicate
 * holds and to zero when it does not.
 */

/* x == y per 64-bit lane, as a mask. */
static INLINE vopmask veq64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask equal_mask = _mm256_cmpeq_epi64(x, y);
  return equal_mask;
}

/* x > y (signed) per 64-bit lane, as a mask. */
static INLINE vopmask vgt64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_mask = _mm256_cmpgt_epi64(x, y);
  return greater_mask;
}

/* x == y per 64-bit lane, as an integer vector. */
static INLINE vint2 veq64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 equal_lanes = _mm256_cmpeq_epi64(x, y);
  return equal_lanes;
}

/* x > y (signed) per 64-bit lane, as an integer vector. */
static INLINE vint2 vgt64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 greater_lanes = _mm256_cmpgt_epi64(x, y);
  return greater_lanes;
}

/*
 * Byte-wise select between two integer vectors.
 *
 * For every byte, the result takes the byte of x when the most significant
 * bit of the corresponding mask byte in m is set, and the byte of y
 * otherwise (_mm256_blendv_epi8 picks its second operand where the mask is
 * set, hence the swapped argument order).
 */
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  const vint2 chosen = _mm256_blendv_epi8(y, x, m);
  return chosen;
}
@@ -440,6 +445,11 @@ static INLINE vdouble vcvtsi64_vd_vi2(vint2 x) {
return out.vd;
}

/* Signed x > y per 64-bit lane, result typed as an operation mask. */
static INLINE vopmask vgt64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_mask = _mm256_cmpgt_epi64(x, y);
  return greater_mask;
}

/* Signed x > y per 64-bit lane, result typed as an integer vector. */
static INLINE vint2 vgt64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 greater_lanes = _mm256_cmpgt_epi64(x, y);
  return greater_lanes;
}

/*
 * Gather the upper 32 bits of each of the four 64-bit lanes of x.
 *
 * The permutation moves 32-bit elements 1, 3, 5 and 7 (the high halves of
 * the 64-bit lanes) into elements 0..3; the low 128 bits of that permuted
 * vector are then returned.
 */
static INLINE vint vhi64_vi_vi2(vint2 x) {
  const vint2 high_half_idx = _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1);
  const vint2 packed = _mm256_permutevar8x32_epi32(x, high_half_idx);
  return (vint)_mm256_castsi256_si128(packed);
}

#if defined(__AVX512VL__)
/*
 * Exponent / mantissa extraction, only compiled when AVX512VL is available.
 *
 * vgetexp_* wrap the getexp intrinsics, which return the unbiased exponent
 * of each element as a floating-point value.  vgetmant_* wrap the getmant
 * intrinsics with the interval selector _MM_MANT_NORM_p75_1p5 (mantissa
 * normalized to [0.75, 1.5)) and the sign control _MM_MANT_SIGN_nan (NaN
 * result for elements whose sign bit is set).
 */

/* Per-element exponent of four doubles. */
static INLINE vdouble vgetexp_vd_vd(vdouble d) {
  const vdouble exponent = _mm256_getexp_pd(d);
  return exponent;
}

/* Per-element exponent of eight floats. */
static INLINE vfloat vgetexp_vf_vf(vfloat d) {
  const vfloat exponent = _mm256_getexp_ps(d);
  return exponent;
}

/* Per-element normalized mantissa of four doubles. */
static INLINE vdouble vgetmant_vd_vd(vdouble d) {
  const vdouble mantissa =
      _mm256_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan);
  return mantissa;
}

/* Per-element normalized mantissa of eight floats. */
static INLINE vfloat vgetmant_vf_vf(vfloat d) {
  const vfloat mantissa =
      _mm256_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan);
  return mantissa;
}
#endif
@@ -300,6 +300,12 @@ static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(
/* Lane-wise 32-bit equality on a 128-bit vector (_mm_cmpeq_epi32): lanes are
 * all ones where x == y and zero otherwise. */
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 equal_lanes = _mm_cmpeq_epi32(x, y);
  return equal_lanes;
}

/* Lane-wise signed 32-bit greater-than on a 128-bit vector
 * (_mm_cmpgt_epi32): lanes are all ones where x > y and zero otherwise. */
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 greater_lanes = _mm_cmpgt_epi32(x, y);
  return greater_lanes;
}

/*
 * 64-bit lane compares on 128-bit integer vectors.
 *
 * The *_vo_* variants return the comparison result typed as an operation
 * mask, the *_vi2_* variants return it typed as an integer vector; the
 * wrapped intrinsic sets a 64-bit lane to all ones when the predicate holds
 * and to zero otherwise.
 */

/* x == y per 64-bit lane, as a mask. */
static INLINE vopmask veq64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask equal_mask = _mm_cmpeq_epi64(x, y);
  return equal_mask;
}

/* x > y (signed) per 64-bit lane, as a mask. */
static INLINE vopmask vgt64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_mask = _mm_cmpgt_epi64(x, y);
  return greater_mask;
}

/* x == y per 64-bit lane, as an integer vector. */
static INLINE vint2 veq64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 equal_lanes = _mm_cmpeq_epi64(x, y);
  return equal_lanes;
}

/* x > y (signed) per 64-bit lane, as an integer vector. */
static INLINE vint2 vgt64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 greater_lanes = _mm_cmpgt_epi64(x, y);
  return greater_lanes;
}


/*
 * Byte-wise select between two 128-bit integer vectors.
 *
 * Each result byte comes from x when the most significant bit of the
 * matching byte of m is set, and from y otherwise (_mm_blendv_epi8 picks its
 * second operand where the mask is set, hence the swapped argument order).
 */
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  const vint2 chosen = _mm_blendv_epi8(y, x, m);
  return chosen;
}
@@ -410,6 +416,12 @@ static INLINE vdouble vcvtsi64_vd_vi2(vint2 x) {
return out.vd;
}

/* Signed x > y per 64-bit lane, result typed as an operation mask. */
static INLINE vopmask vgt64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_mask = _mm_cmpgt_epi64(x, y);
  return greater_mask;
}

/* Signed x > y per 64-bit lane, result typed as an integer vector. */
static INLINE vint2 vgt64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 greater_lanes = _mm_cmpgt_epi64(x, y);
  return greater_lanes;
}

/*
 * Gather the upper 32 bits of both 64-bit lanes of x.
 *
 * The vector is reinterpreted as four floats so _mm_permute_ps can shuffle
 * it: selector 0xd (binary 00 00 11 01) places source elements 1 and 3 (the
 * high halves) into result elements 0 and 1, and source element 0 into
 * result elements 2 and 3.
 */
static INLINE vint vhi64_vi_vi2(vint2 x) {
  const vfloat shuffled = _mm_permute_ps((vfloat)x, 0xd);
  return (vint)shuffled;
}


#if defined(__AVX512VL__)
/*
 * Exponent / mantissa extraction for 128-bit vectors, only compiled when
 * AVX512VL is available.
 *
 * vgetexp_* wrap the getexp intrinsics, which return the unbiased exponent
 * of each element as a floating-point value.  vgetmant_* wrap the getmant
 * intrinsics with the interval selector _MM_MANT_NORM_p75_1p5 (mantissa
 * normalized to [0.75, 1.5)) and the sign control _MM_MANT_SIGN_nan (NaN
 * result for elements whose sign bit is set).
 */

/* Per-element exponent of two doubles. */
static INLINE vdouble vgetexp_vd_vd(vdouble d) {
  const vdouble exponent = _mm_getexp_pd(d);
  return exponent;
}

/* Per-element exponent of four floats. */
static INLINE vfloat vgetexp_vf_vf(vfloat d) {
  const vfloat exponent = _mm_getexp_ps(d);
  return exponent;
}

/* Per-element normalized mantissa of two doubles. */
static INLINE vdouble vgetmant_vd_vd(vdouble d) {
  const vdouble mantissa =
      _mm_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan);
  return mantissa;
}

/* Per-element normalized mantissa of four floats. */
static INLINE vfloat vgetmant_vf_vf(vfloat d) {
  const vfloat mantissa =
      _mm_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan);
  return mantissa;
}
#endif
@@ -183,7 +183,9 @@ static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_
//

/*
 * Integer add / subtract / negate on `vint`, which in this helper is handled
 * with 256-bit intrinsics.  The plain variants operate on 32-bit lanes, the
 * "64" variants on 64-bit lanes.
 */

/* x + y per 32-bit lane. */
static INLINE vint vadd_vi_vi_vi(vint x, vint y) {
  const vint sum = _mm256_add_epi32(x, y);
  return sum;
}

/* x + y per 64-bit lane. */
static INLINE vint vadd64_vi_vi_vi(vint x, vint y) {
  const vint sum = _mm256_add_epi64(x, y);
  return sum;
}

/* x - y per 32-bit lane. */
static INLINE vint vsub_vi_vi_vi(vint x, vint y) {
  const vint difference = _mm256_sub_epi32(x, y);
  return difference;
}

/* x - y per 64-bit lane. */
static INLINE vint vsub64_vi_vi_vi(vint x, vint y) {
  const vint difference = _mm256_sub_epi64(x, y);
  return difference;
}

/* -e per 32-bit lane, computed as (0 - e). */
static INLINE vint vneg_vi_vi(vint e) {
  const vint zero = vcast_vi_i(0);
  return vsub_vi_vi_vi(zero, e);
}

/* Bitwise AND of two 256-bit integer vectors. */
static INLINE vint vand_vi_vi_vi(vint x, vint y) {
  const vint conjunction = _mm256_and_si256(x, y);
  return conjunction;
}
@@ -362,6 +364,9 @@ static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi
/* Signed x > y per 32-bit lane of a 512-bit vector; the result is a bit mask
 * with one bit per lane (_mm512_cmpgt_epi32_mask). */
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_bits = _mm512_cmpgt_epi32_mask(x, y);
  return greater_bits;
}

/*
 * 64-bit lane compares on 512-bit vectors.  Each wrapper returns a bit mask
 * with one bit per 64-bit lane, set where the predicate holds.
 */

/* Signed x > y per 64-bit lane, operands typed as vmask. */
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  const vopmask greater_bits = _mm512_cmpgt_epi64_mask(x, y);
  return greater_bits;
}

/* x == y per 64-bit lane, operands typed as vint2. */
static INLINE vopmask veq64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask equal_bits = _mm512_cmpeq_epi64_mask(x, y);
  return equal_bits;
}

/* Signed x > y per 64-bit lane, operands typed as vint2. */
static INLINE vopmask vgt64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_bits = _mm512_cmpgt_epi64_mask(x, y);
  return greater_bits;
}


static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
__mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
@@ -499,5 +504,5 @@ static INLINE vint2 vmulu_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_mul_epu3
/* x + y per 64-bit lane of a 512-bit integer vector. */
static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 sum = _mm512_add_epi64(x, y);
  return sum;
}

/* x - y per 64-bit lane of a 512-bit integer vector. */
static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 x, vint2 y) {
  const vint2 difference = _mm512_sub_epi64(x, y);
  return difference;
}

/* Signed x > y per 64-bit lane; the result is a bit mask with one bit per
 * lane. */
static INLINE vopmask vgt64_vo_vi2_vi2(vint2 x, vint2 y) {
  const vopmask greater_bits = _mm512_cmpgt_epi64_mask(x, y);
  return greater_bits;
}

/*
 * Gather the upper 32 bits of each of the eight 64-bit lanes of x.
 *
 * The index vector moves 32-bit elements 1, 3, ..., 15 (the high halves of
 * the 64-bit lanes) into elements 0..7; the low 256 bits of the permuted
 * vector are then returned.
 */
static INLINE vint vhi64_vi_vi2(vint2 x) {
  const vint2 high_half_idx =
      _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1);
  const vint2 packed = _mm512_permutexvar_epi32(high_half_idx, x);
  return (vint)_mm512_castsi512_si256(packed);
}

@@ -0,0 +1,64 @@
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Make headers in this directory visible to the sources compiled below.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Base compile flags and preprocessor definitions come from global properties
# that are set elsewhere in the build.
get_property(FLAGS GLOBAL PROPERTY "FLAGS_X8664_L1")
get_property(DEFINITIONS GLOBAL PROPERTY "DEFINITIONS_X8664_L1")


# Source compiled for the scalar entry points.
set(SRCS_SCALAR
fd_log10_scalar.cpp
)

# Source compiled once per vector length (selected through -D_VL below).
set(SRCS_VECTOR
fd_log10_vector.cpp
)

# NDEBUG is always defined; _GNU_SOURCE is added on Linux only.
list(APPEND DEFINITIONS NDEBUG)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
list(APPEND DEFINITIONS _GNU_SOURCE)
endif()

# Each pair below calls libmath_add_object_library (defined elsewhere in the
# project) with the same source list but per-target -march/-mtune flags and a
# -D_CPU tag; the last argument names the variant as log10-<cpu>_<width>.

# Scalar: AVX2 and AVX-512 variants, no -D_VL.
set(FLAGS_TMP "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_CPU=avx2")
libmath_add_object_library("${SRCS_SCALAR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx2_1")

set(FLAGS_TMP "${FLAGS} -mtune=skylake-avx512 -march=skylake-avx512 -D_CPU=avx512")
libmath_add_object_library("${SRCS_SCALAR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx512_1")


# Vector, two elements (-D_VL=2): AVX2 and AVX-512 variants.
set(FLAGS_TMP "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_VL=2 -D_CPU=avx2")
libmath_add_object_library("${SRCS_VECTOR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx2_2")

set(FLAGS_TMP "${FLAGS} -mtune=skylake-avx512 -march=skylake-avx512 -D_VL=2 -D_CPU=avx512")
libmath_add_object_library("${SRCS_VECTOR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx512_2")


# Vector, four elements (-D_VL=4): AVX2 and AVX-512 variants.
set(FLAGS_TMP "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_VL=4 -D_CPU=avx2")
libmath_add_object_library("${SRCS_VECTOR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx2_4")

set(FLAGS_TMP "${FLAGS} -mtune=skylake-avx512 -march=skylake-avx512 -D_VL=4 -D_CPU=avx512")
libmath_add_object_library("${SRCS_VECTOR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx512_4")


# Vector, eight elements (-D_VL=8): built for AVX-512 only.
set(FLAGS_TMP "${FLAGS} -mtune=skylake-avx512 -march=skylake-avx512 -D_VL=8 -D_CPU=avx512")
libmath_add_object_library("${SRCS_VECTOR}" "${FLAGS_TMP}" "${DEFINITIONS}" "log10-avx512_8")


@@ -0,0 +1,153 @@

/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/


#ifndef COMMON_H
#define COMMON_H

/*
 * Reinterpret the bits of a 64-bit integer as a double.
 *
 * a: bit pattern to reinterpret.
 * Returns the double whose object representation equals that of `a`.
 *
 * The bytes are copied instead of dereferencing a casted pointer: reading a
 * `long long` object through a `double *` violates the strict-aliasing rule
 * and is undefined behavior.  __builtin_memcpy needs no header and compilers
 * lower a fixed 8-byte copy to a plain register move.
 */
static inline double ll_as_double(long long a) {
  double bits_as_double;
  __builtin_memcpy(&bits_as_double, &a, sizeof bits_as_double);
  return bits_as_double;
}
/*
 * Reinterpret the bits of a double as a 64-bit integer.
 *
 * a: value whose object representation is wanted.
 * Returns the `long long` whose object representation equals that of `a`.
 *
 * The bytes are copied instead of dereferencing a casted pointer: reading a
 * `double` object through a `long long *` violates the strict-aliasing rule
 * and is undefined behavior.  __builtin_memcpy needs no header and compilers
 * lower a fixed 8-byte copy to a plain register move.
 */
static inline long long double_as_ll(double a) {
  long long bits_as_ll;
  __builtin_memcpy(&bits_as_ll, &a, sizeof bits_as_ll);
  return bits_as_ll;
}

/*
 * Scalar and broadcast constants.
 *
 * Every object is `static`: this is a header, and in C a non-static
 * file-scope definition has external linkage, so including the header from
 * two translation units would produce duplicate symbols.  (In C++ a
 * namespace-scope `const` object already has internal linkage, so nothing
 * changes there.)
 *
 * The tables hold the same value eight times and are 64-byte aligned.
 * NOTE(review): presumably so one aligned load fills a full 512-bit vector
 * of 64-bit lanes - confirm against the code that loads them.
 */

/* 2^53 as a double. */
static const double TWO_TO_53 = 9007199254740992.0;

/* log10(2) rounded to double. */
static const double LOG10_2[] __attribute__ ((aligned (64))) = {
    3.0102999566398120e-01, 3.0102999566398120e-01, 3.0102999566398120e-01, 3.0102999566398120e-01,
    3.0102999566398120e-01, 3.0102999566398120e-01, 3.0102999566398120e-01, 3.0102999566398120e-01
};

/* 1.0 in every lane. */
static const double ONE_D[] __attribute__ ((aligned (64))) = {
    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
};

/* Integer 1 in every 64-bit lane. */
static const unsigned long long ONE_I[] __attribute__ ((aligned (64))) = {
    1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL
};

/* 0.75 in every lane.
 * NOTE(review): looks like the lower bound of the normalized mantissa
 * interval - confirm against the caller. */
static const double THRESHOLD[] __attribute__ ((aligned (64))) = {
    0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75
};

/*
 * IEEE-754 binary64 bit patterns, each repeated eight times and 64-byte
 * aligned.
 *
 * Every object is `static`: this is a header, and in C a non-static
 * file-scope definition has external linkage, so including the header from
 * two translation units would produce duplicate symbols.  (In C++ a
 * namespace-scope `const` object already has internal linkage, so nothing
 * changes there.)
 */

/* Low 52 bits set: the fraction (mantissa) field of a double. */
static const unsigned long long MANTISSA_MASK[] __attribute__ ((aligned (64))) = {
    0xfffffffffffffULL, 0xfffffffffffffULL, 0xfffffffffffffULL, 0xfffffffffffffULL,
    0xfffffffffffffULL, 0xfffffffffffffULL, 0xfffffffffffffULL, 0xfffffffffffffULL
};

/* Bit 52 only: the least significant bit of the exponent field.
 * NOTE(review): the name suggests it is subtracted from a bit pattern to
 * halve the value - confirm against the caller. */
static const unsigned long long HALFIFIER[] __attribute__ ((aligned (64))) = {
    0x10000000000000ULL, 0x10000000000000ULL, 0x10000000000000ULL, 0x10000000000000ULL,
    0x10000000000000ULL, 0x10000000000000ULL, 0x10000000000000ULL, 0x10000000000000ULL
};

/* Bit pattern of +infinity. */
static const unsigned long long PINF[] __attribute__ ((aligned (64))) = {
    0x7ff0000000000000ULL, 0x7ff0000000000000ULL, 0x7ff0000000000000ULL, 0x7ff0000000000000ULL,
    0x7ff0000000000000ULL, 0x7ff0000000000000ULL, 0x7ff0000000000000ULL, 0x7ff0000000000000ULL
};

/* Bit pattern of -infinity. */
static const unsigned long long NINF[] __attribute__ ((aligned (64))) = {
    0xfff0000000000000ULL, 0xfff0000000000000ULL, 0xfff0000000000000ULL, 0xfff0000000000000ULL,
    0xfff0000000000000ULL, 0xfff0000000000000ULL, 0xfff0000000000000ULL, 0xfff0000000000000ULL
};

/* Quiet NaN with the sign bit set and an otherwise empty payload. */
static const unsigned long long CANONICAL_NAN[] __attribute__ ((aligned (64))) = {
    0xfff8000000000000ULL, 0xfff8000000000000ULL, 0xfff8000000000000ULL, 0xfff8000000000000ULL,
    0xfff8000000000000ULL, 0xfff8000000000000ULL, 0xfff8000000000000ULL, 0xfff8000000000000ULL
};

/*
 * Coefficient tables c0 .. c19, each value repeated eight times and 64-byte
 * aligned.
 *
 * c19 equals log10(e) to double precision, and c18, c17, c16 are close to
 * -log10(e)/2, +log10(e)/3, -log10(e)/4.
 * NOTE(review): these look like fitted polynomial coefficients for
 * log10(1 + x), with c19 the linear term and c0 the highest-order term -
 * confirm the evaluation order against the code that uses them.
 *
 * Every object is `static`: this is a header, and in C a non-static
 * file-scope definition has external linkage, so including the header from
 * two translation units would produce duplicate symbols.  (In C++ a
 * namespace-scope `const` object already has internal linkage, so nothing
 * changes there.)
 */
static const double c0[] __attribute__ ((aligned (64))) = {
    -3.9902972429474055e-03, -3.9902972429474055e-03, -3.9902972429474055e-03, -3.9902972429474055e-03,
    -3.9902972429474055e-03, -3.9902972429474055e-03, -3.9902972429474055e-03, -3.9902972429474055e-03
};

static const double c1[] __attribute__ ((aligned (64))) = {
    1.4081442120452604e-02, 1.4081442120452604e-02, 1.4081442120452604e-02, 1.4081442120452604e-02,
    1.4081442120452604e-02, 1.4081442120452604e-02, 1.4081442120452604e-02, 1.4081442120452604e-02
};

static const double c2[] __attribute__ ((aligned (64))) = {
    -2.3672789854611846e-02, -2.3672789854611846e-02, -2.3672789854611846e-02, -2.3672789854611846e-02,
    -2.3672789854611846e-02, -2.3672789854611846e-02, -2.3672789854611846e-02, -2.3672789854611846e-02
};

static const double c3[] __attribute__ ((aligned (64))) = {
    2.7430685728900141e-02, 2.7430685728900141e-02, 2.7430685728900141e-02, 2.7430685728900141e-02,
    2.7430685728900141e-02, 2.7430685728900141e-02, 2.7430685728900141e-02, 2.7430685728900141e-02
};

static const double c4[] __attribute__ ((aligned (64))) = {
    -2.7935977036068853e-02, -2.7935977036068853e-02, -2.7935977036068853e-02, -2.7935977036068853e-02,
    -2.7935977036068853e-02, -2.7935977036068853e-02, -2.7935977036068853e-02, -2.7935977036068853e-02
};

static const double c5[] __attribute__ ((aligned (64))) = {
    2.8913989262316014e-02, 2.8913989262316014e-02, 2.8913989262316014e-02, 2.8913989262316014e-02,
    2.8913989262316014e-02, 2.8913989262316014e-02, 2.8913989262316014e-02, 2.8913989262316014e-02
};

static const double c6[] __attribute__ ((aligned (64))) = {
    -3.0925264020832541e-02, -3.0925264020832541e-02, -3.0925264020832541e-02, -3.0925264020832541e-02,
    -3.0925264020832541e-02, -3.0925264020832541e-02, -3.0925264020832541e-02, -3.0925264020832541e-02
};

static const double c7[] __attribute__ ((aligned (64))) = {
    3.3394250680337925e-02, 3.3394250680337925e-02, 3.3394250680337925e-02, 3.3394250680337925e-02,
    3.3394250680337925e-02, 3.3394250680337925e-02, 3.3394250680337925e-02, 3.3394250680337925e-02
};

static const double c8[] __attribute__ ((aligned (64))) = {
    -3.6196017360600602e-02, -3.6196017360600602e-02, -3.6196017360600602e-02, -3.6196017360600602e-02,
    -3.6196017360600602e-02, -3.6196017360600602e-02, -3.6196017360600602e-02, -3.6196017360600602e-02
};

static const double c9[] __attribute__ ((aligned (64))) = {
    3.9482521625171671e-02, 3.9482521625171671e-02, 3.9482521625171671e-02, 3.9482521625171671e-02,
    3.9482521625171671e-02, 3.9482521625171671e-02, 3.9482521625171671e-02, 3.9482521625171671e-02
};

static const double c10[] __attribute__ ((aligned (64))) = {
    -4.3429342257676216e-02, -4.3429342257676216e-02, -4.3429342257676216e-02, -4.3429342257676216e-02,
    -4.3429342257676216e-02, -4.3429342257676216e-02, -4.3429342257676216e-02, -4.3429342257676216e-02
};

static const double c11[] __attribute__ ((aligned (64))) = {
    4.8254896147627749e-02, 4.8254896147627749e-02, 4.8254896147627749e-02, 4.8254896147627749e-02,
    4.8254896147627749e-02, 4.8254896147627749e-02, 4.8254896147627749e-02, 4.8254896147627749e-02
};

static const double c12[] __attribute__ ((aligned (64))) = {
    -5.4286810715274331e-02, -5.4286810715274331e-02, -5.4286810715274331e-02, -5.4286810715274331e-02,
    -5.4286810715274331e-02, -5.4286810715274331e-02, -5.4286810715274331e-02, -5.4286810715274331e-02
};

static const double c13[] __attribute__ ((aligned (64))) = {
    6.2042069733283481e-02, 6.2042069733283481e-02, 6.2042069733283481e-02, 6.2042069733283481e-02,
    6.2042069733283481e-02, 6.2042069733283481e-02, 6.2042069733283481e-02, 6.2042069733283481e-02
};

static const double c14[] __attribute__ ((aligned (64))) = {
    -7.2382413666496184e-02, -7.2382413666496184e-02, -7.2382413666496184e-02, -7.2382413666496184e-02,
    -7.2382413666496184e-02, -7.2382413666496184e-02, -7.2382413666496184e-02, -7.2382413666496184e-02
};

static const double c15[] __attribute__ ((aligned (64))) = {
    8.6858896372479524e-02, 8.6858896372479524e-02, 8.6858896372479524e-02, 8.6858896372479524e-02,
    8.6858896372479524e-02, 8.6858896372479524e-02, 8.6858896372479524e-02, 8.6858896372479524e-02
};

static const double c16[] __attribute__ ((aligned (64))) = {
    -1.0857362047562141e-01, -1.0857362047562141e-01, -1.0857362047562141e-01, -1.0857362047562141e-01,
    -1.0857362047562141e-01, -1.0857362047562141e-01, -1.0857362047562141e-01, -1.0857362047562141e-01
};

static const double c17[] __attribute__ ((aligned (64))) = {
    1.4476482730111173e-01, 1.4476482730111173e-01, 1.4476482730111173e-01, 1.4476482730111173e-01,
    1.4476482730111173e-01, 1.4476482730111173e-01, 1.4476482730111173e-01, 1.4476482730111173e-01
};

static const double c18[] __attribute__ ((aligned (64))) = {
    -2.1714724095162635e-01, -2.1714724095162635e-01, -2.1714724095162635e-01, -2.1714724095162635e-01,
    -2.1714724095162635e-01, -2.1714724095162635e-01, -2.1714724095162635e-01, -2.1714724095162635e-01
};

static const double c19[] __attribute__ ((aligned (64))) = {
    4.3429448190325182e-01, 4.3429448190325182e-01, 4.3429448190325182e-01, 4.3429448190325182e-01,
    4.3429448190325182e-01, 4.3429448190325182e-01, 4.3429448190325182e-01, 4.3429448190325182e-01
};

#endif

Oops, something went wrong.

0 comments on commit e8d06df

Please sign in to comment.