Permalink
Browse files

Merge pull request #666 from ThePortlandGroup/nv_stage

Pull 2019-02-06T16-42 Recent NVIDIA Changes
  • Loading branch information...
sscalpone committed Feb 7, 2019
2 parents 91e8f33 + e8d06df commit 2b336eb2883ccf96aaa8c7d926f0d277fdb1b3c3
Showing with 4,426 additions and 55 deletions.
  1. +7 −1 runtime/libpgmath/lib/common/CMakeLists.txt
  2. +71 −0 runtime/libpgmath/lib/common/cexp/CMakeLists.txt
  3. +166 −0 runtime/libpgmath/lib/common/cexp/cexp.c
  4. +431 −0 runtime/libpgmath/lib/common/cexp/cis_d_common.h
  5. +198 −0 runtime/libpgmath/lib/common/cexp/exp_d_common.h
  6. +216 −0 runtime/libpgmath/lib/common/cexp/ldexp_d_common.h
  7. +106 −0 runtime/libpgmath/lib/common/cexp/names.h
  8. +71 −0 runtime/libpgmath/lib/common/cexpf/CMakeLists.txt
  9. +163 −0 runtime/libpgmath/lib/common/cexpf/cexpf.c
  10. +347 −0 runtime/libpgmath/lib/common/cexpf/cis_common.h
  11. +81 −0 runtime/libpgmath/lib/common/cexpf/common.h
  12. +212 −0 runtime/libpgmath/lib/common/cexpf/exp_common.h
  13. +201 −0 runtime/libpgmath/lib/common/cexpf/ldexp_common.h
  14. +95 −0 runtime/libpgmath/lib/common/cexpf/names.h
  15. +132 −0 runtime/libpgmath/lib/common/debug_prn.h
  16. +24 −2 runtime/libpgmath/lib/common/helperavx2.h
  17. +22 −3 runtime/libpgmath/lib/common/helperavx2_128.h
  18. +19 −1 runtime/libpgmath/lib/common/helperavx512f.h
  19. +64 −0 runtime/libpgmath/lib/common/log10/CMakeLists.txt
  20. +153 −0 runtime/libpgmath/lib/common/log10/common.h
  21. +123 −0 runtime/libpgmath/lib/common/log10/fd_log10_scalar.cpp
  22. +47 −0 runtime/libpgmath/lib/common/log10/fd_log10_vector.cpp
  23. +125 −0 runtime/libpgmath/lib/common/log10/log10_d_vec.h
  24. +86 −0 runtime/libpgmath/lib/common/log10f/CMakeLists.txt
  25. +172 −0 runtime/libpgmath/lib/common/log10f/common.h
  26. +64 −0 runtime/libpgmath/lib/common/log10f/fs_log10_16.cpp
  27. +138 −0 runtime/libpgmath/lib/common/log10f/fs_log10_4.cpp
  28. +135 −0 runtime/libpgmath/lib/common/log10f/fs_log10_8.cpp
  29. +93 −0 runtime/libpgmath/lib/common/log10f/fs_log10_scalar.cpp
  30. +84 −0 runtime/libpgmath/lib/common/log10f/fs_log10_scalar_legacy.cpp
  31. +47 −0 runtime/libpgmath/lib/common/log10f/fs_log10_vector_legacy.cpp
  32. +98 −0 runtime/libpgmath/lib/common/log10f/log10_vec_legacy.h
  33. +291 −0 runtime/libpgmath/lib/common/math_common.h
  34. +5 −2 runtime/libpgmath/lib/common/misc.h
  35. +93 −0 runtime/libpgmath/lib/common/sleef_common.h
  36. +22 −22 runtime/libpgmath/lib/x86_64/math_tables/mth_expdefs.h
  37. +23 −23 runtime/libpgmath/lib/x86_64/math_tables/mth_log10defs.h
  38. +1 −1 tools/flang1/flang1exe/semfunc.c
@@ -1,5 +1,5 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -23,6 +23,12 @@ if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
add_subdirectory("exp")
add_subdirectory("log")
add_subdirectory("pow")
endif()
# Add the cexp/cexpf/log10/log10f subdirectories only when the target
# processor matches "x86_64" and LIBPGMATH_WITH_GENERIC is not enabled.
#
# The processor string is quoted and the option is referenced by name (not
# expanded with ${...}) so that the condition stays well-formed when either
# variable is empty or undefined; an unquoted empty expansion would leave
# `if()` with missing operands and abort configuration.
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86_64" AND NOT LIBPGMATH_WITH_GENERIC)
  add_subdirectory("cexp")
  add_subdirectory("cexpf")
  add_subdirectory("log10")
  add_subdirectory("log10f")
endif()
add_subdirectory("powi")
add_subdirectory("sincos")
@@ -0,0 +1,71 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Make the headers that live next to this CMakeLists.txt visible to the
# sources compiled from this directory.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# The only translation unit; it is compiled repeatedly below with different
# flag sets.
set(SRCS cexp.c)

# Base compile definitions and flags come from the global properties
# DEFINITIONS_X8664_L1 and FLAGS_X8664_L1.
get_property(DEFINITIONS GLOBAL PROPERTY "DEFINITIONS_X8664_L1")
get_property(FLAGS GLOBAL PROPERTY "FLAGS_X8664_L1")

# NDEBUG is always appended; _GNU_SOURCE follows it on Linux only.
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
  list(APPEND DEFINITIONS NDEBUG _GNU_SOURCE)
else()
  list(APPEND DEFINITIONS NDEBUG)
endif()

# Scalar: one object library per CPU variant, all built with -D_VL=1.
# Each entry reads "<_CPU value>=<-mtune/-march value>"; the entries are
# processed in the order listed (avx2, avx512, knl).
foreach(cexp_variant "avx2=core-avx2" "avx512=skylake-avx512" "knl=knl")
  string(REPLACE "=" ";" cexp_variant_parts "${cexp_variant}")
  list(GET cexp_variant_parts 0 cexp_cpu)
  list(GET cexp_variant_parts 1 cexp_arch)
  set(FLAGS_TMP "${FLAGS} -mtune=${cexp_arch} -march=${cexp_arch} -D_VL=1 -D_CPU=${cexp_cpu}")
  libmath_add_object_library("${SRCS}" "${FLAGS_TMP}" "${DEFINITIONS}" "cexp-${cexp_cpu}_1")
endforeach()


# Scalar with vector calling ABI: same CPU variants as the plain scalar
# build, with -D_SCALAR_WITH_VECTOR_ABI_ added and a "_1v" name suffix.
# Each entry reads "<_CPU value>=<-mtune/-march value>"; order is preserved.
foreach(cexp_variant "avx2=core-avx2" "avx512=skylake-avx512" "knl=knl")
  string(REPLACE "=" ";" cexp_variant_parts "${cexp_variant}")
  list(GET cexp_variant_parts 0 cexp_cpu)
  list(GET cexp_variant_parts 1 cexp_arch)
  set(FLAGS_TMP "${FLAGS} -mtune=${cexp_arch} -march=${cexp_arch} -D_VL=1 -D_CPU=${cexp_cpu} -D_SCALAR_WITH_VECTOR_ABI_")
  libmath_add_object_library("${SRCS}" "${FLAGS_TMP}" "${DEFINITIONS}" "cexp-${cexp_cpu}_1v")
endforeach()


# Vector, two elements: one object library per CPU variant with -D_VL=2.
# Each entry reads "<_CPU value>=<-mtune/-march value>"; order is preserved.
foreach(cexp_variant "avx2=core-avx2" "avx512=skylake-avx512" "knl=knl")
  string(REPLACE "=" ";" cexp_variant_parts "${cexp_variant}")
  list(GET cexp_variant_parts 0 cexp_cpu)
  list(GET cexp_variant_parts 1 cexp_arch)
  set(FLAGS_TMP "${FLAGS} -mtune=${cexp_arch} -march=${cexp_arch} -D_VL=2 -D_CPU=${cexp_cpu}")
  libmath_add_object_library("${SRCS}" "${FLAGS_TMP}" "${DEFINITIONS}" "cexp-${cexp_cpu}_2")
endforeach()


# Vector, four elements: built with -D_VL=4 for the avx512 and knl variants
# only (no avx2 entry at this vector length).
# Each entry reads "<_CPU value>=<-mtune/-march value>"; order is preserved.
foreach(cexp_variant "avx512=skylake-avx512" "knl=knl")
  string(REPLACE "=" ";" cexp_variant_parts "${cexp_variant}")
  list(GET cexp_variant_parts 0 cexp_cpu)
  list(GET cexp_variant_parts 1 cexp_arch)
  set(FLAGS_TMP "${FLAGS} -mtune=${cexp_arch} -march=${cexp_arch} -D_VL=4 -D_CPU=${cexp_cpu}")
  libmath_add_object_library("${SRCS}" "${FLAGS_TMP}" "${DEFINITIONS}" "cexp-${cexp_cpu}_4")
endforeach()

@@ -0,0 +1,166 @@

/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/


#include "names.h"
#include "math_common.h"
#include "sleef_common.h"
#include "ldexp_d_common.h"
#include "exp_d_common.h"
#include "cis_d_common.h"

F_VISIBILITY_VEC
vdouble cexp_vec(vdouble x)
{
// Vector kernel for the double-precision complex exponential.
// Takes one SIMD register x and returns one SIMD register of the same type.
// Per the algorithm notes below, real and imaginary parts share a register.
//
// Algorithm description for cexp(x)
// We follow the mathematical definition of cexp:
// cexp(re + I*im) = exp(re)*(cos(im) + I*sin(im))
// and also handle the C99 special cases separately.
//
// sin and cos will be computed in parallel and returned
// in the same SIMD register, interleaved and placed properly
// for real and imaginary. We will use single precision for sin/cos.
// NOTE(review): the kernel called below is __vcis_d_kernel; confirm that
// "single precision" above is still accurate for this file.
//
// We will compute exp() as a pair of (poly, scale), where the integer
// scale gives us an extended range: exp = 2^(scale) * poly.
// The exp result will be delivered in a pair of SIMD registers; real and
// imaginary positions will be duplicates.
//
// We multiply poly by sin/cos - this incurs one roundoff in
// every component - and then we do ldexp to carefully multiply by
// 2^(scale).

// rx / ix: presumably the real and imaginary parts of x, each duplicated
// into both positions of its complex slot — TODO confirm against the
// vmoveldup/vmovehdup helper definitions.
vdouble rx = vmoveldup_vd_vd(x); PRINT(rx);
vdouble ix = vmovehdup_vd_vd(x); PRINT(ix);
// sign of resulting poly is 0, except maybe if NaN
vdouble poly, scale; __vexp_d_kernel(rx, &poly, &scale); PRINT(poly); PRINT(scale);
// cis(Inf & NaN) --> NaN
vdouble rcis = __vcis_d_kernel(ix); PRINT(rcis);
// store sign of cis result (bits of rcis masked with DB_SIGN_BIT)
vint2 signcis = vand_vi2_vi2_vi2(vD2L(rcis), vSETll(DB_SIGN_BIT)); PRINT(signcis);
// sign of NaN may be lost here in favor of sign of NaN coming from poly
vdouble polycis = vmul_vd_vd_vd(rcis, poly); PRINT(polycis);
// NaN sign fixup, perhaps not worth the effort:
// first strip the sign bit from the product, then OR the saved cis sign
// back in (assumes vandnot computes ~first & second — TODO confirm).
polycis = vL2D(vandnot_vi2_vi2_vi2(vSETll(DB_SIGN_BIT), vD2L(polycis))); PRINT(polycis);
polycis = vL2D(vor_vi2_vi2_vi2(vD2L(polycis), signcis)); PRINT(polycis);
// if creal(x) == +Inf, then creal(result) is +Inf
// if cimag(x) == 0.0 then fixup product to
// the same zero, even if poly were NaN
// NOTE: cimag(x) may be denormal under DAZ flag
// subsequent computation in ldexp will flush
// it to zero if done under the same DAZ condition
// Both fixups are done with one compare of x against a constant built from
// (0x0, DB_PINF); where the compare holds, the lane of x replaces the product.
vopmask reset = veq_vo_vd_vd(x, vL2D(vSETLLL(0x0, DB_PINF))); PRINT(reset);
polycis = vsel_vd_vo_vd_vd(reset, x, polycis); PRINT(polycis);

// if creal(x) == -Inf, then result is +0 * sign_of_cis()
// NOTE: this fixup is only needed in case cimag(x)=Inf/NaN.
// Finite cases would deliver proper zero thanks to ldexp.
// signcis holds only sign bits, so reinterpreting it as a double gives a
// signed zero.
vopmask zeromask = veq_vo_vd_vd(rx, vL2D(vSETll(DB_NINF))); PRINT(zeromask);
polycis = vsel_vd_vo_vd_vd(zeromask, vL2D(signcis), polycis); PRINT(polycis);

// careful polycis * 2^(scale)
vdouble vcexp = __vldexp_kernel(polycis, scale); PRINT(vcexp);
return vcexp;
}

#if ((_VL) == (1))

// Scalar complex exponential using the default calling convention for
// double _Complex: takes one complex value and returns cexp of it.
//
// Two compile-time implementations:
//  - __USE_PORTABLE_CODE not defined: load the argument into a SIMD
//    register, call cexp_vec, and read the complex result back out.
//  - __USE_PORTABLE_CODE defined: a purely scalar path built on the
//    scalar exp / cis / ldexp kernels, with explicit special-case fixups.
F_VISIBILITY_SCALAR
double _Complex cexp_scalar_default_abi(double _Complex a)
{
#if defined DO_PRINT
// debug builds start from a clean floating-point exception state
feclearexcept(FE_ALL_EXCEPT);
#endif
#if !(defined __USE_PORTABLE_CODE)
// view the complex argument as two consecutive doubles and load them unaligned
vdouble va = _mm_loadu_pd((double const*)(&a)); PRINT(va);
vdouble vr = cexp_vec(va); PRINT(vr);
// NOTE(review): the result is reinterpreted through a pointer cast; confirm
// this is acceptable under the project's aliasing / compiler settings.
double _Complex res = *(double _Complex *)(&vr); PRINT(res);
return res;
#else

double ra = creal(a); PRINT(ra);
double ia = cimag(a); PRINT(ia);

double poly;
long long int scale;
// This exp clamps input and doesn't over/underflow
__exp_d_scalar_kernel(ra, &poly, &scale); PRINT( poly ); PRINT( scale );
// poly is expected strictly between 0x1.p511 and 0x1.p513, or to be NaN
assert(((poly > 0x1.p511) && (poly < 0x1.p513)) || isnan(poly));

// cis returns cos in the real part and sin in the imaginary part
double _Complex cmplx_cis = __cis_d_scalar(ia);
double rsin = cimag(cmplx_cis); PRINT( rsin );
double rcos = creal(cmplx_cis); PRINT( rcos );
// cis(Inf/NaN) results in NaN
// i.e. both outputs are NaN exactly when ia is Inf or NaN
assert((isinf(ia) || isnan(ia)) ^ !(isnan(rsin) && isnan(rcos)));

// sign and payload of NaN from cis may be lost here
// in favor of sign/payload of NaN coming from poly
double polycos = poly * rcos; PRINT( polycos );
double polysin = poly * rsin; PRINT( polysin );
// restore sign of NaN coming from cis only to pass
// symmetry test, perhaps not worth the effort
polycos = copysign(polycos, rcos);
polysin = copysign(polysin, rsin);

if ( ia == 0.0 )
{
// if cimag(x) == 0.0 then fixup product to
// the same zero, even if poly were NaN
polysin = ia;
// NOTE: ia may be denormal under DAZ flag
// subsequent computation in ldexp will flush
// it to zero if done under the same DAZ condition
}
if ( ra == L2D(DB_PINF) )
{
// if creal(x) == +Inf, then creal(result) is +Inf
polycos = ra;
}

if ( ra == L2D(DB_NINF) )
{
// if creal(x) == -Inf, then result is +0 * sign_of_cis()
// NOTE: this fixup is only needed in case cimag(x)=Inf/NaN.
// Finite cases would deliver proper zero thanks to ldexp.
polycos = copysign(0.0, rcos);
polysin = copysign(0.0, rsin);
}

// This scaling shall not produce new NaNs
// both components are scaled by the same 2^(scale) from the exp kernel
double cexp_real = __ldexp_d_scalar_kernel(polycos, scale); PRINT( cexp_real );
double cexp_imag = __ldexp_d_scalar_kernel(polysin, scale); PRINT( cexp_imag );

return set_cmplxd(cexp_real, cexp_imag);
#endif //if !(defined __USE_PORTABLE_CODE)
}
#if (defined _SCALAR_WITH_VECTOR_ABI_)
// By default the real and imaginary parts of a scalar double-precision
// complex value travel in different registers. This entry point offers the
// same computation with a single-SIMD-register calling convention: the
// complex argument arrives packed in vx and the complex result is returned
// packed in a vdouble.
F_VISIBILITY_SCALAR_VECTOR
vdouble cexp_scalar_vector_abi(vdouble vx)
{
    vdouble vres;
    // Reinterpret the register contents as one complex value, run the
    // default-ABI scalar routine, and store its result over vres.
    *(double _Complex *)&vres = cexp_scalar_default_abi(*(double _Complex *)&vx);
    return vres;
}
#endif //if (defined _SCALAR_WITH_VECTOR_ABI_)

#endif //if ((_VL) == (1))
Oops, something went wrong.

0 comments on commit 2b336eb

Please sign in to comment.