Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added STATIC_BMI2 for compile-time detection of BMI2 on MSVC; when enabled, various intrinsics are used #2258

Merged
merged 3 commits into from
Aug 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions lib/common/bitstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#if defined (__cplusplus)
extern "C" {
#endif

/*
* This API consists of small unitary functions, which must be inlined for best performance.
* Since link-time-optimization is not available for all compilers,
Expand Down Expand Up @@ -141,8 +140,12 @@ MEM_STATIC unsigned BIT_highbit32 (U32 val)
assert(val != 0);
{
# if defined(_MSC_VER) /* Visual */
unsigned long r=0;
return _BitScanReverse ( &r, val ) ? (unsigned)r : 0;
# if STATIC_BMI2 == 1
return _lzcnt_u32(val) ^ 31;
# else
unsigned long r = 0;
return _BitScanReverse(&r, val) ? (unsigned)r : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */
return __builtin_clz (val) ^ 31;
# elif defined(__ICCARM__) /* IAR Intrinsic */
Expand Down Expand Up @@ -317,23 +320,27 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
return srcSize;
}

/* BIT_getUpperBits() :
 * Discards the `start` lowest bits of bitContainer with a plain right shift.
 * No masking is applied to `start` here, so the caller must keep it below the
 * bit width of size_t (shifting by the full width or more is undefined in C).
 * @return : bitContainer >> start */
MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
{
return bitContainer >> start;
}

/* BIT_getMiddleBits() :
 * Shifts bitContainer right by `start`, then masks the result with BIT_mask[nbBits].
 * The shift count is reduced with `& regMask` (bit width of size_t, minus 1),
 * so the shift itself always stays below the register width.
 * nbBits must be < BIT_MASK_SIZE (checked by assert only).
 * @return : (bitContainer >> start) & BIT_mask[nbBits] */
MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
{
/* highest valid shift count for a size_t */
U32 const regMask = sizeof(bitContainer)*8 - 1;
/* if start > regMask, bitstream is corrupted, and result is undefined */
assert(nbBits < BIT_MASK_SIZE);
return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
}

/* BIT_getLowerBits() :
 * Keeps the low part of bitContainer selected by nbBits.
 * Two compile-time variants :
 * - STATIC_BMI2 non-zero : uses the _bzhi_u64() intrinsic;
 *   the nbBits range assert is not compiled in on this path.
 * - otherwise : masks with BIT_mask[nbBits], which requires nbBits < BIT_MASK_SIZE.
 * NOTE(review): BIT_mask is defined outside this view; presumably BIT_mask[n]
 * has the n lowest bits set, making both variants equivalent - confirm. */
MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
{
#if STATIC_BMI2
/* 64-bit intrinsic applied to a size_t value */
return _bzhi_u64(bitContainer, nbBits);
#else
assert(nbBits < BIT_MASK_SIZE);
return bitContainer & BIT_mask[nbBits];
#endif
}

/*! BIT_lookBits() :
Expand All @@ -342,7 +349,7 @@ MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
* On 32-bits, maxNbBits==24.
* On 64-bits, maxNbBits==56.
* @return : value extracted */
MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
{
/* arbitrate between double-shift and shift+mask */
#if 1
Expand All @@ -365,7 +372,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
}

/* BIT_skipBits() :
 * Marks nbBits as consumed by adding nbBits to bitD->bitsConsumed.
 * Nothing else in bitD is touched, and no bound check is performed here. */
MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
{
bitD->bitsConsumed += nbBits;
}
Expand All @@ -374,7 +381,7 @@ MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
* Read (consume) next n bits from local register and update.
* Pay attention to not read more than nbBits contained into local register.
* @return : extracted value. */
MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
{
size_t const value = BIT_lookBits(bitD, nbBits);
BIT_skipBits(bitD, nbBits);
Expand Down
13 changes: 13 additions & 0 deletions lib/common/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,4 +183,17 @@
# pragma warning(disable : 4324) /* disable: C4324: padded structure */
#endif

/* STATIC_BMI2 :
 * Like DYNAMIC_BMI2, but BMI2 support is determined at compile time.
 * A value already defined by the build is left untouched.
 * Otherwise it is set to 1 when compiling with MSVC for a target that defines
 * _M_X64 or _M_I86 and __AVX2__ is defined; in every other case it ends up as 0.
 * NOTE(review): _M_I86 does not appear among the predefined macros of current MSVC
 * (32-bit x86 is _M_IX86) - confirm which one was intended before relying on
 * this for 32-bit builds. */
#ifndef STATIC_BMI2
# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))
# ifdef __AVX2__ /* MSVC does not have a BMI2-specific flag, but every CPU that supports AVX2 also supports BMI2 */
# define STATIC_BMI2 1
# endif
# endif
#endif

/* Fallback : guarantee the macro is always defined, so it can be tested with a plain #if */
#ifndef STATIC_BMI2
#define STATIC_BMI2 0
#endif

#endif /* ZSTD_COMPILER_H */
8 changes: 6 additions & 2 deletions lib/common/zstd_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,8 +394,12 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus
assert(val != 0);
{
# if defined(_MSC_VER) /* Visual */
unsigned long r=0;
return _BitScanReverse(&r, val) ? (unsigned)r : 0;
# if STATIC_BMI2 == 1
return _lzcnt_u32(val)^31;
# else
unsigned long r=0;
return _BitScanReverse(&r, val) ? (unsigned)r : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */
return __builtin_clz (val) ^ 31;
# elif defined(__ICCARM__) /* IAR Intrinsic */
Expand Down
16 changes: 12 additions & 4 deletions lib/compress/zstd_compress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,12 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
if (MEM_isLittleEndian()) {
if (MEM_64bits()) {
# if defined(_MSC_VER) && defined(_WIN64)
unsigned long r = 0;
return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
# if STATIC_BMI2
return _tzcnt_u64(val) >> 3;
# else
unsigned long r = 0;
return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 4)
return (__builtin_ctzll((U64)val) >> 3);
# else
Expand Down Expand Up @@ -530,8 +534,12 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
} else { /* Big Endian CPU */
if (MEM_64bits()) {
# if defined(_MSC_VER) && defined(_WIN64)
unsigned long r = 0;
return _BitScanReverse64( &r, val ) ? (unsigned)(r >> 3) : 0;
# if STATIC_BMI2
return _lzcnt_u64(val) >> 3;
# else
unsigned long r = 0;
return _BitScanReverse64(&r, (U64)val) ? (unsigned)(r >> 3) : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 4)
return (__builtin_clzll(val) >> 3);
# else
Expand Down