Generalize the feature transform to use vec_t macros
This commit generalizes the feature transform to use architecture-defined
vec_t macros instead of a separate code path for each architecture.

It should make some older architectures faster (MMX, including the improvements by Fanael)
and make further such improvements easier in the future.

Includes some corrections to the CI setup for mingw.

closes official-stockfish#3955
closes official-stockfish#3928

No functional change
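
For illustration only (not part of the commit): a minimal sketch of the pattern this change adopts. Each architecture defines the same small macro vocabulary once, and the hot loop is then written a single time against it. The macro names mirror those in src/nnue/nnue_feature_transformer.h, but the loop itself is a simplified stand-in, not Stockfish code.

    // Sketch: two architectures supplying the same vec_t vocabulary.
    #if defined(USE_AVX2)
        #include <immintrin.h>
        typedef __m256i vec_t;
        #define vec_load(a)     _mm256_load_si256(a)
        #define vec_store(a,b)  _mm256_store_si256(a,b)
        #define vec_add_16(a,b) _mm256_add_epi16(a,b)
    #elif defined(USE_SSE2)
        #include <emmintrin.h>
        typedef __m128i vec_t;
        #define vec_load(a)     (*(a))
        #define vec_store(a,b)  *(a)=(b)
        #define vec_add_16(a,b) _mm_add_epi16(a,b)
    #endif

    // Written once; compiles to AVX2 or SSE2 depending on the macros above.
    inline void add_row(vec_t* acc, const vec_t* row, int numChunks) {
        for (int j = 0; j < numChunks; ++j)
            vec_store(&acc[j], vec_add_16(vec_load(&acc[j]), row[j]));
    }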
Chaneybenjamini authored and dav1312 committed Oct 21, 2022
1 parent 7b0ebc6 commit 166f28f
Showing 3 changed files with 78 additions and 100 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/stockfish.yml
@@ -82,20 +82,20 @@ jobs:
name: "Windows 2022 Mingw-w64 GCC x86_64",
os: windows-2022,
compiler: g++,
- comp: gcc,
+ comp: mingw,
run_64bit_tests: true,
msys_sys: 'mingw64',
- msys_env: 'x86_64',
+ msys_env: 'x86_64-gcc',
shell: 'msys2 {0}'
}
- {
name: "Windows 2022 Mingw-w64 GCC i686",
os: windows-2022,
compiler: g++,
- comp: gcc,
+ comp: mingw,
run_32bit_tests: true,
msys_sys: 'mingw32',
- msys_env: 'i686',
+ msys_env: 'i686-gcc',
shell: 'msys2 {0}'
}
- {
@@ -105,7 +105,7 @@ jobs:
comp: clang,
run_64bit_tests: true,
msys_sys: 'clang64',
- msys_env: 'clang-x86_64',
+ msys_env: 'clang-x86_64-clang',
shell: 'msys2 {0}'
}

@@ -129,7 +129,7 @@ jobs:
uses: msys2/setup-msys2@v2
with:
msystem: ${{matrix.config.msys_sys}}
- install: mingw-w64-${{matrix.config.msys_env}}-${{matrix.config.comp}} make git expect
+ install: mingw-w64-${{matrix.config.msys_env}} make git expect

- name: Download the used network from the fishtest framework
run: |
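A note on the workflow change above, inferred from the diff: the msys2 package name was previously assembled as mingw-w64-${{msys_env}}-${{comp}}, which forced comp to double as the package suffix. Folding the suffix into msys_env keeps the installed package identical while freeing comp to carry the value the build step expects (COMP=mingw rather than COMP=gcc on these runners):

    before: mingw-w64- + 'x86_64' + '-' + 'gcc'  ->  mingw-w64-x86_64-gcc   (comp: gcc)
    after:  mingw-w64- + 'x86_64-gcc'            ->  mingw-w64-x86_64-gcc   (comp: mingw)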
1 change: 1 addition & 0 deletions AUTHORS
@@ -31,6 +31,7 @@ Arjun Temurnikar
Artem Solopiy (EntityFX)
Auguste Pop
Balint Pfliegel
+ Ben Chaney (Chaneybenjamini)
Ben Koshy (BKSpurgeon)
Bill Henry (VoyagerOne)
Bojun Guo (noobpwnftw, Nooby)
165 changes: 71 additions & 94 deletions src/nnue/nnue_feature_transformer.h
@@ -47,12 +47,22 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+ #define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
+ #define vec_zero() _mm512_setzero_epi32()
+ #define vec_set_16(a) _mm512_set1_epi16(a)
+ #define vec_max_16(a,b) _mm512_max_epi16(a,b)
+ #define vec_min_16(a,b) _mm512_min_epi16(a,b)
+ inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+ vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a,7),_mm512_srli_epi16(b,7));
+ return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
+ }
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
#define NumRegistersSIMD 32
+ #define MaxChunkSize 64

#elif USE_AVX2
typedef __m256i vec_t;
@@ -61,12 +71,22 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+ #define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
+ #define vec_zero() _mm256_setzero_si256()
+ #define vec_set_16(a) _mm256_set1_epi16(a)
+ #define vec_max_16(a,b) _mm256_max_epi16(a,b)
+ #define vec_min_16(a,b) _mm256_min_epi16(a,b)
+ inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+ vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a,7), _mm256_srli_epi16(b,7));
+ return _mm256_permute4x64_epi64(compacted, 0b11011000);
+ }
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
#define NumRegistersSIMD 16
+ #define MaxChunkSize 32

#elif USE_SSE2
typedef __m128i vec_t;
@@ -75,12 +95,19 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+ #define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
+ #define vec_zero() _mm_setzero_si128()
+ #define vec_set_16(a) _mm_set1_epi16(a)
+ #define vec_max_16(a,b) _mm_max_epi16(a,b)
+ #define vec_min_16(a,b) _mm_min_epi16(a,b)
+ #define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
#define vec_zero_psqt() _mm_setzero_si128()
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
+ #define MaxChunkSize 16

#elif USE_MMX
typedef __m64 vec_t;
@@ -89,12 +116,26 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+ #define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
+ #define vec_zero() _mm_setzero_si64()
+ #define vec_set_16(a) _mm_set1_pi16(a)
+ inline vec_t vec_max_16(vec_t a,vec_t b){
+ vec_t comparison = _mm_cmpgt_pi16(a,b);
+ return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
+ }
+ inline vec_t vec_min_16(vec_t a,vec_t b){
+ vec_t comparison = _mm_cmpgt_pi16(a,b);
+ return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
+ }
+ #define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
#define vec_zero_psqt() _mm_setzero_si64()
+ #define vec_cleanup() _mm_empty()
#define NumRegistersSIMD 8
+ #define MaxChunkSize 8

#elif USE_NEON
typedef int16x8_t vec_t;
@@ -103,12 +144,24 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
+ #define vec_mul_16(a,b) vmulq_s16(a,b)
+ #define vec_zero() vec_t{0}
+ #define vec_set_16(a) vdupq_n_s16(a)
+ #define vec_max_16(a,b) vmaxq_s16(a,b)
+ #define vec_min_16(a,b) vminq_s16(a,b)
+ inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+ const int8x8_t shifta = vshrn_n_s16(a, 7);
+ const int8x8_t shiftb = vshrn_n_s16(b, 7);
+ const int8x16_t compacted = vcombine_s8(shifta,shiftb);
+ return *reinterpret_cast<const vec_t*> (&compacted);
+ }
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
#define vec_zero_psqt() psqt_vec_t{0}
#define NumRegistersSIMD 16
+ #define MaxChunkSize 16

#else
#undef VECTOR
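
An aside on the MMX block above: base MMX has no packed 16-bit max/min (pmaxsw/pminsw arrived with the SSE extensions), so vec_max_16 and vec_min_16 are emulated with a compare plus a branchless mask select. A hypothetical scalar model of one 16-bit lane, for illustration only:

    #include <cstdint>

    // Scalar model of the mask-select idiom in the MMX vec_max_16 above:
    // mask is all-ones where a > b (_mm_cmpgt_pi16); (mask & a) keeps a there,
    // (~mask & b) keeps b elsewhere (_mm_andnot_si64), and OR merges the two.
    std::int16_t max16_branchless(std::int16_t a, std::int16_t b) {
        const std::uint16_t mask = (a > b) ? 0xFFFFu : 0u;
        const std::uint16_t ua = static_cast<std::uint16_t>(a);
        const std::uint16_t ub = static_cast<std::uint16_t>(b);
        return static_cast<std::int16_t>((mask & ua) | (~mask & ub));
    }

vec_min_16 is the same select with the two operands swapped.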
@@ -235,110 +288,30 @@ namespace Stockfish::Eval::NNUE {
{
const IndexType offset = (HalfDimensions / 2) * p;

- #if defined(USE_AVX512)
+ #if defined(VECTOR)

- constexpr IndexType OutputChunkSize = 512 / 8;
+ constexpr IndexType OutputChunkSize = MaxChunkSize;
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;

- const __m512i Zero = _mm512_setzero_si512();
- const __m512i One = _mm512_set1_epi16(127);
- const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+ vec_t Zero = vec_zero();
+ vec_t One = vec_set_16(127);

- const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
- const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m512i* out = reinterpret_cast< __m512i*>(output + offset);
+ const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
+ const vec_t* in1 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+ vec_t* out = reinterpret_cast< vec_t*>(output + offset);

for (IndexType j = 0; j < NumOutputChunks; j += 1)
{
- const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
+ const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero);
+ const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero);
+ const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero);
+ const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero);

- const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
- const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
+ const vec_t pa = vec_mul_16(sum0a, sum1a);
+ const vec_t pb = vec_mul_16(sum0b, sum1b);

- out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
- }
-
- #elif defined(USE_AVX2)
-
- constexpr IndexType OutputChunkSize = 256 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m256i Zero = _mm256_setzero_si256();
- const __m256i One = _mm256_set1_epi16(127);
- constexpr int Control = 0b11011000;
-
- const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
- const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m256i* out = reinterpret_cast< __m256i*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
-
- const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
- const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
-
- out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
- }
-
- #elif defined(USE_SSE2)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m128i Zero = _mm_setzero_si128();
- const __m128i One = _mm_set1_epi16(127);
-
- const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
- const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m128i* out = reinterpret_cast< __m128i*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
-
- const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
- const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
-
- out[j] = _mm_packs_epi16(pa, pb);
- }
-
- #elif defined(USE_NEON)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const int16x8_t Zero = vdupq_n_s16(0);
- const int16x8_t One = vdupq_n_s16(127);
-
- const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
- const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- int8x16_t* out = reinterpret_cast< int8x16_t*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
- const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
- const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
- const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
-
- const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
- const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
-
- out[j] = vcombine_s8(pa, pb);
+ out[j] = vec_msb_pack_16(pa, pb);
}

#else
@@ -354,6 +327,10 @@ namespace Stockfish::Eval::NNUE {
#endif
}

+ #if defined(vec_cleanup)
+ vec_cleanup();
+ #endif
+
return psqt;

} // end of function transform()
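To make the generalized output transform concrete, here is a hypothetical scalar reference of what each output byte computes (the vector code does the same across a whole vec_t at a time): clamp both accumulator halves to [0, 127], multiply, then keep the high bits via the shift by 7 that vec_msb_pack_16 applies before packing to int8. Since (127 * 127) >> 7 = 126, the saturating pack never actually clips.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Illustrative only -- scalar equivalent of the vectorized loop above.
    void transform_reference(const std::int16_t* in0, const std::int16_t* in1,
                             std::int8_t* out, std::size_t n) {
        for (std::size_t i = 0; i < n; ++i) {
            const int a = std::clamp<int>(in0[i], 0, 127);
            const int b = std::clamp<int>(in1[i], 0, 127);
            out[i] = static_cast<std::int8_t>((a * b) >> 7); // always in [0, 126]
        }
    }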
