Fix AVX2 build on Windows (#3238)
Summary:
Tested with both MSVC (with /openmp:llvm) and clang-cl (no particular extra flags needed). This PR is split into two commits (three, after I found out that lines need to be 80 characters or fewer):

1. Changes needed for clang-cl (and probably stock clang too)
2. Changes needed for MSVC

So FAISS can decide either to require LLVM for Windows builds (not a hard thing to ask these days, since it is fully supported inside Visual Studio) and discard the second commit, or to take all the commits and document the need to use /openmp:llvm.
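For context on the prefetch casts below: GCC and Clang expose _mm_prefetch as a macro that accepts any pointer type, while MSVC's <intrin.h> declares it as void _mm_prefetch(char const*, int), so passing a float* or uint64_t* fails to compile. A minimal illustration (hypothetical snippet, not taken from this PR):

    #include <intrin.h>

    void warm_cache(const float* p) {
        // Rejected by MSVC with a conversion error (C2664):
        // _mm_prefetch(p, _MM_HINT_T0);

        // Compiles everywhere: cast to const char*, as this PR does
        // at each call site.
        _mm_prefetch((const char*)p, _MM_HINT_T0);
    }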

Closes #3193

Pull Request resolved: #3238

Reviewed By: mdouze

Differential Revision: D53479325

Pulled By: algoriddle

fbshipit-source-id: e8628f44626b6f49c5d9d7f259a9e3061cfe5568
borrrden authored and facebook-github-bot committed Feb 15, 2024
1 parent c577f43 commit 87d43b9
Showing 5 changed files with 26 additions and 15 deletions.
4 changes: 3 additions & 1 deletion faiss/impl/LocalSearchQuantizer.cpp
@@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step(
                 {
                     size_t binary_idx = (other_m + 1) * M * K * K +
                             m * K * K + code2 * K + code;
-                    _mm_prefetch(binaries + binary_idx, _MM_HINT_T0);
+                    _mm_prefetch(
+                            (const char*)(binaries + binary_idx),
+                            _MM_HINT_T0);
                 }
             }
 #endif
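The multi-line call above is just the original prefetch with its argument cast to const char*, wrapped to stay within the 80-character limit. An alternative would be a small helper that centralizes the cast; a sketch of that design (hypothetical, not something this PR introduces):

    // Hypothetical helper; FAISS does not define this.
    inline void prefetch_t0(const void* p) {
        _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
    }

    // Usage at the call site above would then be:
    // prefetch_t0(binaries + binary_idx);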
9 changes: 9 additions & 0 deletions faiss/impl/platform_macros.h
@@ -40,11 +40,13 @@
 
 #include <intrin.h>
 
+#ifndef __clang__
 inline int __builtin_ctzll(uint64_t x) {
     unsigned long ret;
     _BitScanForward64(&ret, x);
     return (int)ret;
 }
+#endif
 
 // cudatoolkit provides __builtin_ctz for NVCC >= 11.0
 #if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
@@ -55,13 +57,20 @@ inline int __builtin_ctz(unsigned long x) {
 }
 #endif
 
+#ifndef __clang__
 inline int __builtin_clzll(uint64_t x) {
     return (int)__lzcnt64(x);
 }
+#endif
 
 #define __builtin_popcount __popcnt
 #define __builtin_popcountl __popcnt64
 
+#ifndef __clang__
+#define __m128i_u __m128i
+#define __m256i_u __m256i
+#endif
+
 // MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit
 // processors cf.
 // https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
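The #ifndef __clang__ guards are the clang-cl part of the fix: clang already provides __builtin_ctzll and __builtin_clzll as compiler builtins, and its headers define the unaligned vector types __m128i_u/__m256i_u, so redeclaring them breaks the build; plain MSVC has neither, so it still needs the shims. With the guards in place, the same call compiles under both toolchains. A small usage sketch, assuming platform_macros.h is on the include path:

    #include <cstdint>

    #include <faiss/impl/platform_macros.h>

    int trailing_zeros(uint64_t v) {
        // MSVC: resolves to the _BitScanForward64 shim above.
        // clang-cl (and GCC/Clang): resolves to the compiler builtin.
        return __builtin_ctzll(v); // undefined for v == 0, as usual
    }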
8 changes: 4 additions & 4 deletions faiss/utils/distances.cpp
@@ -417,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
         for (int64_t i = i0; i < i1; i++) {
             float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);
 
-            _mm_prefetch(ip_line, _MM_HINT_NTA);
-            _mm_prefetch(ip_line + 16, _MM_HINT_NTA);
+            _mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
+            _mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);
 
             // constant
             const __m256 mul_minus2 = _mm256_set1_ps(-2);
@@ -445,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
 
             // process 16 elements per loop
             for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
-                _mm_prefetch(ip_line + 32, _MM_HINT_NTA);
-                _mm_prefetch(ip_line + 48, _MM_HINT_NTA);
+                _mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
+                _mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);
 
                 // load values for norms
                 const __m256 y_norm_0 =
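Aside from the cast, note the prefetch pattern these hunks preserve: each iteration consumes 16 floats (one 64-byte cache line) and prefetches two and three lines ahead with _MM_HINT_NTA, since each ip_line is read only once and should not pollute the cache. Schematically (a simplified sketch of the pattern, not the actual kernel):

    // p walks over float data, 16 elements (one cache line) per step.
    for (size_t j = 0; j + 16 <= n; j += 16, p += 16) {
        _mm_prefetch((const char*)(p + 32), _MM_HINT_NTA); // 2 lines ahead
        _mm_prefetch((const char*)(p + 48), _MM_HINT_NTA); // 3 lines ahead
        // ... process p[0..15] with AVX2 loads ...
    }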
2 changes: 1 addition & 1 deletion faiss/utils/distances_fused/simdlib_based.cpp
@@ -73,7 +73,7 @@ void kernel(
 
     // prefetch the next point
 #if defined(__AVX2__)
-    _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
+    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
 #endif
 
     // load a single point from x
18 changes: 9 additions & 9 deletions faiss/utils/distances_simd.cpp
@@ -439,14 +439,14 @@ void fvec_op_ny_D2<ElementOpIP>(
 
     if (ny8 > 0) {
         // process 8 D2-vectors per loop.
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         const __m256 m0 = _mm256_set1_ps(x[0]);
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (i = 0; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             // load 8x2 matrix and transpose it in registers.
             // the typical bottleneck is memory access, so
@@ -496,14 +496,14 @@ void fvec_op_ny_D2<ElementOpL2>(
 
     if (ny8 > 0) {
         // process 8 D2-vectors per loop.
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         const __m256 m0 = _mm256_set1_ps(x[0]);
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (i = 0; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             // load 8x2 matrix and transpose it in registers.
             // the typical bottleneck is memory access, so
@@ -1084,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
     // process 8 D2-vectors per loop.
     const size_t ny8 = ny / 8;
     if (ny8 > 0) {
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         // track min distance and the closest vector independently
         // for each of 8 AVX2 components.
@@ -1100,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2(
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             __m256 v0;
             __m256 v1;
