Add support for aarch64 advanced simd to volk.

Add new archs neonv7 and neonv8 to switch on assembly language neon code. Use the existing neon arch for intrinsics, since intrinics work for both neon and advanced simd. Signed-off-by: Philip Balister <philip@balister.org>
gnuradio · Apr 27, 2018 · e98e927 · xloem · Oct 8, 2018 · e98e927
1 parent 297fefd
commit e98e927
Show file tree

Hide file tree

Showing 9 changed files with 119 additions and 25 deletions.
diff --git a/gen/archs.xml b/gen/archs.xml
@@ -13,12 +13,24 @@
 </arch>
 
 <arch name="neon">
-  <flag compiler="gnu">-mfpu=neon</flag>
   <flag compiler="gnu">-funsafe-math-optimizations</flag>
   <alignment>16</alignment>
   <check name="has_neon"></check>
 </arch>
 
+<arch name="neonv7">
+  <flag compiler="gnu">-mfpu=neon</flag>
+  <flag compiler="gnu">-funsafe-math-optimizations</flag>
+  <alignment>16</alignment>
+  <check name="has_neonv7"></check>
+</arch>
+
+<arch name="neonv8">
+  <flag compiler="gnu">-funsafe-math-optimizations</flag>
+  <alignment>16</alignment>
+  <check name="has_neonv8"></check>
+</arch>
+
 <arch name="32">
   <flag compiler="gnu">-m32</flag>
 </arch>

diff --git a/gen/machines.xml b/gen/machines.xml
@@ -5,7 +5,15 @@
 </machine>
 
 <machine name="neon">
-<archs>generic neon softfp|hardfp orc|</archs>
+<archs>generic neon orc|</archs>
+</machine>
+
+<machine name="neonv7">
+<archs>generic neon neonv7 softfp|hardfp orc|</archs>
+</machine>
+
+<machine name="neonv8">
+<archs>generic neon neonv8</archs>
 </machine>
 
 <!-- trailing | bar means generate without either for MSVC -->

diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -189,9 +189,9 @@ volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned i
 }
 #endif /* LV_HAVE_NEON */
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
-#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV7 */
 
 #ifdef LV_HAVE_GENERIC
 static inline void

diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h
@@ -249,13 +249,13 @@ volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
 
 #endif /* LV_HAVE_NEON */
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV7 */
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV7 */
 
 #ifdef LV_HAVE_GENERIC
 

diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -773,12 +773,12 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i
 
 #endif /* LV_HAVE_NEON */
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV7 */
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV7 */
 
 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -489,17 +489,17 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict re
 
 #endif /*LV_HAVE_NEON*/
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points);
-#endif /*LV_HAVE_NEON*/
+#endif /*LV_HAVE_NEONV7*/
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points);
-#endif /*LV_HAVE_NEON*/
+#endif /*LV_HAVE_NEONV7*/
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points);
-#endif /*LV_HAVE_NEON*/
+#endif /*LV_HAVE_NEONV7*/
 
 #ifdef LV_HAVE_SSE
 

diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -446,12 +446,12 @@ volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aV
 #endif /* LV_HAVE_NEON */
 
 
-#ifdef LV_HAVE_NEON
+#ifdef LV_HAVE_NEONV7
 
 extern void
 volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                    const lv_32fc_t* bVector, unsigned int num_points);
-#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV7 */
 
 
 #ifdef LV_HAVE_ORC

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
@@ -249,6 +249,36 @@ if(NOT CPU_IS_x86)
     OVERRULE_ARCH(avx "Architecture is not x86 or x86_64")
 endif(NOT CPU_IS_x86)
 
+########################################################################
+# Select neon based on ARM ISA version
+########################################################################
+
+# First, compile a test program to see if compiler supports neon.
+
+include(CheckCSourceCompiles)
+
+check_c_source_compiles("#include <arm_neon.h>\nint main(){ uint8_t *dest; uint8x8_t res; vst1_u8(dest, res); }"
+	                neon_compile_result)
+
+if(neon_compile_result)
+    check_c_source_compiles("int main(){asm volatile(\"vrev32.8 q0, q0\");}"
+                            have_neonv7_result )
+    check_c_source_compiles("int main(){asm volatile(\"sub v1.4s,v1.4s,v1.4s\");}"
+                            have_neonv8_result )
+
+    if (have_neonv7_result)
+        OVERRULE_ARCH(neonv8 "CPU is armv7")
+    endif()
+
+    if (have_neonv8_result)
+        OVERRULE_ARCH(neonv7 "CPU is armv8")
+    endif()
+else(neon_compile_result)
+    OVERRULE_ARCH(neon "Compiler doesn't support NEON")
+    OVERRULE_ARCH(neonv7 "Compiler doesn't support NEON")
+    OVERRULE_ARCH(neonv8 "Compiler doesn't support NEON")
+endif(neon_compile_result)
+
 ########################################################################
 # implement overruling in the ORC case,
 # since ORC always passes flag detection
@@ -405,7 +435,7 @@ string(REPLACE "\n" " \\n" COMPILER_INFO ${COMPILER_INFO})
 #  on by default, but let users turn it off
 ########################################################################
 if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
-  set(ASM_ARCHS_AVAILABLE "neon")
+  set(ASM_ARCHS_AVAILABLE "neonv7" "neonv8")
 
   set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}")
 
@@ -414,7 +444,7 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
   # set up the assembler flags and include the source files
   foreach(ARCH ${ASM_ARCHS_AVAILABLE})
       string(REGEX MATCH "${ARCH}" ASM_ARCH "${available_archs}")
-    if( ASM_ARCH STREQUAL "neon" )
+    if( ASM_ARCH STREQUAL "neonv7" )
       message(STATUS "---- Adding ASM files") # we always use ATT syntax
       message(STATUS "-- Detected neon architecture; enabling ASM")
       # setup architecture specific assembler flags

diff --git a/tmpl/volk_cpu.tmpl.c b/tmpl/volk_cpu.tmpl.c
@@ -124,11 +124,11 @@ static inline unsigned int get_avx2_enabled(void) {
     #include <asm/hwcap.h>
     #include <linux/auxvec.h>
     #include <stdio.h>
-    #define VOLK_CPU_ARM
+    #define VOLK_CPU_ARMV7
 #endif
 
-static int has_neon(void){
-#if defined(VOLK_CPU_ARM)
+static int has_neonv7(void){
+#if defined(VOLK_CPU_ARMV7)
     FILE *auxvec_f;
     unsigned long auxvec[2];
     unsigned int found_neon = 0;
@@ -151,6 +151,50 @@ static int has_neon(void){
 #endif
 }
 
+//\todo: Fix this to really check for neon on aarch64
+//neon detection is linux specific
+#if defined(__aarch64__) && defined(__linux__)
+    #include <asm/hwcap.h>
+    #include <linux/auxvec.h>
+    #include <stdio.h>
+    #define VOLK_CPU_ARMV8
+#endif
+
+static int has_neonv8(void){
+#if defined(VOLK_CPU_ARMV8)
+    FILE *auxvec_f;
+    unsigned long auxvec[2];
+    unsigned int found_neon = 0;
+    auxvec_f = fopen("/proc/self/auxv", "rb");
+    if(!auxvec_f) return 0;
+
+    size_t r = 1;
+    //so auxv is basically 32b of ID and 32b of value
+    //so it goes like this
+    while(!found_neon && r) {
+      r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
+      if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_ASIMD))
+        found_neon = 1;
+    }
+
+    fclose(auxvec_f);
+    return found_neon;
+#else
+    return 0;
+#endif
+}
+
+static int has_neon(void){
+#if defined(VOLK_CPU_ARMV8) || defined(VOLK_CPU_ARMV7)
+    if (has_neonv7() || has_neonv8())
+        return 1;
+    else
+	return 0;
+#else
+    return 0;
+#endif
+}
+
 %for arch in archs:
 static int i_can_has_${arch.name} (void) {
     %for check, params in arch.checks: