Skip to content

Commit

Permalink
Add support for aarch64 advanced simd to volk.
Browse files Browse the repository at this point in the history
Add new archs neonv7 and neonv8 to switch on assembly language neon
code. Use the existing neon arch for intrinsics, since intrinics
work for both neon and advanced simd.

Signed-off-by: Philip Balister <philip@balister.org>
  • Loading branch information
balister committed Apr 27, 2018
1 parent 297fefd commit e98e927
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 25 deletions.
14 changes: 13 additions & 1 deletion gen/archs.xml
Expand Up @@ -13,12 +13,24 @@
</arch>

<arch name="neon">
<flag compiler="gnu">-mfpu=neon</flag>
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="has_neon"></check>
</arch>

<arch name="neonv7">
<flag compiler="gnu">-mfpu=neon</flag>
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="has_neonv7"></check>
</arch>

<arch name="neonv8">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="has_neonv8"></check>
</arch>

<arch name="32">
<flag compiler="gnu">-m32</flag>
</arch>
Expand Down
10 changes: 9 additions & 1 deletion gen/machines.xml
Expand Up @@ -5,7 +5,15 @@
</machine>

<machine name="neon">
<archs>generic neon softfp|hardfp orc|</archs>
<archs>generic neon orc|</archs>
</machine>

<machine name="neonv7">
<archs>generic neon neonv7 softfp|hardfp orc|</archs>
</machine>

<machine name="neonv8">
<archs>generic neon neonv8</archs>
</machine>

<!-- trailing | bar means generate without either for MSVC -->
Expand Down
4 changes: 2 additions & 2 deletions kernels/volk/volk_16i_max_star_horizontal_16i.h
Expand Up @@ -189,9 +189,9 @@ volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned i
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEONV7 */

#ifdef LV_HAVE_GENERIC
static inline void
Expand Down
8 changes: 4 additions & 4 deletions kernels/volk/volk_32f_x2_add_32f.h
Expand Up @@ -249,13 +249,13 @@ volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,

#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEONV7 */

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEONV7 */

#ifdef LV_HAVE_GENERIC

Expand Down
8 changes: 4 additions & 4 deletions kernels/volk/volk_32f_x2_dot_prod_32f.h
Expand Up @@ -773,12 +773,12 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i

#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEONV7 */

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEONV7 */

#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
12 changes: 6 additions & 6 deletions kernels/volk/volk_32fc_32f_dot_prod_32fc.h
Expand Up @@ -489,17 +489,17 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict re

#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
#endif /*LV_HAVE_NEON*/
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
#endif /*LV_HAVE_NEON*/
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7
extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
#endif /*LV_HAVE_NEON*/
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_SSE

Expand Down
4 changes: 2 additions & 2 deletions kernels/volk/volk_32fc_x2_multiply_32fc.h
Expand Up @@ -446,12 +446,12 @@ volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aV
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#ifdef LV_HAVE_NEONV7

extern void
volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
const lv_32fc_t* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEONV7 */


#ifdef LV_HAVE_ORC
Expand Down
34 changes: 32 additions & 2 deletions lib/CMakeLists.txt
Expand Up @@ -249,6 +249,36 @@ if(NOT CPU_IS_x86)
OVERRULE_ARCH(avx "Architecture is not x86 or x86_64")
endif(NOT CPU_IS_x86)

########################################################################
# Select neon based on ARM ISA version
########################################################################

# First, compile a test program to see if compiler supports neon.

include(CheckCSourceCompiles)

check_c_source_compiles("#include <arm_neon.h>\nint main(){ uint8_t *dest; uint8x8_t res; vst1_u8(dest, res); }"

This comment has been minimized.

Copy link
@xloem

xloem Oct 8, 2018

Contributor

This check always fails for me, issue #205

neon_compile_result)

if(neon_compile_result)
check_c_source_compiles("int main(){asm volatile(\"vrev32.8 q0, q0\");}"
have_neonv7_result )
check_c_source_compiles("int main(){asm volatile(\"sub v1.4s,v1.4s,v1.4s\");}"
have_neonv8_result )

if (have_neonv7_result)
OVERRULE_ARCH(neonv8 "CPU is armv7")
endif()

if (have_neonv8_result)
OVERRULE_ARCH(neonv7 "CPU is armv8")
endif()
else(neon_compile_result)
OVERRULE_ARCH(neon "Compiler doesn't support NEON")
OVERRULE_ARCH(neonv7 "Compiler doesn't support NEON")
OVERRULE_ARCH(neonv8 "Compiler doesn't support NEON")
endif(neon_compile_result)

########################################################################
# implement overruling in the ORC case,
# since ORC always passes flag detection
Expand Down Expand Up @@ -405,7 +435,7 @@ string(REPLACE "\n" " \\n" COMPILER_INFO ${COMPILER_INFO})
# on by default, but let users turn it off
########################################################################
if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
set(ASM_ARCHS_AVAILABLE "neon")
set(ASM_ARCHS_AVAILABLE "neonv7" "neonv8")

set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}")

Expand All @@ -414,7 +444,7 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
# set up the assembler flags and include the source files
foreach(ARCH ${ASM_ARCHS_AVAILABLE})
string(REGEX MATCH "${ARCH}" ASM_ARCH "${available_archs}")
if( ASM_ARCH STREQUAL "neon" )
if( ASM_ARCH STREQUAL "neonv7" )
message(STATUS "---- Adding ASM files") # we always use ATT syntax
message(STATUS "-- Detected neon architecture; enabling ASM")
# setup architecture specific assembler flags
Expand Down
50 changes: 47 additions & 3 deletions tmpl/volk_cpu.tmpl.c
Expand Up @@ -124,11 +124,11 @@ static inline unsigned int get_avx2_enabled(void) {
#include <asm/hwcap.h>
#include <linux/auxvec.h>
#include <stdio.h>
#define VOLK_CPU_ARM
#define VOLK_CPU_ARMV7
#endif

static int has_neon(void){
#if defined(VOLK_CPU_ARM)
static int has_neonv7(void){
#if defined(VOLK_CPU_ARMV7)
FILE *auxvec_f;
unsigned long auxvec[2];
unsigned int found_neon = 0;
Expand All @@ -151,6 +151,50 @@ static int has_neon(void){
#endif
}

//\todo: Fix this to really check for neon on aarch64
//neon detection is linux specific
#if defined(__aarch64__) && defined(__linux__)
#include <asm/hwcap.h>
#include <linux/auxvec.h>
#include <stdio.h>
#define VOLK_CPU_ARMV8
#endif

static int has_neonv8(void){
#if defined(VOLK_CPU_ARMV8)
FILE *auxvec_f;
unsigned long auxvec[2];
unsigned int found_neon = 0;
auxvec_f = fopen("/proc/self/auxv", "rb");
if(!auxvec_f) return 0;

size_t r = 1;
//so auxv is basically 32b of ID and 32b of value
//so it goes like this
while(!found_neon && r) {
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_ASIMD))
found_neon = 1;
}

fclose(auxvec_f);
return found_neon;
#else
return 0;
#endif
}

static int has_neon(void){
#if defined(VOLK_CPU_ARMV8) || defined(VOLK_CPU_ARMV7)
if (has_neonv7() || has_neonv8())
return 1;
else
return 0;
#else
return 0;
#endif
}

%for arch in archs:
static int i_can_has_${arch.name} (void) {
%for check, params in arch.checks:
Expand Down

0 comments on commit e98e927

Please sign in to comment.