Add RISC-V target #693

angsch · 2022-11-20T16:48:22Z

Add the infrastructure for RISC-V and an optimized SGEMM kernel assuming VLEN=128.

The config is intentionally set up to be as generic as possible. The code can be run with the GNU RISC-V toolchain (compiler + qemu) available as Ubuntu packages.

The vector extension for application cores supports various vector lengths starting from 128 bits. The assembly kernel assumes VLEN=128, which is, for example, used in P670 or P470. If the vector length is not 128, a fallback is needed. For the time being, this is the generic implementation. There will likely be a few revisions and, at some point, a vector-length agnostic implementation should be added.

Is this case distinction in the C file acceptable or do you prefer specialized targets?

angsch · 2023-01-09T19:08:46Z

config/rv32iv/make_defs.mk

+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -ftree-vectorize -march=rv32iav


Atomic instructions (the `a` flag in -march) are no longer required to be available in hardware, since it is possible to link against a library that emulates the required instructions in software.

mabi has been revised such that rv[32,64]i use the minimal mabi. rv[32,64]iv can pass floating-point values in registers, which corresponds to the default build of the toolchain and is compatible with other libraries.

march is now set twice for COPTFLAGS. The rationale is to enable user-defined hardware extensions (e.g., atomic, compressed instructions) that shall only apply to one type of build.

config/rv64gv/make_defs.mk

+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -ftree-vectorize -march=rv64gv


config/rv32i/make_defs.mk

+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -mabi=ilp32


config/rv32i/make_defs.mk

+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0 -march=rv32i
+else
+COPTFLAGS      := -O2 -march=rv32i


config/rv32iv/make_defs.mk

+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -mabi=ilp32d


config/rv32iv/make_defs.mk

+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0 -march=rv32iv


config/rv64gv/make_defs.mk

+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -mabi=lp64d


config/rv64gv/make_defs.mk

+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -ftree-vectorize -march=rv64gv


config/rv64i/make_defs.mk

+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -mabi=lp64


config/rv64i/make_defs.mk

+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0 -march=rv64i


config/rv64iv/make_defs.mk

+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -mabi=lp64d -D_RV64


config/rv64iv/make_defs.mk

+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0 -march=rv64iv


leekillough · 2023-01-17T00:59:03Z

config/rv64iv/make_defs.mk

+# The latest build hits an 'internal compiler error'
+# when compiling the reference gemm kernels. Workout
+# via -march=rv64imv
+COPTFLAGS      := -O2 -ftree-vectorize -march=rv64iv


config_registry

+rv32iv:      rv32iv/rviv
+rv64iv:      rv64iv/rviv
+
+rv64gv:      rv64gv


frame/base/bli_arch.c

+		#endif
+
+		#ifdef BLIS_FAMILY_RV64GV
+		id = BLIS_ARCH_RV64GV;


frame/base/bli_arch.c

+    "rv64i",
+    "rv32iv",
+    "rv64iv",
+    "rv64gv",


frame/base/bli_gks.c

+		bli_gks_register_cntx( BLIS_ARCH_RV64GV,      bli_cntx_init_rv64gv,
+		                                              bli_cntx_init_rv64gv_ref,
+		                                              bli_cntx_init_rv64gv_ind );
+#endif


frame/include/bli_arch_config.h

+CNTX_INIT_PROTS( rv64iv )
+#endif
+#ifdef BLIS_CONFIG_RV64GV
+CNTX_INIT_PROTS( rv64gv )


devinamatthews · 2023-01-17T02:23:29Z

kernels/rv64gv/3/bli_gemm_rv64gv_asm_s16x4.c

+
+
+        for (dim_t i = 0; i < 16 * 4; i++) {
+            ab[i] = ab[i] * (*alpha);


alpha == 0 is handled at a much higher level. The micro-kernels are only responsible for handling beta == 0 and k == 0.

No objection here, just want to point out that while alpha == 0 is handled higher up, k == 0 must still be handled properly in the microkernel. (This may have gone without saying.)

EDIT: Whoops, I missed that Devin made this explicit in his comment. Dang line wrapping.

Long-since handled correctly.

kernels/rv64gv/3/bli_gemm_rv64gv_asm_s16x4.c

+
+        for (dim_t j = 0; j < n; j++) {
+            for (dim_t i = 0; i < m; i++) {
+                c[i * rs_c + j * cs_c] = c[i * rs_c + j * cs_c] * (*beta) + ab[i + j * 16];


kernels/rv64gv/3/bli_sgemm_rv64gv_asm_16x4.S

@@ -0,0 +1,419 @@
+#define REALNAME bli_sgemm_rv64gv_ker_16x4


kernels/rv64gv/3/bli_sgemm_rv64gv_asm_16x4.S

+    flw ALPHA,(a1)
+    flw BETA, (a4)
+
+    // Multiply with alpha


devinamatthews · 2022-11-20T19:37:05Z

kernels/rv64gv/3/bli_gemm_rv64gv_asm_s16x4.c

+    // if VLEN != 128.
+    uint64_t velem = num_fp32_per_vector();
+
+    if (velem == 4) {


This should probably happen in the context initialization.

Good spot. We removed the rv64gv target in favor of fully vector-length-agnostic kernels. The new kernels choose block sizes (and later on possibly also the microkernel size) during context initialization.

devinamatthews · 2023-01-17T02:23:29Z

kernels/rv64gv/3/bli_gemm_rv64gv_asm_s16x4.c

+
+
+        for (dim_t i = 0; i < 16 * 4; i++) {
+            ab[i] = ab[i] * (*alpha);


alpha == 0 is handled at a much higher level. The micro-kernels are only responsible for handling beta == 0 and k == 0.

kernels/rviv/3/bli_dgemm_rviv_asm_4vx4.S

+    fld ALPHA,(a1)
+    fld BETA, (a4)
+
+    // Multiply with alpha


kernels/rviv/3/bli_sgemm_rviv_asm_4vx4.S

+    flw ALPHA,(a1)
+    flw BETA, (a4)
+
+    // Multiply with alpha


devinamatthews · 2023-01-17T02:34:34Z

kernels/rviv/3/bli_zgemm_rviv_asm_4vx4.S

+
+    j MULTIPLYBETA
+
+ALPHAREAL:


@leekillough see my comment above re alpha.

angsch · 2023-01-18T18:17:35Z

Summary of changes:

Remove legacy target rv64gv with VLEN=128
Revise Makefiles: Type (32 bit/64 bit) is propagated via define, add comment on atomic instructions, move abi and march to CMISCFLAGS
Squash commits that do not have a value on their own.

~~Pending:~~
~~- Shortcut k == 0~~
~~- Activate assembly kernels for rv32iv target~~

This reverts commit 935b39f. .L L L$ prefixes are neither necessary nor desirable for assembly source code, because assembly code labels are always of internal linkage (static), and do not cause conflicts with other compilation units (object files), unless .global is used to make them global, and they appear in more than one object file. Assembly kernel labels do not need to be made into "Local Symbols", which are symbols which are totally omitted from debugging and symbol table output *, and hence are unable to be used as breakpoint names, or even seen as descriptive label names during step-by-step debugging, or profiled as being the locations of hotspots. (* Unless "as -L" is used.) "Local Labels" use only digits, which are even harder to read. The main motivation for local labels is during m4 macro expansion, when a snippet of code needs a label whose relative position in the code is always forward or backward from all branches to it, and hence it can be done such as: define(`macro1',` beqz loop_counter, 1f nop 1: ') or define(`macro2',` 1: nop bnez loop_counter, 1b ') macro1 macro2 ; no conflict in label 1 -- it gets reused The assembler automatically turns these numbered "Local Labels" into regularly-named "local symbols" like label_1, label_2, etc., which are then optionally outputted in the object file if -L is specified. The numbers can be reused later with no ambiguity, because the forward and backward branches and the label form a local range. It only makes sense in m4 macros, where the "local label" is "local to the macro". If removal of all non-global symbols from an object file is desired because it is unwanted debugging information that is making the object file too large, then you can use "strip -x" instead, without having to use weird naming conventions which are not standard across object file formats, and which make the source code harder to read and maintain. 
Using periods at the beginning of labels to make them "Local Symbols" confuses some syntax highlighting and auto-indenting editors, because it looks like a pseudo-op, and since a pseudo-op must appear in the second column or later, the editor might automatically indent it if the first character typed on the line is '.'. If you really wanted a label, you will have to manually un-indent it, since labels must start in the first column. The mangling of symbol names for locality conventions is the job of the assembler or object code generator, not the assembly code author. References: https://sourceware.org/binutils/docs/as/Symbol-Names.html https://sourceware.org/binutils/docs-2.18/as/Symbol-Names.html https://stackoverflow.com/questions/51150860/assembly-label-prefixes

… the base configuration (rv32i, rv32iv, rv64i, rv64iv)

If __riscv_arch_test is not defined, fall back on __riscv_mul, __riscv_atomic, and __riscv_compressed to test for M, A and C extensions.

Change bli_static_assert() to use STATIC_ASSERT_FAILED name in bitfield.

angsch · 2023-03-25T16:26:09Z

Used a rebase to retrigger, CI still not triggered. I will try out Field's idea to squash commits now.

fgvanzee · 2023-03-26T22:57:06Z

How about I take a diff of your entire PR and apply it to a separate branch myself? (We know that branches that I create and push trigger CI, so it will at least allow us to rule out that aspect of it.)

angsch · 2023-03-27T12:31:34Z

How about I take a diff of your entire PR and apply it to a separate branch myself? (We know that branches that I create and push trigger CI, so it will at least allow us to rule out that aspect of it.)

@fgvanzee It would be fantastic if you could do this. I made another attempt in #733, where everything is squashed. Even when squashed, Travis is not triggered. In order to rule out that the PR introduced an issue to .travis.yml, I removed the 2 RISC-V targets from that file. The result is the same, only Appveyor is triggered.

fgvanzee · 2023-03-27T16:20:55Z

@angsch Okay, I can try that.

Separately, we can investigate something else I just thought of: I'm wondering if you need to be added to the flame organization as an external collaborator in order for the CI to trigger.

leekillough · 2023-03-29T17:35:46Z

It was found that Clang, and some gnu toolchain packages, do not automatically enable the V extension, and it has to be enabled with -march=...v. I have added forcing of v in -march in angsch#9, which builds on my already-existing RISC-V architecture auto-detection headers.

…r __riscv_e. Add -DFORCE_RISCV_VECTOR when autodetecting architecture and rv32iv or rv64iv config has been selected, to work around Clang. Fix vsetvli instructions to use all operands, which are not optional in Clang.

…Travis

Details: - Updated config/rv[32|64]i/make_defs.mk to match that of rv[32|64]iv/make_defs.mk (i.e., escaped '#'; single-line $(shell) command). - Whitespace updates in .travis.yml.

Details: - PR #738 -- which moved -fPIC flag insertion responsibilities from common.mk to the subconfigs' individual make_defs.mk files -- was merged shortly before the introduction of new RISC-V subconfigs in #693. This commit brings those RISC-V subconfigs up to date with the new -fPIC conventions.

Details: - There are four RISC-V base configurations: 'rv32i', 'rv32iv', 'rv64i', and 'rv64iv', namely the 32-bit and 64-bit implementations with and without the 'V' vector extension. Additional extensions such as 'M' (multiplication), 'A' (atomics), 'F' ('float' hardware support), 'D' ('double' hardware support), and 'C' (compressed-length instructions), are automatically used when available. If they are not available, then software equivalents (e.g., softfloat and -latomic) are used. - './configure auto' can be invoked on a RISC-V build platform, and will automatically detect RISC-V CPU extensions through the RISC-V C API: https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md - The assembly kernels assume the presence of the vector extension RVV 1.0. - It is possible to build 'rv[32,64]iv' for any value of VLEN. However, if VLEN < 128, the targets will fall back to the generic kernels and blocksizes. - The vector microkernels are vector-length agnostic and work with every VLEN >=128, but are expected to work best with smaller vector lengths, i.e., VLEN <= 512. - The assembly kernels cover column major storage (rs_c == 1). - The blocksizes aim at being a good generic choice for out-of-order cores. They are not tuned to a specific RISC-V HPC core. - The vector kernels have been tested using vlen={128,256,512}. - The single- and double-precision assembly code routines for 'sgemm' and 'dgemm', or for 'cgemm' and 'zgemm', are combined in their RISC-V vector assembly source code, and are differentiated only with macros. - The XLEN=32 and XLEN=64 versions of the RISC-V assembly code are identical, except that callee-saved registers are saved and restored differently. There are RISC-V assembly code #include files for handling the saving and restoring of callee-saved registers, and they are future-proof if ever XLEN=128. 
- Multiplications, such as computing array strides and offsets, are performed in C, and later passed to the RISC-V assembly kernels. This is so that the compiler can determine whether the 'M' (multiply) extension is available and use multiplication instructions, or call library helper functions instead. - A new macro called bli_static_assert() has been added to perform static assertions at compile-time, regardless of the C/C++ dialect of the compiler. The original motivation of this was to ensure that calling RISC-V assembly kernels would not silently truncate arguments of type 'dim_t' or 'inc_t' (so-called "narrowing conversions"). - RISC-V CI tests have been added to Travis CI, using the riscv-gnu-toolchain cross-compiler, and qemu simulator. - Thanks to Lee Killough for collaborating on this commit.

Details: - PR flame#738 -- which moved -fPIC flag insertion responsibilities from common.mk to the subconfigs' individual make_defs.mk files -- was merged shortly before the introduction of new RISC-V subconfigs in flame#693. This commit brings those RISC-V subconfigs up to date with the new -fPIC conventions.

Details: - This commit fixes issue #746, in which the _access() function (called from within blastest/f2c/open.c) is undeclared when compiling on Windows with clang 16. - (cherry picked from commit ef9d3e6) Fix bug in detecting Fortran compiler vendor (#745) `FC` was used instead of `found_fc`. - (cherry picked from 6fd9aab) Apply #738 to make_defs.mk of RISC-V subconfigs. (#740) Details: - PR #738 -- which moved -fPIC flag insertion responsibilities from common.mk to the subconfigs' individual make_defs.mk files -- was merged shortly before the introduction of new RISC-V subconfigs in #693. This commit brings those RISC-V subconfigs up to date with the new -fPIC conventions. - (cherry picked from 8215b02) Add RISC-V target (#693) Details: - There are four RISC-V base configurations: 'rv32i', 'rv32iv', 'rv64i', and 'rv64iv', namely the 32-bit and 64-bit implementations with and without the 'V' vector extension. Additional extensions such as 'M' (multiplication), 'A' (atomics), 'F' ('float' hardware support), 'D' ('double' hardware support), and 'C' (compressed-length instructions), are automatically used when available. If they are not available, then software equivalents (e.g., softfloat and -latomic) are used. - './configure auto' can be invoked on a RISC-V build platform, and will automatically detect RISC-V CPU extensions through the RISC-V C API: https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md - The assembly kernels assume the presence of the vector extension RVV 1.0. - It is possible to build 'rv[32,64]iv' for any value of VLEN. However, if VLEN < 128, the targets will fall back to the generic kernels and blocksizes. - The vector microkernels are vector-length agnostic and work with every VLEN >=128, but are expected to work best with smaller vector lengths, i.e., VLEN <= 512. - The assembly kernels cover column major storage (rs_c == 1). - The blocksizes aim at being a good generic choice for out-of-order cores. 
They are not tuned to a specific RISC-V HPC core. - The vector kernels have been tested using vlen={128,256,512}. - The single- and double-precision assembly code routines for 'sgemm' and 'dgemm', or for 'cgemm' and 'zgemm', are combined in their RISC-V vector assembly source code, and are differentiated only with macros. - The XLEN=32 and XLEN=64 versions of the RISC-V assembly code are identical, except that callee-saved registers are saved and restored differently. There are RISC-V assembly code #include files for handling the saving and restoring of callee-saved registers, and they are future-proof if ever XLEN=128. - Multiplications, such as computing array strides and offsets, are performed in C, and later passed to the RISC-V assembly kernels. This is so that the compiler can determine whether the 'M' (multiply) extension is available and use multiplication instructions, or call library helper functions instead. - A new macro called bli_static_assert() has been added to perform static assertions at compile-time, regardless of the C/C++ dialect of the compiler. The original motivation of this was to ensure that calling RISC-V assembly kernels would not silently truncate arguments of type 'dim_t' or 'inc_t' (so-called "narrowing conversions"). - RISC-V CI tests have been added to Travis CI, using the riscv-gnu-toolchain cross-compiler, and qemu simulator. - Thanks to Lee Killough for collaborating on this commit. - (cherry picked from 6b38c5a)

leekillough reviewed Dec 27, 2022

View reviewed changes

config/rv64gv/make_defs.mk Outdated

ifeq ($(DEBUG_TYPE),noopt)

COPTFLAGS := -O0

else

COPTFLAGS := -O2 -ftree-vectorize -march=rv64gv

This comment was marked as outdated.

Sign in to view