diff --git a/Makefile b/Makefile
index 83e2f34..6136203 100644
--- a/Makefile
+++ b/Makefile
@@ -1,22 +1,26 @@
+CXX ?= g++
+GENCODES ?= 50
+
 INCLUDE_DIRS = -I./src
-NVCC_FLAGS = -ccbin clang-3.8 -Wno-deprecated-declarations -std=c++11 -Xcompiler -Wall,-Wextra
+NVCC_FLAGS = -ccbin $(CXX) -std=c++11 -Xcompiler -Wall,-Wextra
 NVCC_OPT_FLAGS = -DNDEBUG
 NVCC_TEST_FLAGS = -lineinfo
 NVCC_DBG_FLAGS = -g -G
-NVCC_LIBS = -lstdc++ -lgtest
-GENCODES = 50
-
-% : %.cu
-	nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $<
+NVCC_LIBS = -lstdc++
+NVCC_TEST_LIBS = -lgtest
 
 all:
 	@echo "Please run 'make check' or 'make bench'."
 
 tests/test-suite: tests/test-suite.cu
+	nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) $(NVCC_TEST_LIBS) -o $@ $<
+
 check: tests/test-suite
 	@./tests/test-suite
 
 bench/bench: bench/bench.cu
+	nvcc $(NVCC_OPT_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $<
+
 bench: bench/bench
 
 .PHONY: clean
diff --git a/README.md b/README.md
index 40ee1e6..b304b17 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,16 @@ void host_function() {
 }
 ```
 
+## Building
+
+The build system for cuda-fixnum is currently, shall we say, _primitive_. Basically, you can run `make bench` to build the benchmarking program, or `make check` to build and run the test suite. The test suite requires the [Google Test framework](https://github.com/google/googletest) to be installed. The Makefile reads the variables `CXX` and `GENCODES` from the environment, as a convenient way to specify the C++ host compiler and the CUDA compute capability code(s) to compile for. The defaults are `CXX = g++` and `GENCODES = 50`.
+
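+For example, assuming clang is installed as `clang++` on your system and you are targeting a compute capability 6.1 (Pascal) device, something like the following should build and run the test suite:
+
+```
+$ make check CXX=clang++ GENCODES=61
+```
+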
 ## Benchmarks
 
 Here is the output from a recent run of the benchmark with a GTX Titan X (Maxwell, 1GHz clock, 3072 cores):
diff --git a/bench/bench.cu b/bench/bench.cu
index b1d5b5a..25a5307 100644
--- a/bench/bench.cu
+++ b/bench/bench.cu
@@ -68,6 +68,11 @@ void bench(int nelts) {
     typedef warp_fixnum<fn_bytes, word_fixnum> fixnum;
     typedef fixnum_array<fixnum> fixnum_array;
 
+    if (nelts == 0) {
+        puts(" -*- nelts == 0; skipping... -*-");
+        return;
+    }
+
     uint8_t *input = new uint8_t[fn_bytes * nelts];
     for (int i = 0; i < fn_bytes * nelts; ++i)
         input[i] = (i * 17 + 11) % 256;
@@ -132,6 +137,7 @@ int main(int argc, char *argv[]) {
     long m = 1;
     if (argc > 1)
         m = atol(argv[1]);
+    m = std::max(m, 1000L);
 
     bench_func("mul_lo", m);
     puts("");
diff --git a/src/fixnum/slot_layout.cu b/src/fixnum/slot_layout.cu
index 1717f81..1652a74 100644
--- a/src/fixnum/slot_layout.cu
+++ b/src/fixnum/slot_layout.cu
@@ -26,12 +26,6 @@ static constexpr int WARPSIZE = 32;
  * The term "warp" should be reserved for subwarps of width 32
  * (=warpSize).
  *
- * TODO: All of the warp vote and warp shuffle functions will be
- * deprecated in CUDA 9.0 in favour of versions that take a mask
- * selecting relevant lanes in the warp on which to act (see CUDA
- * Programming Guide, B.15). Create an interface that encapsulates
- * both.
- *
  * TODO: Work out if using __forceinline__ in these definitions
  * actually achieves anything.
  */
@@ -122,7 +116,7 @@ struct slot_layout
 
     static __device__ __forceinline__
     uint32_t ballot(int tst) {
-        return __ballot(tst);
+        return __ballot_sync(0xFFFFFFFF, tst);
     }
 #endif
 
@@ -133,7 +127,9 @@ struct slot_layout
 
     __device__ __forceinline__
     static uint32_t ballot(int tst) {
-        uint32_t b = __ballot(tst);
+        // TODO: Use the mask parameter to __ballot_sync to achieve the
+        // masked ballot.
+        uint32_t b = __ballot_sync(0xFFFFFFFF, tst);
         b >>= offset();
         return b & mask;
     }