Skip to content

Commit

Permalink
Fix build (#67)
Browse files Browse the repository at this point in the history
* Use 'sync' version of ballot for compatibility with CUDA 10.

* Build system improvements; document dependencies.

* Protect bench from tiny parameter.
  • Loading branch information
unzvfu committed May 20, 2019
1 parent cfa869b commit e6a1db3
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 14 deletions.
16 changes: 10 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
CXX ?= g++
GENCODES ?= 50

INCLUDE_DIRS = -I./src
NVCC_FLAGS = -ccbin clang-3.8 -Wno-deprecated-declarations -std=c++11 -Xcompiler -Wall,-Wextra
NVCC_FLAGS = -ccbin $(CXX) -std=c++11 -Xcompiler -Wall,-Wextra
NVCC_OPT_FLAGS = -DNDEBUG
NVCC_TEST_FLAGS = -lineinfo
NVCC_DBG_FLAGS = -g -G
NVCC_LIBS = -lstdc++ -lgtest
GENCODES = 50

% : %.cu
nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $<
NVCC_LIBS = -lstdc++
NVCC_TEST_LIBS = -lgtest

all:
@echo "Please run 'make check' or 'make bench'."

tests/test-suite: tests/test-suite.cu
nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) $(NVCC_TEST_LIBS) -o $@ $<

check: tests/test-suite
@./tests/test-suite

bench/bench: bench/bench.cu
nvcc $(NVCC_OPT_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $<

bench: bench/bench

.PHONY: clean
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ void host_function() {
}
```
## Building
The build system for cuda-fixnum is currently, shall we say, _primitive_. Basically you can run `make bench` to build the benchmarking program, or `make check` to build and run the test suite. The test suite requires the [Google Test framework](https://github.com/google/googletest) to be installed. The Makefile will read in the variables `CXX` and `GENCODES` from the environment as a convenient way to specify the C++ compiler to use and the CUDA compute capability codes that you want to compile for. The defaults are `CXX = g++` and `GENCODES = 50`.
## Benchmarks
Here is the output from a recent run of the benchmark with a GTX Titan X (Maxwell, 1GHz clock, 3072 cores):
Expand Down
6 changes: 6 additions & 0 deletions bench/bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ void bench(int nelts) {
typedef warp_fixnum<fn_bytes, word_fixnum> fixnum;
typedef fixnum_array<fixnum> fixnum_array;

if (nelts == 0) {
puts(" -*- nelts == 0; skipping... -*-");
return;
}

uint8_t *input = new uint8_t[fn_bytes * nelts];
for (int i = 0; i < fn_bytes * nelts; ++i)
input[i] = (i * 17 + 11) % 256;
Expand Down Expand Up @@ -132,6 +137,7 @@ int main(int argc, char *argv[]) {
long m = 1;
if (argc > 1)
m = atol(argv[1]);
m = std::max(m, 1000L);

bench_func<mul_lo>("mul_lo", m);
puts("");
Expand Down
12 changes: 4 additions & 8 deletions src/fixnum/slot_layout.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,6 @@ static constexpr int WARPSIZE = 32;
* The term "warp" should be reserved for subwarps of width 32
* (=warpSize).
*
* TODO: All of the warp vote and warp shuffle functions will be
* deprecated in CUDA 9.0 in favour of versions that take a mask
* selecting relevant lanes in the warp on which to act (see CUDA
* Programming Guide, B.15). Create an interface that encapsulates
* both.
*
* TODO: Work out if using __forceinline__ in these definitions
* actually achieves anything.
*/
Expand Down Expand Up @@ -122,7 +116,7 @@ struct slot_layout
// Warp vote on the predicate `tst`, returned as a 32-bit word.
// The mask 0xFFFFFFFF handed to __ballot_sync has all 32 bits set, i.e. it
// names every lane of the warp as a participant in the vote.
// NOTE(review): this listing is a rendered diff, so the pre-change
// `__ballot` line and its `__ballot_sync` replacement both appear below;
// presumably only the `_sync` line exists in the committed file -- confirm.
static __device__ __forceinline__
uint32_t
ballot(int tst) {
return __ballot(tst);
return __ballot_sync(0xFFFFFFFF, tst);
}
#endif

Expand All @@ -133,7 +127,9 @@ struct slot_layout
// Votes on the predicate `tst`, then shifts the resulting ballot word right
// by offset() and keeps only the bits selected by `mask`.
// NOTE(review): offset() and `mask` are defined outside this excerpt;
// presumably they locate the caller's slot within the warp -- confirm.
// NOTE(review): this listing is a rendered diff, so the pre-change
// `__ballot` declaration of `b` and its `__ballot_sync` replacement both
// appear below; presumably only the `_sync` line exists in the committed
// file -- confirm.
__device__ __forceinline__
static uint32_t
ballot(int tst) {
uint32_t b = __ballot(tst);
// TODO: Use the mask parameter to ballot_sync to achieve the
// masked ballot.
uint32_t b = __ballot_sync(0xFFFFFFFF, tst);
b >>= offset();
return b & mask;
}
Expand Down

0 comments on commit e6a1db3

Please sign in to comment.