diff --git a/Makefile b/Makefile
index 83e2f34..6136203 100644
--- a/Makefile
+++ b/Makefile
@@ -1,22 +1,26 @@
+CXX ?= g++
+GENCODES ?= 50
+
 INCLUDE_DIRS = -I./src
-NVCC_FLAGS = -ccbin clang-3.8 -Wno-deprecated-declarations -std=c++11 -Xcompiler -Wall,-Wextra
+NVCC_FLAGS = -ccbin $(CXX) -std=c++11 -Xcompiler -Wall,-Wextra
 NVCC_OPT_FLAGS = -DNDEBUG
 NVCC_TEST_FLAGS = -lineinfo
 NVCC_DBG_FLAGS = -g -G
-NVCC_LIBS = -lstdc++ -lgtest
-GENCODES = 50
-
-% : %.cu
-	nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $<
+NVCC_LIBS = -lstdc++
+NVCC_TEST_LIBS = -lgtest
 
 all:
 	@echo "Please run 'make check' or 'make bench'."
 
 tests/test-suite: tests/test-suite.cu
+	nvcc $(NVCC_TEST_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) $(NVCC_TEST_LIBS) -o $@ $<
+
 check: tests/test-suite
 	@./tests/test-suite
 
 bench/bench: bench/bench.cu
+	nvcc $(NVCC_OPT_FLAGS) $(NVCC_FLAGS) $(GENCODES:%=--gpu-architecture=compute_%) $(GENCODES:%=--gpu-code=sm_%) $(INCLUDE_DIRS) $(NVCC_LIBS) -o $@ $<
+
 bench: bench/bench
 
 .PHONY: clean
diff --git a/README.md b/README.md
index 40ee1e6..b304b17 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,16 @@ void host_function() {
 }
 ```
 
+## Building
+
+The build system for cuda-fixnum is currently, shall we say, _primitive_. Basically, you can run `make bench` to build the benchmarking program, or `make check` to build and run the test suite. The test suite requires the [Google Test framework](https://github.com/google/googletest) to be installed. The Makefile reads the variables `CXX` and `GENCODES` from the environment, as a convenient way to specify the C++ host compiler and the CUDA compute capability code(s) to compile for. The defaults are `CXX = g++` and `GENCODES = 50`.
+
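+For example, assuming clang is installed as `clang++` on your system and you are targeting a compute capability 6.1 (Pascal) device, something like the following should build and run the test suite:
+
+```
+$ make check CXX=clang++ GENCODES=61
+```
+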
 ## Benchmarks
 
 Here is the output from a recent run of the benchmark with a GTX Titan X (Maxwell, 1GHz clock, 3072 cores):
diff --git a/bench/bench.cu b/bench/bench.cu
index b1d5b5a..25a5307 100644
--- a/bench/bench.cu
+++ b/bench/bench.cu
@@ -68,6 +68,11 @@ void bench(int nelts) {
     typedef warp_fixnum<fn_bytes, word_fixnum> fixnum;
     typedef fixnum_array<fixnum> fixnum_array;
 
+    if (nelts == 0) {
+        puts(" -*- nelts == 0; skipping... -*-");
+        return;
+    }
+
     uint8_t *input = new uint8_t[fn_bytes * nelts];
     for (int i = 0; i < fn_bytes * nelts; ++i)
         input[i] = (i * 17 + 11) % 256;
@@ -132,6 +137,7 @@ int main(int argc, char *argv[]) {
     long m = 1;
     if (argc > 1)
         m = atol(argv[1]);
+    m = std::max(m, 1000L);
 
     bench_func("mul_lo", m);
     puts("");
diff --git a/src/fixnum/slot_layout.cu b/src/fixnum/slot_layout.cu
index 1717f81..1652a74 100644
--- a/src/fixnum/slot_layout.cu
+++ b/src/fixnum/slot_layout.cu
@@ -26,12 +26,6 @@ static constexpr int WARPSIZE = 32;
  * The term "warp" should be reserved for subwarps of width 32
  * (=warpSize).
  *
- * TODO: All of the warp vote and warp shuffle functions will be
- * deprecated in CUDA 9.0 in favour of versions that take a mask
- * selecting relevant lanes in the warp on which to act (see CUDA
- * Programming Guide, B.15). Create an interface that encapsulates
- * both.
- *
  * TODO: Work out if using __forceinline__ in these definitions
  * actually achieves anything.
  */
@@ -122,7 +116,7 @@ struct slot_layout
 
     static __device__ __forceinline__
     uint32_t ballot(int tst) {
-        return __ballot(tst);
+        return __ballot_sync(0xFFFFFFFF, tst);
     }
 #endif
 
@@ -133,7 +127,9 @@ struct slot_layout
 
     __device__ __forceinline__
     static uint32_t ballot(int tst) {
-        uint32_t b = __ballot(tst);
+        // TODO: Use the mask parameter to __ballot_sync to achieve the
+        // masked ballot.
+        uint32_t b = __ballot_sync(0xFFFFFFFF, tst);
         b >>= offset();
         return b & mask;
     }