diff --git a/Makefile b/Makefile index 0dca200..0489bd4 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CFLAGS := $(CFLAGS) -O2 -ffast-math -fstrict-aliasing +CFLAGS := $(CFLAGS) -O2 -ffast-math -fstrict-aliasing -fPIE CXXFLAGS := $(CXXFLAGS) -std=c++14 -Wall LDFLAGS := $(LDFLAGS) diff --git a/array.h b/array.h index 793f040..780633c 100644 --- a/array.h +++ b/array.h @@ -1357,15 +1357,18 @@ NDARRAY_UNIQUE NDARRAY_HOST_DEVICE void for_each_value_in_order_impl( } } -template +template ::value), int> = 0> NDARRAY_INLINE NDARRAY_HOST_DEVICE void for_each_value_in_order( const ExtentType& extent, Fn&& fn, Ptrs... ptrs) { using is_inner_loop = std::conditional_t; for_each_value_in_order_impl(is_inner_loop(), extent, fn, ptrs...); } -// Scalar buffers are a special case. -template +// Scalar buffers are a special case. The enable_if here (and above) are a workaround for a bug in +// old versions of GCC that causes this overload to be ambiguous. +template = 0> NDARRAY_INLINE NDARRAY_HOST_DEVICE void for_each_value_in_order( const std::tuple<>& extent, Fn&& fn, Ptrs... ptrs) { fn(*std::get<0>(ptrs)...); @@ -1387,20 +1390,15 @@ NDARRAY_HOST_DEVICE std::tuple<> make_compact_dims() { template NDARRAY_HOST_DEVICE auto make_compact_dims( const dim& dim0, const Dims&... dims) { - // We already know the stride of this dimension. - return std::tuple_cat(std::make_tuple(dim(dim0.min(), dim0.extent())), - make_compact_dims(dims...)); -} -template -NDARRAY_HOST_DEVICE auto make_compact_dims(const dim& dim0, const Dims&... dims) { // If we know the extent of this dimension, we can also provide // a constant stride for the next dimension. constexpr index_t NextStride = static_mul(CurrentStride, Extent); - // Give this dimension the current stride, and don't give it a - // runtime stride. If CurrentStride is static, that will be the - // stride. If not, it will be dynamic, and resolved later. - return std::tuple_cat(std::make_tuple(dim(dim0.min(), dim0.extent())), + // Give this dimension the current stride if we don't have one already, + // and don't give it a runtime stride. If CurrentStride is static, that + // will be the stride. If not, it will be dynamic, and resolved later. + constexpr index_t NewStride = is_static(Stride) ? Stride : CurrentStride; + return std::tuple_cat(std::make_tuple(dim(dim0.min(), dim0.extent())), make_compact_dims(dims...)); } diff --git a/test/readme.cpp b/test/readme.cpp index 54cec5c..51f7111 100644 --- a/test/readme.cpp +++ b/test/readme.cpp @@ -156,8 +156,8 @@ TEST(readme_ein_reduce) { enum { i = 0, j = 1, k = 2, l = 3 }; // Dot product dot1 = dot2 = x.y: - vector x({10}); - vector y({10}); + vector x({10}, 0.0f); + vector y({10}, 0.0f); float dot1 = make_ein_sum(ein(x) * ein(y)); float dot2 = 0.0f; ein_reduce(ein(dot2) += ein(x) * ein(y)); @@ -184,7 +184,7 @@ TEST(readme_ein_reduce) { // Maximum of each x-y plane of a 3D volume: dense_array T({8, 12, 20}); - dense_array max_xy({20}); + dense_array max_xy({20}, 0.0f); auto r = ein(max_xy); ein_reduce(r = max(r, ein(T)));