Add post kernel check + add more known gpu archs + fix warning (#196)

* Add post kernel check

* Revise

* Revise message

* Add more known gpu archs

* Add 52

* Fix warning
1 parent 9252a54 · commit ccab3b9a41ac2e1dbe5376011db3044b39a033c3 · @sxjscience committed on GitHub on Jan 7, 2017
Showing with 25 additions and 5 deletions.
  1. +1 −1 cmake/Cuda.cmake
  2. +1 −1 make/mshadow.mk
  3. +22 −2 mshadow/cuda/tensor_gpu-inl.cuh
  4. +1 −1 mshadow/extension/transpose.h
cmake/Cuda.cmake
@@ -8,7 +8,7 @@ check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
# Known NVIDIA GPU achitectures mshadow can be compiled for.
# This list will be used for CUDA_ARCH_NAME = All option
-set(mshadow_known_gpu_archs "20 21(20) 30 35 50 60 61")
+set(mshadow_known_gpu_archs "20 21(20) 30 35 50 52 60 61")
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
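
The added 52 entry corresponds to compute capability 5.2 (Maxwell-class GPUs such as the GTX 980 and the Maxwell Titan X), filling the gap between 50 and 60 in the list used when CUDA_ARCH_NAME is set to All. As a small illustrative sketch (not part of this commit), device code can tell which listed architecture it is being compiled for through the __CUDA_ARCH__ macro, which expands to 520 for an sm_52 target:

__device__ float ArchSketch(float x) {
  // Illustrative only: __CUDA_ARCH__ is 520 when this translation unit is
  // being compiled for the newly listed compute capability 5.2 target.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 520
  return 2.0f * x;   // path taken when built for sm_52 or newer
#else
  return x + x;      // path taken for the older architectures in the list
#endif
}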
make/mshadow.mk
@@ -8,7 +8,7 @@
# Add MSHADOW_NVCCFLAGS to the nvcc compile flags
#----------------------------------------------------------------------------------------
-MSHADOW_CFLAGS = -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs
+MSHADOW_CFLAGS = -funroll-loops -Wno-unused-variable -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs
MSHADOW_LDFLAGS = -lm
MSHADOW_NVCCFLAGS =
MKLROOT =
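
The new -Wno-unused-variable entry silences the GCC/Clang unused-variable diagnostic for builds that compile mshadow with -Wall. A tiny hypothetical illustration (not mshadow code) of the kind of diagnostic being suppressed:

// Hypothetical example, not mshadow code: under -Wall this emits
// "unused variable 'scratch' [-Wunused-variable]", which the new
// -Wno-unused-variable flag suppresses.
void Example() {
  int scratch = 0;  // declared but never read
}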
mshadow/cuda/tensor_gpu-inl.cuh
@@ -13,7 +13,12 @@
#endif
#include "../tensor.h"
#include "./reduce.cuh"
-
+#define MSHADOW_CUDA_POST_KERNEL_CHECK(x) \
+ /* Code block avoids redefinition of cudaError_t err */ \
+ do { \
+ cudaError err = cudaPeekAtLastError(); \
+ CHECK_EQ(err, cudaSuccess) << "Name: " << #x << " ErrStr:" << cudaGetErrorString(err); \
+ } while (0)
namespace mshadow {
namespace cuda {
/* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */
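
Every kernel launch below is now followed by a MSHADOW_CUDA_POST_KERNEL_CHECK(...) call. cudaPeekAtLastError() reports the last launch error without resetting the error state (unlike cudaGetLastError()), the do { ... } while (0) block keeps the local err from colliding with one in the enclosing scope, and #x stringifies the kernel name into the failure message. As a rough illustration, a call such as MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxKernel) expands to approximately:

// Approximate expansion of MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxKernel)
// at a call site (comment stripped, whitespace adjusted): the launch error
// is peeked, and a failed CHECK_EQ reports the stringified kernel name
// together with the CUDA error string.
do {
  cudaError err = cudaPeekAtLastError();
  CHECK_EQ(err, cudaSuccess) << "Name: " << "SoftmaxKernel"
                             << " ErrStr:" << cudaGetErrorString(err);
} while (0);

Note that a check placed immediately after the launch catches configuration and pre-launch errors; errors raised asynchronously during kernel execution surface only on a later synchronizing call.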
@@ -98,13 +103,15 @@ inline void MapPlan(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, xstride, dshape, plan);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(MapPlanKernel);
} else {
int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
dim3 dimGrid(kBaseGridNum, 1 , 1);
MapPlanLargeKernel<Saver, kBaseThreadBits, kBaseGridNum,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, xstride, dshape, plan, repeat);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(MapPlanLargeKernel);
}
}
@@ -151,6 +158,7 @@ inline void MapReduceKeepLowest(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, eshape);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(MapRedKeepLowestKernel);
}
template<typename Saver, typename Reducer, int block_dim_bits,
@@ -192,6 +200,7 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, pshape);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(MapReduceKeepDim1Kernel);
}
template<int x_bits, typename DType>
@@ -213,6 +222,7 @@ inline void GetBatchedView(DType **dst, DType *src, int num, int stride,
CheckLaunchParam(dimGrid, dimBlock, "GetBatchedView");
GetBatchedViewKernel<kBaseThreadBits, DType>
<<<dimGrid, dimBlock, 0, stream_>>> (dst, src, num, stride);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(GetBatchedViewKernel);
}
template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
@@ -320,6 +330,7 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
(expr::MakePlan(dst),
expr::MakePlan(src),
dst.size(1));
+ MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxKernel);
}
template<typename DType>
@@ -338,6 +349,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(src),
expr::MakePlan(label),
dst.size(1));
+ MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxGradKernel);
}
template<typename DType>
@@ -358,6 +370,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(label),
dst.size(1),
ignore_label);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxGradKernel);
}
template<int n_bits, typename DType>
@@ -445,6 +458,7 @@ inline void Softmax(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "Softmax");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DKernel);
}
template<typename DType>
@@ -459,6 +473,7 @@ inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DGradKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DGradKernel);
}
template<typename DType>
@@ -474,6 +489,7 @@ inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DGradKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label, ignore_label);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DGradKernel);
}
template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
@@ -501,7 +517,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst,
// If the preceeding input has the same as this input, then the warp
// exits immediately. The warp also processes subsequent inputs with the
// same value.
- //
+ //
// Input Warp
// 1 <warp 1>
// 1 <warp 1> (<warp 2> exits without doing any work)
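
The comment above, from AddTakeGradLargeBatchKernel, describes how duplicate indices in the sorted input are handled: a warp assigned to a position whose predecessor holds the same index exits immediately, while the warp owning the first occurrence also accumulates the subsequent equal entries. A deliberately simplified, hypothetical sketch of that idea (one thread per input position rather than one warp, and a scalar gradient instead of a row; not the actual kernel):

// Hypothetical, much-simplified sketch of the duplicate-handling scheme the
// comment above describes; NOT the actual AddTakeGradLargeBatchKernel.
// Inputs are sorted by index; only the thread at the first occurrence of an
// index value does any work, and it also accumulates the equal entries that
// follow it.
__global__ void AddTakeGradSortedSketch(float* dst, const int* sorted_idx,
                                        const float* grad, int n) {
  int pos = blockIdx.x * blockDim.x + threadIdx.x;
  if (pos >= n) return;
  int idx = sorted_idx[pos];
  // A position whose predecessor holds the same index exits immediately.
  if (pos > 0 && sorted_idx[pos - 1] == idx) return;
  float sum = 0.0f;
  for (int i = pos; i < n && sorted_idx[i] == idx; ++i) {
    sum += grad[i];
  }
  dst[idx] += sum;
}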
@@ -571,6 +587,7 @@ inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
expr::MakePlan(src),
src.size(0),
src.size(1));
+ MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradKernel);
}
template<typename IndexType, typename DType>
@@ -604,6 +621,7 @@ inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
src.dptr_,
static_cast<int>(src.size(0)),
static_cast<int>(src.size(1)));
+ MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel);
}
template<int warp_bits, typename DType, typename DstPlan, typename IndexPlan, typename SrcPlan>
@@ -643,6 +661,7 @@ inline void IndexFill(Tensor<gpu, 2, DType> dst,
expr::MakePlan(src),
src.size(0),
src.size(1));
+ MSHADOW_CUDA_POST_KERNEL_CHECK(IndexFillKernel);
}
template<typename KDType, typename VDType>
@@ -663,6 +682,7 @@ inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values
thrust::cuda::par.on(stream),
key_iter, key_iter + keys.size(0), value_iter, thrust::greater<KDType>());
}
+ MSHADOW_CUDA_POST_KERNEL_CHECK(SortByKey);
#else
LOG(FATAL) << "SortByKey is only supported for CUDA version >=7.0!";
#endif
mshadow/extension/transpose.h
@@ -98,9 +98,9 @@ struct TransposeIndicesExp:
public Exp<TransposeIndicesExp<SrcExp, DType, dimsrc, etype>, DType, etype> {
/*! \brief source expression */
const SrcExp &src_indices_; // Expression of the source indices
+ Shape<dimsrc> src_shape_; // Holds the corresponding stride of the source axes in dst
const Shape<dimsrc> axes_; // The transpose axes
Shape<dimsrc> src_in_dst_stride_; // Holds the corresponding stride of the source axes in dst
- Shape<dimsrc> src_shape_; // Holds the corresponding stride of the source axes in dst
/*! \brief constructor */
explicit TransposeIndicesExp(const SrcExp &src_indices,
Shape<dimsrc> src_shape,
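
The member shuffle in TransposeIndicesExp moves src_shape_ above axes_, presumably so the declaration order matches the constructor's member-initializer order; that would account for the "Fix warning" item in the commit message, since GCC's -Wreorder fires when the two orders disagree. A minimal hypothetical illustration of that class of warning (not mshadow code):

// Hypothetical illustration (not mshadow code) of the -Wreorder warning this
// kind of member reorder typically silences: members are always initialized
// in declaration order, so GCC -Wall warns when the initializer list is
// written in a different order.
struct WarnsUnderWall {
  int b_;
  int a_;
  WarnsUnderWall(int a, int b) : a_(a), b_(b) {}  // -Wreorder: list order disagrees with declaration order
};

struct Fixed {
  int a_;   // declaration order now matches the initializer list
  int b_;
  Fixed(int a, int b) : a_(a), b_(b) {}           // no warning
};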
