This repository has been archived by the owner on Aug 11, 2020. It is now read-only.

Commit

Add post kernel check + add more known gpu archs + fix warning (#196)
* Add post kernel check

* Revise

* Revise message

* Add more known gpu archs

* Add 52

* Fix warning
sxjscience committed Jan 7, 2017
1 parent 9252a54 commit ccab3b9
Showing 4 changed files with 25 additions and 5 deletions.
2 changes: 1 addition & 1 deletion cmake/Cuda.cmake
@@ -8,7 +8,7 @@ check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)

# Known NVIDIA GPU architectures mshadow can be compiled for.
# This list will be used for CUDA_ARCH_NAME = All option
set(mshadow_known_gpu_archs "20 21(20) 30 35 50 60 61")
set(mshadow_known_gpu_archs "20 21(20) 30 35 50 52 60 61")

################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
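
The list above feeds the CUDA_ARCH_NAME = All option; the newly added "52" is compute capability 5.2 (second-generation Maxwell parts such as the GTX 970/980 and GTX Titan X), which was missing even though 50, 60 and 61 were already listed. When autodetection is enabled instead, Caffe-style Cuda.cmake scripts typically discover the installed GPUs by compiling and running a small CUDA program along these lines (a sketch of the idea, not the exact snippet embedded in mshadow's cmake):

#include <cstdio>
#include <cuda_runtime.h>

// Prints the compute capability of every visible GPU, e.g. "5.2 6.1", which
// the build system can then map onto -gencode arch=compute_XX,code=sm_XX flags.
int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) return 1;
  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
      std::printf("%d.%d ", prop.major, prop.minor);
    }
  }
  std::printf("\n");
  return 0;
}
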
2 changes: 1 addition & 1 deletion make/mshadow.mk
@@ -8,7 +8,7 @@
# Add MSHADOW_NVCCFLAGS to the nvcc compile flags
#----------------------------------------------------------------------------------------

-MSHADOW_CFLAGS = -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs
+MSHADOW_CFLAGS = -funroll-loops -Wno-unused-variable -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs
MSHADOW_LDFLAGS = -lm
MSHADOW_NVCCFLAGS =
MKLROOT =
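
For reference, the added -Wno-unused-variable silences GCC/Clang's unused-variable diagnostic. A hypothetical two-liner (not code from this repository) showing the kind of warning it suppresses:

// With -Wall, g++ reports: unused variable 'unused' [-Wunused-variable].
// The new flag keeps builds quiet when such variables are intentional.
void Example() {
  int unused = 0;
}
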
24 changes: 22 additions & 2 deletions mshadow/cuda/tensor_gpu-inl.cuh
@@ -13,7 +13,12 @@
#endif
#include "../tensor.h"
#include "./reduce.cuh"

#define MSHADOW_CUDA_POST_KERNEL_CHECK(x) \
/* Code block avoids redefinition of cudaError_t err */ \
do { \
cudaError err = cudaPeekAtLastError(); \
CHECK_EQ(err, cudaSuccess) << "Name: " << #x << " ErrStr:" << cudaGetErrorString(err); \
} while (0)
namespace mshadow {
namespace cuda {
/* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */
@@ -98,13 +103,15 @@ inline void MapPlan(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, xstride, dshape, plan);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapPlanKernel);
} else {
int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
dim3 dimGrid(kBaseGridNum, 1 , 1);
MapPlanLargeKernel<Saver, kBaseThreadBits, kBaseGridNum,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, xstride, dshape, plan, repeat);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapPlanLargeKernel);
}
}

@@ -151,6 +158,7 @@ inline void MapReduceKeepLowest(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, eshape);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapRedKeepLowestKernel);
}

template<typename Saver, typename Reducer, int block_dim_bits,
@@ -192,6 +200,7 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, pshape);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapReduceKeepDim1Kernel);
}

template<int x_bits, typename DType>
@@ -213,6 +222,7 @@ inline void GetBatchedView(DType **dst, DType *src, int num, int stride,
CheckLaunchParam(dimGrid, dimBlock, "GetBatchedView");
GetBatchedViewKernel<kBaseThreadBits, DType>
<<<dimGrid, dimBlock, 0, stream_>>> (dst, src, num, stride);
MSHADOW_CUDA_POST_KERNEL_CHECK(GetBatchedViewKernel);
}

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
@@ -320,6 +330,7 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
(expr::MakePlan(dst),
expr::MakePlan(src),
dst.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxKernel);
}

template<typename DType>
@@ -338,6 +349,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(src),
expr::MakePlan(label),
dst.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxGradKernel);
}

template<typename DType>
@@ -358,6 +370,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(label),
dst.size(1),
ignore_label);
MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxGradKernel);
}

template<int n_bits, typename DType>
@@ -445,6 +458,7 @@ inline void Softmax(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "Softmax");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src);
MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DKernel);
}

template<typename DType>
@@ -459,6 +473,7 @@ inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DGradKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label);
MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DGradKernel);
}

template<typename DType>
@@ -474,6 +489,7 @@ inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DGradKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label, ignore_label);
MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DGradKernel);
}

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
@@ -501,7 +517,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst,
// If the preceding input has the same value as this input, then the warp
// exits immediately. The warp also processes subsequent inputs with the
// same value.
//
//
// Input Warp
// 1 <warp 1>
// 1 <warp 1> (<warp 2> exits without doing any work)
@@ -571,6 +587,7 @@ inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
expr::MakePlan(src),
src.size(0),
src.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradKernel);
}

template<typename IndexType, typename DType>
@@ -604,6 +621,7 @@ inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
src.dptr_,
static_cast<int>(src.size(0)),
static_cast<int>(src.size(1)));
MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel);
}

template<int warp_bits, typename DType, typename DstPlan, typename IndexPlan, typename SrcPlan>
@@ -643,6 +661,7 @@ inline void IndexFill(Tensor<gpu, 2, DType> dst,
expr::MakePlan(src),
src.size(0),
src.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(IndexFillKernel);
}

template<typename KDType, typename VDType>
@@ -663,6 +682,7 @@ inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values
thrust::cuda::par.on(stream),
key_iter, key_iter + keys.size(0), value_iter, thrust::greater<KDType>());
}
MSHADOW_CUDA_POST_KERNEL_CHECK(SortByKey);
#else
LOG(FATAL) << "SortByKey is only supported for CUDA version >=7.0!";
#endif
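
The macro defined at the top of this file and invoked after each launch above exists because CUDA kernel launches are asynchronous: configuration and launch failures surface only through cudaPeekAtLastError()/cudaGetLastError(), not as a return value at the <<<>>> call site. Below is a minimal self-contained sketch of the same pattern, assuming a plain fprintf/abort error path where mshadow uses the glog-style CHECK_EQ; the do { } while (0) wrapper keeps err local to each expansion and lets the macro be used like an ordinary statement followed by a semicolon:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of a post-kernel-launch check (hypothetical names, not the mshadow macro).
#define CUDA_POST_KERNEL_CHECK(kernel)                                   \
  do {                                                                   \
    cudaError_t err = cudaPeekAtLastError();                             \
    if (err != cudaSuccess) {                                            \
      std::fprintf(stderr, "Kernel %s failed: %s\n", #kernel,            \
                   cudaGetErrorString(err));                             \
      std::abort();                                                      \
    }                                                                    \
  } while (0)

__global__ void FillKernel(float *data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  const int n = 1024;
  float *data = nullptr;
  cudaMalloc(&data, n * sizeof(float));
  // An invalid configuration or a binary built without code for this GPU is
  // reported here, immediately after the launch, instead of being silently
  // dropped or blamed on a later CUDA call.
  FillKernel<<<(n + 255) / 256, 256>>>(data, n, 1.0f);
  CUDA_POST_KERNEL_CHECK(FillKernel);
  cudaFree(data);
  return 0;
}
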
2 changes: 1 addition & 1 deletion mshadow/extension/transpose.h
@@ -98,9 +98,9 @@ struct TransposeIndicesExp:
public Exp<TransposeIndicesExp<SrcExp, DType, dimsrc, etype>, DType, etype> {
/*! \brief source expression */
const SrcExp &src_indices_; // Expression of the source indices
-Shape<dimsrc> src_shape_; // Holds the corresponding stride of the source axes in dst
const Shape<dimsrc> axes_; // The transpose axes
Shape<dimsrc> src_in_dst_stride_; // Holds the corresponding stride of the source axes in dst
+Shape<dimsrc> src_shape_; // Holds the corresponding stride of the source axes in dst
/*! \brief constructor */
explicit TransposeIndicesExp(const SrcExp &src_indices,
Shape<dimsrc> src_shape,
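
The only change in this file moves the src_shape_ declaration below src_in_dst_stride_. A common reason for reordering members like this (and a plausible reading of the "Fix warning" commit message, though the diff alone does not show which diagnostic fired) is that class members are always initialized in declaration order, so a constructor initializer list written in a different order triggers GCC/Clang's -Wreorder warning. A hypothetical illustration, unrelated to mshadow's actual types:

// With -Wall, g++ warns that 'stride_' will be initialized after 'shape_'
// [-Wreorder]: initialization follows the declaration order, not the order in
// the initializer list. Declaring stride_ first (or swapping the list) fixes it.
struct Example {
  int shape_;
  int stride_;
  explicit Example(int shape) : stride_(shape * 2), shape_(shape) {}
};
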
