This repository has been archived by the owner on Aug 11, 2020. It is now read-only.

Commit

Add post kernel check + add more known gpu archs + fix warning (#196)
* Add post kernel check

* Revise

* Revise message

* Add more known gpu archs

* Add 52

* Fix warning
sxjscience committed Jan 7, 2017
1 parent 9252a54 commit ccab3b9
Showing 4 changed files with 25 additions and 5 deletions.
2 changes: 1 addition & 1 deletion cmake/Cuda.cmake
@@ -8,7 +8,7 @@ check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)

# Known NVIDIA GPU architectures mshadow can be compiled for.
# This list will be used for CUDA_ARCH_NAME = All option
set(mshadow_known_gpu_archs "20 21(20) 30 35 50 60 61")
set(mshadow_known_gpu_archs "20 21(20) 30 35 50 52 60 61")

################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
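
The list above feeds the CUDA_ARCH_NAME = All option; the newly added "52" is compute capability 5.2 (second-generation Maxwell parts such as the GTX 970/980 and GTX Titan X), which was missing even though 50, 60 and 61 were already listed. When autodetection is enabled instead, Caffe-style Cuda.cmake scripts typically discover the installed GPUs by compiling and running a small CUDA program along these lines (a sketch of the idea, not the exact snippet embedded in mshadow's cmake):

#include <cstdio>
#include <cuda_runtime.h>

// Prints the compute capability of every visible GPU, e.g. "5.2 6.1", which
// the build system can then map onto -gencode arch=compute_XX,code=sm_XX flags.
int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) return 1;
  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
      std::printf("%d.%d ", prop.major, prop.minor);
    }
  }
  std::printf("\n");
  return 0;
}
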
2 changes: 1 addition & 1 deletion make/mshadow.mk
@@ -8,7 +8,7 @@
# Add MSHADOW_NVCCFLAGS to the nvcc compile flags
#----------------------------------------------------------------------------------------

-MSHADOW_CFLAGS = -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs
+MSHADOW_CFLAGS = -funroll-loops -Wno-unused-variable -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs
MSHADOW_LDFLAGS = -lm
MSHADOW_NVCCFLAGS =
MKLROOT =
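
For reference, the added -Wno-unused-variable silences GCC/Clang's unused-variable diagnostic. A hypothetical two-liner (not code from this repository) showing the kind of warning it suppresses:

// With -Wall, g++ reports: unused variable 'unused' [-Wunused-variable].
// The new flag keeps builds quiet when such variables are intentional.
void Example() {
  int unused = 0;
}
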
24 changes: 22 additions & 2 deletions mshadow/cuda/tensor_gpu-inl.cuh
@@ -13,7 +13,12 @@
#endif
#include "../tensor.h"
#include "./reduce.cuh"

#define MSHADOW_CUDA_POST_KERNEL_CHECK(x) \
/* Code block avoids redefinition of cudaError_t err */ \
do { \
cudaError err = cudaPeekAtLastError(); \
CHECK_EQ(err, cudaSuccess) << "Name: " << #x << " ErrStr:" << cudaGetErrorString(err); \
} while (0)
namespace mshadow {
namespace cuda {
/* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */
@@ -98,13 +103,15 @@ inline void MapPlan(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, xstride, dshape, plan);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapPlanKernel);
} else {
int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
dim3 dimGrid(kBaseGridNum, 1 , 1);
MapPlanLargeKernel<Saver, kBaseThreadBits, kBaseGridNum,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, xstride, dshape, plan, repeat);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapPlanLargeKernel);
}
}

@@ -151,6 +158,7 @@ inline void MapReduceKeepLowest(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, eshape);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapRedKeepLowestKernel);
}

template<typename Saver, typename Reducer, int block_dim_bits,
@@ -192,6 +200,7 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
expr::Plan<DstExp, DType>,
expr::Plan<E, DType> >
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, pshape);
MSHADOW_CUDA_POST_KERNEL_CHECK(MapReduceKeepDim1Kernel);
}

template<int x_bits, typename DType>
@@ -213,6 +222,7 @@ inline void GetBatchedView(DType **dst, DType *src, int num, int stride,
CheckLaunchParam(dimGrid, dimBlock, "GetBatchedView");
GetBatchedViewKernel<kBaseThreadBits, DType>
<<<dimGrid, dimBlock, 0, stream_>>> (dst, src, num, stride);
MSHADOW_CUDA_POST_KERNEL_CHECK(GetBatchedViewKernel);
}

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
@@ -320,6 +330,7 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
(expr::MakePlan(dst),
expr::MakePlan(src),
dst.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxKernel);
}

template<typename DType>
@@ -338,6 +349,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(src),
expr::MakePlan(label),
dst.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxGradKernel);
}

template<typename DType>
@@ -358,6 +370,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(label),
dst.size(1),
ignore_label);
MSHADOW_CUDA_POST_KERNEL_CHECK(SoftmaxGradKernel);
}

template<int n_bits, typename DType>
@@ -445,6 +458,7 @@ inline void Softmax(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "Softmax");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src);
MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DKernel);
}

template<typename DType>
@@ -459,6 +473,7 @@ inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DGradKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label);
MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DGradKernel);
}

template<typename DType>
@@ -474,6 +489,7 @@ inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
Softmax3DGradKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label, ignore_label);
MSHADOW_CUDA_POST_KERNEL_CHECK(Softmax3DGradKernel);
}

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
@@ -501,7 +517,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst,
// If the preceding input has the same value as this input, then the warp
// exits immediately. The warp also processes subsequent inputs with the
// same value.
//
//
// Input Warp
// 1 <warp 1>
// 1 <warp 1> (<warp 2> exits without doing any work)
@@ -571,6 +587,7 @@ inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
expr::MakePlan(src),
src.size(0),
src.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradKernel);
}

template<typename IndexType, typename DType>
@@ -604,6 +621,7 @@ inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
src.dptr_,
static_cast<int>(src.size(0)),
static_cast<int>(src.size(1)));
MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel);
}

template<int warp_bits, typename DType, typename DstPlan, typename IndexPlan, typename SrcPlan>
@@ -643,6 +661,7 @@ inline void IndexFill(Tensor<gpu, 2, DType> dst,
expr::MakePlan(src),
src.size(0),
src.size(1));
MSHADOW_CUDA_POST_KERNEL_CHECK(IndexFillKernel);
}

template<typename KDType, typename VDType>
@@ -663,6 +682,7 @@ inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values
thrust::cuda::par.on(stream),
key_iter, key_iter + keys.size(0), value_iter, thrust::greater<KDType>());
}
MSHADOW_CUDA_POST_KERNEL_CHECK(SortByKey);
#else
LOG(FATAL) << "SortByKey is only supported for CUDA version >=7.0!";
#endif
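
The macro defined at the top of this file and invoked after each launch above exists because CUDA kernel launches are asynchronous: configuration and launch failures surface only through cudaPeekAtLastError()/cudaGetLastError(), not as a return value at the <<<>>> call site. Below is a minimal self-contained sketch of the same pattern, assuming a plain fprintf/abort error path where mshadow uses the glog-style CHECK_EQ; the do { } while (0) wrapper keeps err local to each expansion and lets the macro be used like an ordinary statement followed by a semicolon:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of a post-kernel-launch check (hypothetical names, not the mshadow macro).
#define CUDA_POST_KERNEL_CHECK(kernel)                                   \
  do {                                                                   \
    cudaError_t err = cudaPeekAtLastError();                             \
    if (err != cudaSuccess) {                                            \
      std::fprintf(stderr, "Kernel %s failed: %s\n", #kernel,            \
                   cudaGetErrorString(err));                             \
      std::abort();                                                      \
    }                                                                    \
  } while (0)

__global__ void FillKernel(float *data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  const int n = 1024;
  float *data = nullptr;
  cudaMalloc(&data, n * sizeof(float));
  // An invalid configuration or a binary built without code for this GPU is
  // reported here, immediately after the launch, instead of being silently
  // dropped or blamed on a later CUDA call.
  FillKernel<<<(n + 255) / 256, 256>>>(data, n, 1.0f);
  CUDA_POST_KERNEL_CHECK(FillKernel);
  cudaFree(data);
  return 0;
}
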
2 changes: 1 addition & 1 deletion mshadow/extension/transpose.h
@@ -98,9 +98,9 @@ struct TransposeIndicesExp:
public Exp<TransposeIndicesExp<SrcExp, DType, dimsrc, etype>, DType, etype> {
/*! \brief source expression */
const SrcExp &src_indices_; // Expression of the source indices
-Shape<dimsrc> src_shape_; // Holds the corresponding stride of the source axes in dst
const Shape<dimsrc> axes_; // The transpose axes
Shape<dimsrc> src_in_dst_stride_; // Holds the corresponding stride of the source axes in dst
+Shape<dimsrc> src_shape_; // Holds the corresponding stride of the source axes in dst
/*! \brief constructor */
explicit TransposeIndicesExp(const SrcExp &src_indices,
Shape<dimsrc> src_shape,
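
The only change in this file moves the src_shape_ declaration below src_in_dst_stride_. A common reason for reordering members like this (and a plausible reading of the "Fix warning" commit message, though the diff alone does not show which diagnostic fired) is that class members are always initialized in declaration order, so a constructor initializer list written in a different order triggers GCC/Clang's -Wreorder warning. A hypothetical illustration, unrelated to mshadow's actual types:

// With -Wall, g++ warns that 'stride_' will be initialized after 'shape_'
// [-Wreorder]: initialization follows the declaration order, not the order in
// the initializer list. Declaring stride_ first (or swapping the list) fixes it.
struct Example {
  int shape_;
  int stride_;
  explicit Example(int shape) : stride_(shape * 2), shape_(shape) {}
};
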
