From 661cd1bb306805ebc4f2aaf30d008c00f95f30e2 Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Thu, 8 Sep 2016 05:00:46 -0400 Subject: [PATCH] Pushing internal FBCUNN updates --- .gitignore | 1 + CMakeLists.txt | 402 +++++- PATENTS | 2 +- cuda | 2 +- examples/imagenet/README.md | 2 - fbcunn/AbstractParallel.lua | 49 +- fbcunn/BatchNormalization.lua | 185 +++ fbcunn/CuBLASWrapper.lua | 39 +- fbcunn/DataParallel.lua | 10 +- fbcunn/FFTCDefs.lua | 34 + fbcunn/FFTWrapper.lua | 234 +++- fbcunn/FeatureLPPooling.lua | 4 +- fbcunn/LookupTableGPU.lua | 2 +- fbcunn/ModelParallel.lua | 47 + fbcunn/OneBitSGD.lua | 3 - fbcunn/SpatialBatchNormalization.lua | 188 +++ fbcunn/SpatialConvolution.lua | 501 ++++++++ fbcunn/SpatialConvolutionCuFFT.lua | 1081 +++++++++++++---- fbcunn/SpatialConvolutionFBFFT.lua | 433 +++++++ fbcunn/SpatialConvolutionFBFFTGemm.lua | 599 +++++++++ fbcunn/SpatialConvolutionFFT.lua | 1012 +++++++++++++++ fbcunn/SpatialConvolutionFFTTiled.lua | 924 ++++++++++++++ fbcunn/SpatialConvolutionFFTTiledAsync.lua | 369 ++++++ fbcunn/SpatialConvolutionFFTTiledIterated.lua | 231 ++++ fbcunn/SpatialConvolutionFFTTiledSync.lua | 247 ++++ fbcunn/TemporalKMaxPooling.lua | 19 + fbcunn/init.lua | 23 +- src/BLASParameters.cpp | 7 +- src/BLASParameters.h | 27 + src/BatchNormalization.cu | 460 +++++++ src/ConvolutionBias.cu | 35 +- src/CrossMapNormalization.cu | 2 +- src/CrossMapNormalizationHost.cpp | 4 +- src/CuBLASWrapper.cpp | 129 +- src/CuBLASWrapper.h | 15 +- src/CuBLASWrapperLua.cpp | 187 ++- src/CudaTensorUtils.cpp | 10 +- src/CudaTensorUtils.h | 2 +- src/DeviceTensorUtils.h | 2 +- src/FeatureLPPooling.cu | 10 +- src/FeatureLPPoolingHost.cpp | 8 +- src/HSMHost.cpp | 2 +- src/HalfPrec.cpp | 8 +- src/HalfPrecKernels.cu | 6 +- src/HalfPrecTest.cpp | 2 +- src/InitCuda.cpp | 17 +- src/LocallyConnected.cuh | 2 +- src/LocallyConnectedHost.cpp | 10 +- src/LookupTableGPUHost.cpp | 4 +- src/MM.cu | 67 +- src/MM.h | 3 +- src/OneBitQuantization.cu | 2 +- src/OneBitQuantizationHost.cpp | 6 +- src/SparseNLLCriterion.cu | 6 +- src/SparseNLLCriterionHost.cpp | 6 +- src/SpatialBatchNormalization.cu | 791 ++++++++++++ src/TemporalConvolutionFBHost.cpp | 10 +- src/TemporalKMaxPooling.cu | 6 +- src/TemporalKMaxPoolingHost.cpp | 13 +- src/TemporalMaxPooling.cu | 13 +- src/WeightedLookupTable.cu | 51 + src/WeightedLookupTableHost.cpp | 58 + src/fft/CuFFTConvolution.cpp | 30 +- src/fft/CuFFTConvolution_AccGradParameters.cu | 14 +- src/fft/CuFFTConvolution_UpdateGradInput.cu | 12 +- src/fft/CuFFTConvolution_UpdateOutput.cu | 14 +- src/fft/CuFFTStrategy.h | 4 +- src/fft/CuFFTWrapper.cu | 51 +- src/fft/CuFFTWrapper.cuh | 23 +- src/fft/FBFFTDevice.cu | 10 +- src/fft/FBFFTHost.cpp | 36 +- src/fft/FBFFTHost.h | 3 +- src/fft/FFTIteratedConvolution.cu | 98 ++ src/fft/FFTWrapperLua.cpp | 212 +++- src/fft/SpatialConvolutionCuFFT.cpp | 64 +- src/fft/SpatialConvolutionCuFFT.h | 2 +- src/fft/SpatialConvolutionCuFFTHost.cpp | 17 +- src/fft/SpatialConvolutionCuFFTTuner.cpp | 6 +- src/fft/SpatialConvolutionCuFFTTuner.h | 2 +- src/fft/Utils-inl.h | 16 +- src/fft/Utils.h | 2 +- src/util/AsyncCopier.cpp | 7 +- src/util/AsyncCopier.h | 2 +- src/util/GlobalAsyncCopier.cpp | 6 +- src/util/Misc.cpp | 72 +- src/util/Misc.h | 14 +- src/util/Transform.cu | 4 +- src/util/Transform.cuh | 2 +- test/BiasTest.cpp | 50 +- test/ConvolutionTest.cpp | 149 ++- test/CuBLASTest.cpp | 38 +- test/CudaTensorTest.cpp | 95 +- test/CudaTensorTestKernels.cu | 7 +- test/FFTTest.cpp | 55 +- test/InputCentricConvolution_UpdateOutput.cu | 8 +- 
test/ReferenceConvolutions.cpp | 18 +- test/ReferenceConvolutions.h | 2 +- test/TestUtils.cpp | 2 +- test/TestUtils.h | 8 +- test/test.lua | 44 + test/test_BatchNormalization.lua | 227 ++++ test/test_ClassHierarchicalNLLCriterion.lua | 6 +- .../{benchmark_cublas.lua => test_CuBLAS.lua} | 92 +- test/test_CuFFT.lua | 310 +++++ test/test_DataParallel.lua | 190 ++- test/test_DataParallelComprehensive.lua | 132 -- test/test_FBFFTTiling.lua | 208 ++++ test/test_FFT.lua | 615 ++++++++-- test/test_FFTModule.lua | 329 +++-- test/test_SequentialCriterion.lua | 3 +- test/test_SparseNLLCriterion.lua | 4 +- test/test_SpatialConvolutionTuned.lua | 209 ++++ test/test_TemporalKMaxPooling.lua | 14 - test/test_WeightedLookupTable.lua | 26 +- 114 files changed, 10624 insertions(+), 1564 deletions(-) delete mode 100644 examples/imagenet/README.md create mode 100644 fbcunn/BatchNormalization.lua create mode 100644 fbcunn/FFTCDefs.lua create mode 100644 fbcunn/SpatialBatchNormalization.lua create mode 100644 fbcunn/SpatialConvolution.lua create mode 100644 fbcunn/SpatialConvolutionFBFFT.lua create mode 100644 fbcunn/SpatialConvolutionFBFFTGemm.lua create mode 100644 fbcunn/SpatialConvolutionFFT.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiled.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiledAsync.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiledIterated.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiledSync.lua create mode 100644 src/BatchNormalization.cu create mode 100644 src/SpatialBatchNormalization.cu create mode 100644 src/WeightedLookupTable.cu create mode 100644 src/WeightedLookupTableHost.cpp create mode 100644 src/fft/FFTIteratedConvolution.cu create mode 100644 test/test_BatchNormalization.lua rename test/{benchmark_cublas.lua => test_CuBLAS.lua} (56%) create mode 100644 test/test_CuFFT.lua delete mode 100755 test/test_DataParallelComprehensive.lua create mode 100644 test/test_FBFFTTiling.lua create mode 100644 test/test_SpatialConvolutionTuned.lua diff --git a/.gitignore b/.gitignore index 943482a..21db6f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ TARGETS facebook +build diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ebe083..c39a06e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,88 +10,89 @@ FIND_PACKAGE(Torch REQUIRED) INCLUDE(MultiLevelIncludes) MLI_SET_DEPTH(2) FIND_PACKAGE(Folly REQUIRED) -FIND_PACKAGE(CUDA 6.5 REQUIRED) +FIND_PACKAGE(CUDA 7.5 REQUIRED) LIST(APPEND CUDA_NVCC_FLAGS "-arch=sm_35") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -#SET(CMAKE_SKIP_BUILD_RPATH FALSE) -#SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) -#SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) # bad to hardcode like this, but i dont see any other way yet. 
baby steps -SET(CMAKE_INSTALL_RPATH "${Torch_INSTALL_LIB}/lua/5.1;/usr/local/lib:${CMAKE_INSTALL_RPATH}") +SET(CMAKE_INSTALL_RPATH "${Torch_INSTALL_LIB}/lua/5.1;/usr/local/lib:${CMAKE_INSTALL_RPATH}") INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC") LINK_DIRECTORIES("${Torch_INSTALL_LIB}") INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}") INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src") +INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/cuda") + +##################### Lua stuff ######################## FILE(GLOB luasrc fbcunn/*.lua) +INSTALL( + FILES + ${luasrc} + DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/fbcunn") -######################################################## +################### C++ Stuff ######################### +################# libfbcunn SET(src-cuda src/init.cu - src/Utils.cpp ) -CUDA_ADD_LIBRARY(fbcunn MODULE ${src-cuda}) -TARGET_LINK_LIBRARIES(fbcunn luaT THC TH fbcunnlayers_cuda) +CUDA_ADD_LIBRARY(libfbcunn MODULE ${src-cuda}) +TARGET_LINK_LIBRARIES(libfbcunn luaT THC TH thpp folly fbcunn_custate fbcuda_util) ### Torch packages supposes libraries prefix is "lib" -SET_TARGET_PROPERTIES(fbcunn PROPERTIES - PREFIX "lib" - IMPORT_PREFIX "lib") +SET_TARGET_PROPERTIES(libfbcunn PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -INSTALL(TARGETS fbcunn +INSTALL(TARGETS libfbcunn RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") -####################### -SET(src-layers-cuda - src/CrossMapNormalization.cu - src/FeatureLPPooling.cu - src/HalfPrecKernels.cu - src/HSM.cu - src/LookupTableGPU.cu - src/OneBitQuantization.cu - src/SparseNLLCriterion.cu - src/TemporalKMaxPooling.cu - # src/TemporalMaxPooling.cu this is included directly in init.cu - - src/ConvolutionBias.cu - - src/fft/CuFFTWrapper.cu - src/fft/FBFFTDevice.cu - src/fft/CuFFTConvolution_UpdateOutput.cu - src/fft/CuFFTConvolution_UpdateGradInput.cu - src/fft/CuFFTConvolution_AccGradParameters.cu - src/fft/CuFFTConvolution.cpp - src/fft/SpatialConvolutionCuFFT.cpp - src/fft/SpatialConvolutionCuFFTTuner.cpp +################# fbcunn_custate +SET(src-cuda + src/Utils.cpp + ) +CUDA_ADD_LIBRARY(fbcunn_custate SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbcunn_custate luaT THC TH thpp folly) - src/LocallyConnectedAccGradParameters.cu - src/LocallyConnectedUpdateGradInput.cu - src/LocallyConnectedUpdateOutput.cu +SET_TARGET_PROPERTIES(fbcunn_custate PROPERTIES + PREFIX "" + IMPORT_PREFIX "") - src/MM.cu +INSTALL(TARGETS fbcunn_custate + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") - src/util/Transform.cu - src/util/Misc.cpp - src/CudaTensorUtils.cpp - src/CuBLASWrapper.cpp - src/BLASParameters.cpp - cuda/KernelTimer.cpp - src/fft/FBFFTHost.cpp +################# layers_cuda +SET(src-cuda + src/CrossMapNormalization.cu + src/LocallyConnectedUpdateOutput.cu + src/LocallyConnectedUpdateGradInput.cu + src/LocallyConnectedAccGradParameters.cu + src/LookupTableGPU.cu + src/HSM.cu + src/TemporalKMaxPooling.cu + src/SparseNLLCriterion.cu + src/WeightedLookupTable.cu ) +CUDA_ADD_LIBRARY(layers_cuda SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(layers_cuda luaT THC TH thpp folly feature_lp_pooling one_bit_quantization) + +SET_TARGET_PROPERTIES(layers_cuda PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -CUDA_ADD_LIBRARY(fbcunnlayers_cuda SHARED ${src-layers-cuda}) -TARGET_LINK_LIBRARIES(fbcunnlayers_cuda luaT THC TH folly ${CUDA_cufft_LIBRARY}) -INSTALL(TARGETS fbcunnlayers_cuda +INSTALL(TARGETS layers_cuda 
RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") -SET(src-layers-cpp +################# cuda_ext +SET(src-cuda src/InitCuda.cpp src/CrossMapNormalizationHost.cpp + src/LocallyConnectedHost.cpp src/LookupTableGPUHost.cpp src/HSMHost.cpp src/TemporalConvolutionFBHost.cpp @@ -99,23 +100,298 @@ SET(src-layers-cpp src/OneBitQuantizationHost.cpp src/SparseNLLCriterionHost.cpp src/FeatureLPPoolingHost.cpp - src/fft/SpatialConvolutionCuFFTHost.cpp - src/fft/FFTWrapperLua.cpp src/CuBLASWrapperLua.cpp + src/fft/FFTWrapperLua.cpp + src/fft/SpatialConvolutionCuFFT.cpp + src/fft/SpatialConvolutionCuFFTHost.cpp + src/fft/SpatialConvolutionCuFFTTuner.cpp + src/WeightedLookupTableHost.cpp + ) +CUDA_ADD_LIBRARY(cuda_ext MODULE ${src-cuda}) +TARGET_LINK_LIBRARIES(cuda_ext luaT THC TH thpp folly libtorch_fb_fbcunn_convolution_bias cublas_wrapper cufft_convolution_cuda cufft_convolution_host fbcunn_custate layers_cuda torch_fb_fbcunn_mm fbcuda_util) - src/LocallyConnectedHost.cpp - src/Utils.cpp -) +SET_TARGET_PROPERTIES(cuda_ext PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -ADD_LIBRARY(fbcunnlayers MODULE ${src-layers-cpp}) -TARGET_LINK_LIBRARIES(fbcunnlayers fbcunnlayers_cuda luaT THC TH folly) +INSTALL(TARGETS cuda_ext + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}/fbcunn" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}/fbcunn") + +################# libhalfprec +SET(src-cuda + src/HalfPrec.cpp + ) +CUDA_ADD_LIBRARY(libhalfprec MODULE ${src-cuda}) +TARGET_LINK_LIBRARIES(libhalfprec luaT THC TH thpp folly fbcunn_custate libcudahalf fbcuda_util) + +SET_TARGET_PROPERTIES(libhalfprec PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -INSTALL(TARGETS fbcunnlayers +INSTALL(TARGETS libhalfprec RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") -INSTALL( - FILES - ${luasrc} - DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/fbcunn") \ No newline at end of file +################# libcudahalf +SET(src-cuda + src/HalfPrecKernels.cu + ) +CUDA_ADD_LIBRARY(libcudahalf SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(libcudahalf luaT THC TH thpp folly util ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(libcudahalf PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS libcudahalf + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cublas_wrapper +SET(src-cuda + src/BLASParameters.cpp + src/CuBLASWrapper.cpp + ) +CUDA_ADD_LIBRARY(cublas_wrapper SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cublas_wrapper luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(cublas_wrapper PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cublas_wrapper + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# torch_fb_fbcunn_mm +SET(src-cuda + src/MM.cu + ) +CUDA_ADD_LIBRARY(torch_fb_fbcunn_mm SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(torch_fb_fbcunn_mm luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(torch_fb_fbcunn_mm PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS torch_fb_fbcunn_mm + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION 
"${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# torch_fb_fbcunn_FFTIteratedConvolution +SET(src-cuda + src/fft/FFTIteratedConvolution.cu + ) +CUDA_ADD_LIBRARY(torch_fb_fbcunn_FFTIteratedConvolution SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(torch_fb_fbcunn_FFTIteratedConvolution luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(torch_fb_fbcunn_FFTIteratedConvolution PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS torch_fb_fbcunn_FFTIteratedConvolution + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# torch_fb_fbcunn_batch_norm +SET(src-cuda + src/BatchNormalization.cu + src/SpatialBatchNormalization.cu + ) +CUDA_ADD_LIBRARY(torch_fb_fbcunn_batch_norm SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(torch_fb_fbcunn_batch_norm luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(torch_fb_fbcunn_batch_norm PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS torch_fb_fbcunn_batch_norm + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# one_bit_quantization +SET(src-cuda + src/OneBitQuantization.cu + ) +CUDA_ADD_LIBRARY(one_bit_quantization SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(one_bit_quantization luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(one_bit_quantization PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS one_bit_quantization + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + + +################# feature_lp_pooling +SET(src-cuda + src/FeatureLPPooling.cu + ) +CUDA_ADD_LIBRARY(feature_lp_pooling SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(feature_lp_pooling luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(feature_lp_pooling PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS feature_lp_pooling + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cuda_tensor_utils +SET(src-cuda + src/CudaTensorUtils.cpp + ) +CUDA_ADD_LIBRARY(cuda_tensor_utils SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cuda_tensor_utils luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(cuda_tensor_utils PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cuda_tensor_utils + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbfft_wrapper +SET(src-cuda + src/fft/FBFFTHost.cpp + ) +CUDA_ADD_LIBRARY(fbfft_wrapper SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbfft_wrapper luaT THC TH thpp folly cuda_tensor_utils fbfft_lib fbcuda_kernel_timer) + +SET_TARGET_PROPERTIES(fbfft_wrapper PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbfft_wrapper + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbfft_lib +SET(src-cuda + src/fft/FBFFTDevice.cu + ) +CUDA_ADD_LIBRARY(fbfft_lib SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbfft_lib luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(fbfft_lib PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbfft_lib + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cufft_convolution_cuda +SET(src-cuda + 
src/fft/CuFFTConvolution_UpdateOutput.cu + src/fft/CuFFTConvolution_AccGradParameters.cu + src/fft/CuFFTConvolution_UpdateGradInput.cu + ) +CUDA_ADD_LIBRARY(cufft_convolution_cuda SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cufft_convolution_cuda luaT THC TH thpp folly libtorch_fb_fbcunn_convolution_bias cublas_wrapper cufft_wrapper cufft_convolution_host ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(cufft_convolution_cuda PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cufft_convolution_cuda + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cufft_convolution_host +SET(src-cuda + src/fft/CuFFTConvolution.cpp + ) +CUDA_ADD_LIBRARY(cufft_convolution_host SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cufft_convolution_host luaT THC TH thpp folly cufft_wrapper cublas_wrapper fbfft_wrapper torch_fb_fbcunn_mm) + +SET_TARGET_PROPERTIES(cufft_convolution_host PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cufft_convolution_host + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cufft_wrapper +SET(src-cuda + src/fft/CuFFTWrapper.cu + ) +CUDA_ADD_LIBRARY(cufft_wrapper SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cufft_wrapper luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(cufft_wrapper PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cufft_wrapper + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# util +SET(src-cuda + src/util/Transform.cu + src/util/AsyncCopier.cpp + src/util/GlobalAsyncCopier.cpp + src/util/Misc.cpp + ) +CUDA_ADD_LIBRARY(util SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(util luaT THC TH thpp folly fbcuda_util) + +SET_TARGET_PROPERTIES(util PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS util + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# libtorch_fb_fbcunn_convolution_bias +SET(src-cuda + src/ConvolutionBias.cu + ) +CUDA_ADD_LIBRARY(libtorch_fb_fbcunn_convolution_bias SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(libtorch_fb_fbcunn_convolution_bias luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(libtorch_fb_fbcunn_convolution_bias PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS libtorch_fb_fbcunn_convolution_bias + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbcuda_util +SET(src-cuda + cuda/util/CachedDeviceProperties.cpp + ) +CUDA_ADD_LIBRARY(fbcuda_util SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbcuda_util luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(fbcuda_util PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbcuda_util + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbcuda_kernel_timer +SET(src-cuda + cuda/KernelTimer.cpp + ) +CUDA_ADD_LIBRARY(fbcuda_kernel_timer SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbcuda_kernel_timer luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(fbcuda_kernel_timer PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbcuda_kernel_timer + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION 
"${Torch_INSTALL_LUA_CPATH_SUBDIR}") diff --git a/PATENTS b/PATENTS index 4da7ff6..51e5a49 100644 --- a/PATENTS +++ b/PATENTS @@ -30,4 +30,4 @@ necessarily infringed by the Software standing alone. A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, or contributory infringement or inducement to infringe any patent, including a -cross-claim or counterclaim. \ No newline at end of file +cross-claim or counterclaim. diff --git a/cuda b/cuda index 26f01ee..8ef2af9 160000 --- a/cuda +++ b/cuda @@ -1 +1 @@ -Subproject commit 26f01ee8f8a3035cd58adb5ccf02245e58c06c04 +Subproject commit 8ef2af9b579b8610c59ab0ccf6e9075350de4320 diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md deleted file mode 100644 index 2f207eb..0000000 --- a/examples/imagenet/README.md +++ /dev/null @@ -1,2 +0,0 @@ -This example has been moved to [https://github.com/soumith/imagenet-multiGPU.torch](https://github.com/soumith/imagenet-multiGPU.torch). -It will be maintained there further. diff --git a/fbcunn/AbstractParallel.lua b/fbcunn/AbstractParallel.lua index 005e83c..c8e8167 100644 --- a/fbcunn/AbstractParallel.lua +++ b/fbcunn/AbstractParallel.lua @@ -44,6 +44,45 @@ function AbstractParallel:_freeCaches() self.gradInput_gpu = {} end +-- override nn.Module.type to handle gpu_assignments +function AbstractParallel:type(type, tensorCache) + if not type then + return self._type + end + + self:_freeCaches() + + local current_gpuid = cutorch.getDevice() + for i, module in ipairs(self.modules) do + cutorch.setDevice(self.gpu_assignments[i]) + module:type('torch.FloatTensor', {}):type(type, {}) + end + cutorch.setDevice(current_gpuid) + + for key,param in pairs(self) do + if key ~= 'modules' then + self[key] = nn.utils.recursiveType(param, type, tensorCache) + end + end + + self._type = type + return self +end + +-- override nn.Module.apply to handle gpu_assignments +function AbstractParallel:apply(callback) + + callback(self) + + local current_gpuid = cutorch.getDevice() + for i, module in ipairs(self.modules) do + cutorch.setDevice(self.gpu_assignments[i]) + module:apply(callback) + end + cutorch.setDevice(current_gpuid) +end + + --[[ This function yields the GPU id for the module to be added. @@ -238,10 +277,6 @@ function AbstractParallel:updateParameters(learningRate) end end -function AbstractParallel:share(mlp,...) - error("Share is not supported for the AbstractParallel layer.") -end - function AbstractParallel:clone() local clone = parent.clone(self) clone:cuda() @@ -255,3 +290,9 @@ function AbstractParallel:reset(stdv) end) end end + +function AbstractParallel:clearState() + self:_freeCaches() + + parent.clearState(self) +end diff --git a/fbcunn/BatchNormalization.lua b/fbcunn/BatchNormalization.lua new file mode 100644 index 0000000..1135866 --- /dev/null +++ b/fbcunn/BatchNormalization.lua @@ -0,0 +1,185 @@ +--[[ + This file implements Batch Normalization as described in the paper: + "Batch Normalization: Accelerating Deep Network Training + by Reducing Internal Covariate Shift" + by Sergey Ioffe, Christian Szegedy + + This implementation is useful for inputs NOT coming from convolution layers. + For Convolution layers, see SpatialBatchNormalization.lua + + The operation implemented is: + y = ( x - mean(x) ) + -------------------- * gamma + beta + standard-deviation(x) + where gamma and beta are learnable parameters. + + The learning of gamma and beta is optional. 
+ + Usage: + with learnable parameters: nn.BatchNormalization(N [, eps] [,momentum]) + where N = dimensionality of input + without learnable parameters: nn.BatchNormalization(0 [, eps] [,momentum]) + + eps is a small value added to the standard-deviation to avoid divide-by-zero. + Defaults to 1e-5 + + Training: this layer keeps a running estimate of it's computed mean and std. + The running sum is kept with a default momentup of 0.1 (unless over-ridden) + Testing: this running mean/std is used to normalize. +]]-- + + +local ffi = require 'ffi' + +ffi.cdef[[ + void BatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine); + void BatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine); + void BatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale); +]] + +local lib_name = 'torch_fb_fbcunn_batch_norm' +local lib_path = package.searchpath(lib_name, package.cpath) +local BNFFI = ffi.load(lib_path and lib_path or lib_name) + +local BN, parent = torch.class('fbnn.BatchNormalization', 'nn.Module') + +function BN:__init(nOutput, eps, momentum, affine) + parent.__init(self) + assert(nOutput and type(nOutput) == 'number', + 'Missing argument #1: dimensionality of input. ') + assert(nOutput ~= 0, 'To set affine=false call BatchNormalization' + .. '(nOutput, eps, momentum, false) ') + if affine ~= nil then + assert(type(affine) == 'boolean', 'affine has to be true/false') + self.affine = affine + else + self.affine = true + end + self.eps = eps or 1e-5 + self.train = true + self.momentum = momentum or 0.1 + self.running_mean = torch.zeros(nOutput):cuda() + self.running_std = torch.ones(nOutput):cuda() + + if self.affine then + self.weight = torch.CudaTensor(nOutput) + self.bias = torch.CudaTensor(nOutput) + self.gradWeight = torch.CudaTensor(nOutput) + self.gradBias = torch.CudaTensor(nOutput) + self:reset() + else + -- Give me empty tensors for proper FFI behavior + self.weight = torch.CudaTensor() + self.bias = torch.CudaTensor() + self.gradWeight = torch.CudaTensor() + self.gradBias = torch.CudaTensor() + end + + -- Initialize from input on the first updateOutput / updateGradInput + self.output = nil + self.gradInput = nil +end + +function BN:reset() + self.weight:uniform() + self.bias:zero() +end + +function BN:updateOutput(input) + assert(input:dim() == 2, 'only mini-batch supported (2D tensor), got ' + .. input:dim() .. 
'D tensor instead') + + self.std = self.std or self.running_std:clone():zero():cuda() + self.std:resizeAs(self.running_std) + self.centered = self.centered or input:clone():zero():cuda() + self.centered:resizeAs(input) + self.normalized = self.normalized or input:clone():zero():cuda() + self.normalized:resizeAs(input) + self.output = self.output or input:clone():zero():cuda() + self.output:resizeAs(input) + + BNFFI.BatchNormalizationUpdateOutputFFI(cutorch._state, + input:cdata(), + self.output:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.normalized:cdata(), + self.running_mean:cdata(), + self.running_std:cdata(), + self.weight:cdata(), + self.bias:cdata(), + self.eps, + self.momentum, + self.train, + self.affine) + + return self.output +end + +function BN:updateGradInput(input, gradOutput) + assert(input:dim() == 2, 'only mini-batch supported') + assert(gradOutput:dim() == 2, 'only mini-batch supported') + assert(self.train == true, + 'should be in training mode when self.train is true') + + self.gradInput = self.gradInput or input:clone():zero():cuda() + self.gradInput:resizeAs(input) + + BNFFI.BatchNormalizationUpdateGradInputFFI(cutorch._state, + self.gradInput:cdata(), + gradOutput:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.weight:cdata(), + self.affine) + + return self.gradInput +end + +function BN:accGradParameters(input, gradOutput, scale) + if self.affine then + scale = scale or 1.0 + BNFFI.BatchNormalizationAccGradParametersFFI(cutorch._state, + gradOutput:cdata(), + self.normalized:cdata(), + self.gradWeight:cdata(), + self.gradBias:cdata(), + scale) + end + +end + +function BN:clearState() + self.centered = nil + self.std = nil + self.normalized = nil + + parent.clearState(self) +end diff --git a/fbcunn/CuBLASWrapper.lua b/fbcunn/CuBLASWrapper.lua index 62cb6e8..5d0726d 100644 --- a/fbcunn/CuBLASWrapper.lua +++ b/fbcunn/CuBLASWrapper.lua @@ -2,17 +2,48 @@ local CuBLASWrapper = torch.class('nn.CuBLASWrapper') -function CuBLASWrapper:__init() +function CuBLASWrapper:__init(timed) self.iterDims = 0 self.batchDims = 0 self.handles = 0 self.streams = 0 + self.timed = timed or false end -function CuBLASWrapper:matmult(A, B, C, iterDims, batchDims, handles, streams) +function CuBLASWrapper:matmult( + A, B, C, iterDims, batchDims, transA, transB, scale) + self.transA = transA or 'n' + self.transB = transB or 'n' self.iterDims = table.getn(iterDims) or 0 self.batchDims = table.getn(batchDims) or 0 - self.handles = handles or 0 - self.streams = streams or 0 + self.scale = scale or 1.0 A.nn.CuBLASWrapper_matmult(self, A, B, C) end + +function CuBLASWrapper:matmultComplex( + A, B, C, iterDims, batchDims, transA, transB, scale) + self.transA = transA or 'n' + self.transB = transB or 'n' + self.iterDims = table.getn(iterDims) or 0 + self.batchDims = table.getn(batchDims) or 0 + self.scale = scale or 1.0 + A.nn.CuBLASWrapper_matmultComplex(self, A, B, C) +end + +function CuBLASWrapper:transpose( + A, B, separator, transposeMetaData, handle, stream) + self.separator = separator or 0 + self.transposeMetaData = transposeMetaData or false + self.handle = handle or 1 -- always handle 1 by default + self.stream = stream or 0 + A.nn.CuBLASWrapper_transpose(self, A, B) +end + +function CuBLASWrapper:transposeComplex( + A, B, separator, transposeMetaData, handle, stream) + self.separator = separator or 0 + self.transposeMetaData = transposeMetaData or false + self.handle = handle or 1 -- always handle 1 by default + self.stream = stream or 0 + 
A.nn.CuBLASWrapper_transposeComplex(self, A, B) +end diff --git a/fbcunn/DataParallel.lua b/fbcunn/DataParallel.lua index eb4571b..99686e0 100644 --- a/fbcunn/DataParallel.lua +++ b/fbcunn/DataParallel.lua @@ -43,8 +43,8 @@ Pictorially +--------+ ``` ]] -local DataParallel, _ = torch.class('nn.DataParallel', - 'nn.AbstractParallel') +local DataParallel, parent = torch.class('nn.DataParallel', + 'nn.AbstractParallel') -- `_distributeInput` slices the input along self.dimension -- and copies each portion into each child module. @@ -182,3 +182,9 @@ function DataParallel:accUpdateGradParameters(_input, _gradOutput, lr) -- like mixGrads, averages the weights across all GPUs error('accUpdateGradParameters not implemented for: ' .. torch.type(self)) end + +function DataParallel:clearState() + self.homeGradBuffers = {} + + parent.clearState(self) +end diff --git a/fbcunn/FFTCDefs.lua b/fbcunn/FFTCDefs.lua new file mode 100644 index 0000000..d2d72d5 --- /dev/null +++ b/fbcunn/FFTCDefs.lua @@ -0,0 +1,34 @@ +local ffi = require 'ffi' + +ffi.cdef[[ + void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); + void accGradParametersBiasFFI( + THCState*, THCudaTensor*, THCudaTensor*, float scale); + void transposeMMFFI(THCState*, + THCudaTensor* tA, + THCudaTensor* tB, + THCudaTensor* tC, + float invNorm, + bool conjugateTransposeA, + bool conjugateTransposeB, + bool accumulate); + typedef struct { + static const int FFT_UpdateOutput = 0; + static const int FFT_UpdateGradInput = 1; + static const int FFT_AccGradParameters = 2; + int pass; + } FFTConvolutionPassFFI; + typedef struct { + THCudaTensor* tensor; + int padL; + int padU; + } TiledDeviceTensorFFI; + void convolveIteratedFFI(THCState* state, + TiledDeviceTensorFFI* input, + THCudaTensor* weight, + TiledDeviceTensorFFI* output, + int numTiles, + int fftSize, + FFTConvolutionPassFFI pass, + float scale); +]] diff --git a/fbcunn/FFTWrapper.lua b/fbcunn/FFTWrapper.lua index 83c35f8..42f0c6a 100644 --- a/fbcunn/FFTWrapper.lua +++ b/fbcunn/FFTWrapper.lua @@ -1,56 +1,198 @@ -- Copyright 2004-present Facebook. All Rights Reserved. +local ffi = require 'ffi' +local package_path = package.searchpath('cufft_wrapper', package.cpath) +if not package_path then -- not OSS + package_path = 'torch_fb_fbcunn_cufft_wrapper' +end +local CuFFTFFI = ffi.load(package_path) + +ffi.cdef[[ +typedef int cufftHandle; +typedef int cufftResult; +typedef int cufftHandle; + +typedef struct { + cufftHandle handle; +} cufftHandleWrapper; + +cufftResult cufftDestroy(cufftHandle plan); +void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); +cufftHandle makeCuFFTPlanFFI(THCState* state, + THCudaTensor* realTH, + THCudaTensor* cplxTH, + bool direction, + bool normalize, + int fftVersion, + int batchDimensions); +]] + local FFTWrapper = torch.class('nn.FFTWrapper') -function FFTWrapper:__init(cufft) - self.batchDims = 0 - self.cufft = cufft or 1 +FFTWrapper.emptyBuffer = torch.CudaTensor() + +function FFTWrapper:__init(cufft, padLeft, padUp, timed) + self.batchDims = 0 + + if cufft == nil or cufft == "cufft" then + self.cufft = true + else + self.cufft = false + end + + if timed == "timed" then + self.timed = true + else + self.timed = false + end + + self.padLeft = padLeft or 0 + self.padUp = padUp or 0 end -function FFTWrapper:fft(time, frequency, batchDims) - assert(batchDims >= 1) - assert(batchDims <= 2) - self.batchDims = batchDims - -- If calling fft from lua directly, just pass a buffer in any case. 
- -- In practice it is only really needed for 2d-fft of size > 32 - local buffer = {} - if self.cufft == 1 then - if #frequency:size() == 4 then - assert(frequency:size()[2] / 2 + 1 == frequency:size()[3]) - end - -- Need to allocate explicit cufft plans, a buffer is not enough - buffer = torch.CudaTensor(torch.LongStorage({1, 1, 1, 1})) - else - if #frequency:size() == 4 then - assert(frequency:size()[3] / 2 + 1 == frequency:size()[2]) - end - buffer = frequency:clone() - end - time.nn.FFTWrapper_fft(self, time, frequency, buffer) +function FFTWrapper:fft(time, frequency, batchDims, plan) + assert(batchDims >= 1) + assert(batchDims <= 2) + assert(torch.type(time) == 'torch.CudaTensor', 'FBFFT only with CudaTensors') + self.batchDims = batchDims + -- If calling fft from lua directly, just pass a buffer in any case. + -- In practice it is only really needed for 2d-fft of size > 32 + local buffer = FFTWrapper.emptyBuffer + if not self.cufft then + -- Make full buffer to hold the whole complex tensor if needed + -- TODO: Maybe fix this don't want to manage memory here. + -- On the other hand we don't care much since we should use tiling anyway + local fftDim = (#time:size() - batchDims) + local needsBuffer = false + for i = 1, fftDim do + if time:size(self.batchDims + i) > 32 or + frequency:size(self.batchDims + i) > 32 then + needsBuffer = true + end + end + if needsBuffer then + if fbnn.SpatialConvolution.reportWarnings then + print('FFTWrapper.lua: Perf killed by on-the-fly allocation, ', + 'consider using tiling and stay under 32 FFT size') + end + buffer = frequency:clone() + end + end + local handle = -1 + if plan then + handle = plan.handle + end + time.nn.FFTWrapper_fft(self, time, frequency, buffer, handle) end -function FFTWrapper:ffti(time, frequency, batchDims) - assert(batchDims >= 1) - assert(batchDims <= 2) - self.batchDims = batchDims - -- In practice it is only really needed for 2d-fft of size > 32 - local size = frequency:size() - local bufferSize = {} - local buffer = torch.CudaTensor(torch.LongStorage({1, 1, 1, 1})) - -- Make full buffer to hold the whole complex tensor if needed - if self.cufft == 1 then - if #time:size() - batchDims == 2 then - assert(size[2] / 2 + 1 == size[3]) - end - elseif batchDims == 1 and #size == 4 then - if batchDims == 1 and #size == 4 then - assert(size[3] / 2 + 1 == size[2]) - -- - bufferSize = torch.LongStorage({size[1], size[3], size[3], size[4]}) +function FFTWrapper:ffti(time, frequency, batchDims, plan) + assert(batchDims >= 1) + assert(batchDims <= 2) + assert(torch.type(time) == 'torch.CudaTensor', 'FBFFT only with CudaTensors') + self.batchDims = batchDims + -- In practice it is only really needed for 2d-fft of size > 32 + local size = frequency:size() + local buffer = FFTWrapper.emptyBuffer + + if not self.cufft then + -- Make full buffer to hold the whole complex tensor if needed + -- TODO: Maybe fix this don't want to manage memory here. 
+ -- On the other hand we don't care much since we should use tiling anyway + local fftDim = (#time:size() - batchDims) + local needsBuffer = false + for i = 1, fftDim do + if time:size(self.batchDims + i) > 32 or + frequency:size(self.batchDims + i) > 32 then + needsBuffer = true + end + end + if needsBuffer and fftDim == 2 then + if fbnn.SpatialConvolution.reportWarnings then + print('FFTWrapper.lua: Perf killed by on-the-fly allocation, ', + 'consider using tiling and stay under 32 FFT size') + end + if batchDims == 1 then + local bufferSize = torch.LongStorage({ + size[1], size[3], size[3], size[4]}) buffer = torch.CudaTensor(bufferSize) - else - buffer = frequency:clone() - end - end - time.nn.FFTWrapper_ffti(self, time, frequency, buffer) + elseif batchDims == 2 then + local bufferSize = torch.LongStorage({ + size[1], size[2], size[4], size[4], size[5]}) + buffer = torch.CudaTensor(bufferSize) + end + end + end + + local handle = -1 + if plan then + handle = plan.handle + end + + time.nn.FFTWrapper_ffti(self, time, frequency, buffer, handle) +end + + +-- CuFFTPlan allocation occurs in here because it depends on the tensor shape +-- after transposition +function FFTWrapper:fftTranspose(tensor, bufferComplex, bufferComplexTranspose, + batchDims, handle, stream, plan) + local transposeSeparator = batchDims + cutorch.setBlasHandle(handle) + cutorch.setStream(stream) + if self.cufft and not plan then + local version = 0 + plan = ffi.new('cufftHandleWrapper') + plan.handle = CuFFTFFI.makeCuFFTPlanFFI(cutorch._state, + tensor:cdata(), + bufferComplex:cdata(), + true, + false, + version, + batchDims) + ffi.gc(plan, function(p) + CuFFTFFI.cufftDestroy(p.handle) + end) + end + self:fft(tensor, bufferComplex, batchDims, plan) + local cublasWrapper = nn.CuBLASWrapper() + cublasWrapper:transposeComplex(bufferComplex, + bufferComplexTranspose, + transposeSeparator, + false, + handle, + stream) + return plan +end + +-- CuFFTPlan allocation occurs in here because it depends on the tensor shape +-- after transposition +function FFTWrapper:transposeIFFT(tensor, bufferComplex, bufferComplexTranspose, + batchDims, handle, stream, plan) + local transposeSeparator = batchDims + cutorch.setBlasHandle(handle) + cutorch.setStream(stream) + local cublasWrapper = nn.CuBLASWrapper() + cublasWrapper:transposeComplex(bufferComplexTranspose, + bufferComplex, + transposeSeparator, + false, + handle, + stream) + + if self.cufft and not plan then + local version = 0 + plan = ffi.new('cufftHandleWrapper') + plan.handle = CuFFTFFI.makeCuFFTPlanFFI(cutorch._state, + tensor:cdata(), + bufferComplex:cdata(), + false, + false, + version, + batchDims) + ffi.gc(plan, function(p) + CuFFTFFI.cufftDestroy(p.handle) + end) + end + self:ffti(tensor, bufferComplex, batchDims, plan) + return plan end diff --git a/fbcunn/FeatureLPPooling.lua b/fbcunn/FeatureLPPooling.lua index b460407..e15cbbc 100644 --- a/fbcunn/FeatureLPPooling.lua +++ b/fbcunn/FeatureLPPooling.lua @@ -50,7 +50,7 @@ function FeatureLPPooling:__init(width, stride, power, batch_mode) end function FeatureLPPooling:updateOutput(input) - if self:type() == 'torch.CudaTensor' then + if torch.type(input) == 'torch.CudaTensor' then input.nn.FeatureLPPooling_updateOutput(self, input) else error('CUDA only supported at the moment') @@ -59,7 +59,7 @@ function FeatureLPPooling:updateOutput(input) end function FeatureLPPooling:updateGradInput(input, gradOutput) - if self:type() == 'torch.CudaTensor' then + if torch.type(input) == 'torch.CudaTensor' then 
input.nn.FeatureLPPooling_updateGradInput(self, input, gradOutput) else error('CUDA only supported at the moment') diff --git a/fbcunn/LookupTableGPU.lua b/fbcunn/LookupTableGPU.lua index 8d33f82..1aea141 100644 --- a/fbcunn/LookupTableGPU.lua +++ b/fbcunn/LookupTableGPU.lua @@ -29,7 +29,7 @@ end function LookupTableGPU:reset(stdv) stdv = stdv or 1 - self.weight:normal(stdv) + self.weight:normal(0, stdv) end function LookupTableGPU:parameters() diff --git a/fbcunn/ModelParallel.lua b/fbcunn/ModelParallel.lua index d1ded2f..4919099 100644 --- a/fbcunn/ModelParallel.lua +++ b/fbcunn/ModelParallel.lua @@ -182,3 +182,50 @@ function ModelParallel:updateGradInput(_input, gradOutput) return self.gradInput end + + +function ModelParallel:backward(_input, gradOutput, scale) + self:_distributeGradOutput(_input, gradOutput) + + scale = scale or 1 + -- update gradInput for each module + for i,module in ipairs(self.modules) do + local gpuid = self.gpu_assignments[i] + withDevice(gpuid, function() + module:backward(self.input_gpu[gpuid], + self.gradOutput_gpu[i], + scale) + end) + end + + if not self.gradInput then return end -- if gradInput is nil, do nothing + self.gradInput:resizeAs(self.input_gpu[self.container_gpuid]) + + -- add gradInputs + for i, module in ripairs(self.modules) do + if module.gradInput then + if i == 1 then + self.gradInput:copy(module.gradInput) + return self.gradInput + end + + local parent_module_idx = math.floor(i / 2) + local parent_gpuid = self.gpu_assignments[parent_module_idx] + withDevice(parent_gpuid, function() + if not self.gradInput_gpu[i] then + self.gradInput_gpu[i] = torch.CudaTensor() + end + + self.gradInput_gpu[i]:resizeAs(module.gradInput) + self:gpuSend(self.gradInput_gpu[i], module.gradInput) + self.modules[parent_module_idx].gradInput:add( + self.gradInput_gpu[i]) + end) + end + end + + -- Combine gradients for data parallel models + self:_mixGrads() + + return self.gradInput +end diff --git a/fbcunn/OneBitSGD.lua b/fbcunn/OneBitSGD.lua index cc1b903..dc47cda 100644 --- a/fbcunn/OneBitSGD.lua +++ b/fbcunn/OneBitSGD.lua @@ -4,9 +4,6 @@ OneBitSGD contains various utility functions for use in OneBitDataParallel, expo local M = {} -local _fbd = require('fb.debugger') -local _trace = require('fb.util.trace') - local pl = require('pl.import_into')() local util = require('fb.util') local withDevice = cutorch.withDevice diff --git a/fbcunn/SpatialBatchNormalization.lua b/fbcunn/SpatialBatchNormalization.lua new file mode 100644 index 0000000..b894d7a --- /dev/null +++ b/fbcunn/SpatialBatchNormalization.lua @@ -0,0 +1,188 @@ +--[[ + This file implements Batch Normalization as described in the paper: + "Batch Normalization: Accelerating Deep Network Training + by Reducing Internal Covariate Shift" + by Sergey Ioffe, Christian Szegedy + + This implementation is useful for inputs coming from convolution layers. + For Non-convolutional layers, see BatchNormalization.lua + + The operation implemented is: + y = ( x - mean(x) ) + -------------------- * gamma + beta + standard-deviation(x) + where gamma and beta are learnable parameters. + + The learning of gamma and beta is optional. + + Usage: + with learnable parameters: nn.BatchNormalization(N [,eps] [,momentum]) + where N = dimensionality of input + without learnable parameters: nn.BatchNormalization(0 [,eps] [,momentum]) + + eps is a small value added to the standard-deviation to avoid divide-by-zero. + Defaults to 1e-5 + + At training, it keeps a running estimate of its computed mean and std. 
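  A sketch of that running update, with momentum m and hypothetical per-feature
  statistics batch_mean / batch_std of the current batch (the exact update
  performed by the CUDA kernel may differ slightly):

    running_mean:mul(1 - m):add(m, batch_mean)
    running_std:mul(1 - m):add(m, batch_std)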
+ The running sum is kept with a default momentup of 0.1 (unless over-ridden) + At testing, this running mean/std is used to normalize. +--]] + +local ffi = require 'ffi' + +ffi.cdef[[ + void SpatialBatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine); + void SpatialBatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine); + void SpatialBatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale); +]] + +local lib_name = 'torch_fb_fbcunn_batch_norm' +local lib_path = package.searchpath(lib_name, package.cpath) +local BNFFI = ffi.load(lib_path and lib_path or lib_name) + +local BN, parent = torch.class('fbnn.SpatialBatchNormalization', 'nn.Module') + +function BN:__init(nFeature, eps, momentum, affine) + parent.__init(self) + assert(nFeature and type(nFeature) == 'number', + 'Missing argument #1: Number of feature planes. ') + assert(nFeature ~= 0, 'To set affine=false call SpatialBatchNormalization' + .. '(nFeature, eps, momentum, false) ') + if affine ~=nil then + assert(type(affine) == 'boolean', 'affine has to be true/false') + self.affine = affine + else + self.affine = true + end + self.eps = eps or 1e-5 + self.train = true + self.momentum = momentum or 0.1 + + self.running_mean = torch.zeros(nFeature):cuda() + self.running_std = torch.ones(nFeature):cuda() + if self.affine then + self.weight = torch.CudaTensor(nFeature) + self.bias = torch.CudaTensor(nFeature) + self.gradWeight = torch.CudaTensor(nFeature) + self.gradBias = torch.CudaTensor(nFeature) + self:reset() + else + -- Give me empty tensors for proper FFI behavior + self.weight = torch.CudaTensor() + self.bias = torch.CudaTensor() + self.gradWeight = torch.CudaTensor() + self.gradBias = torch.CudaTensor() + end + + -- Initialize from input on the first updateOutput / updateGradInput + self.output = nil + self.gradInput = nil +end + +function BN:reset() + self.weight:uniform() + self.bias:zero() +end + +function BN:updateOutput(input) + assert(input:dim() == 4, 'only mini-batch supported (4D tensor), got ' + .. input:dim() .. 
'D tensor instead') + + self.std = self.std or self.running_std:clone():zero():cuda() + self.std:resizeAs(self.running_std) + self.centered = self.centered or input:clone():zero():cuda() + self.centered:resizeAs(input) + self.normalized = self.normalized or input:clone():zero():cuda() + self.normalized:resizeAs(input) + self.output = self.output or input:clone():zero():cuda() + self.output:resizeAs(input) + + BNFFI.SpatialBatchNormalizationUpdateOutputFFI( + cutorch._state, + input:cdata(), + self.output:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.normalized:cdata(), + self.running_mean:cdata(), + self.running_std:cdata(), + self.weight:cdata(), + self.bias:cdata(), + self.eps, + self.momentum, + self.train, + self.affine) + + return self.output +end + +function BN:updateGradInput(input, gradOutput) + assert(input:dim() == 4, 'only mini-batch supported') + assert(gradOutput:dim() == 4, 'only mini-batch supported') + assert(self.train == true, + 'should be in training mode when self.train is true') + + self.gradInput = self.gradInput or input:clone():zero():cuda() + self.gradInput:resizeAs(input) + + BNFFI.SpatialBatchNormalizationUpdateGradInputFFI( + cutorch._state, + self.gradInput:cdata(), + gradOutput:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.weight:cdata(), + self.affine) + + return self.gradInput +end + +function BN:accGradParameters(input, gradOutput, scale) + if self.affine then + scale = scale or 1.0 + BNFFI.SpatialBatchNormalizationAccGradParametersFFI( + cutorch._state, + gradOutput:cdata(), + self.normalized:cdata(), + self.gradWeight:cdata(), + self.gradBias:cdata(), + scale) + end + +end + + +function BN:clearState() + self.centered = nil + self.std = nil + self.normalized = nil + + parent.clearState(self) +end diff --git a/fbcunn/SpatialConvolution.lua b/fbcunn/SpatialConvolution.lua new file mode 100644 index 0000000..ed24526 --- /dev/null +++ b/fbcunn/SpatialConvolution.lua @@ -0,0 +1,501 @@ +-- Copyright 2014 - present Facebook. All Rights Reserved. + +-- This is the module that you should most likely call if you want the fastest +-- convolution available. It is a wrapper to cudnn as well as different +-- FFT-based implementations. +-- +-- Instantiate with fbnn.SpatialConvolution(nInputPlane, +-- nOutputPlane, +-- kW, +-- kH, +-- dW, [1] +-- dH, [1] +-- padLeft, [0] +-- padUp, [0] +-- maximalMemoryOverhead, [nil] +-- inferenceOnly) [false] +-- where: +-- - the first parameters have the traditional meaning, +-- - maximalMemoryOverhead: limit on the amount of memory +-- overhead you want to allow, nil meaning no limit +-- - inferenceOnly: whether the module is used for inference or training. +-- Spercifying inference only saves time in the autotuning process +-- +-- On the first call to updateOutput, a simple autotuning search kicks off +-- which compares the performance of different flavors of: +-- FBFFT + FBMM, FBFFT + cublasGemm, FBFFT Tiled sync, FBFFT Tiled async +-- and cudnn +-- In the future we can also wrap more specialized kernels (e.g. 
+-- no memory overhead FFTs, Nervana's convolutions etc) + +require 'cudnn' + +local argcheck = require 'argcheck' +local SpatialConvolution, parent = + torch.class('fbnn.SpatialConvolution', 'nn.Module') + +fbnn.SpatialConvolution.reportErrors = false +fbnn.SpatialConvolution.reportWarnings = false + +function SpatialConvolution:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + maximalMemoryOverhead, + inferenceOnly) + parent.__init(self) + self.inputPlanes = nInputPlane + self.outputPlanes = nOutputPlane + self.kW = kW + self.kH = kH + self.dW = dW or 1 + self.dH = dH or 1 + self.padLeft = padLeft or 0 + self.padUp = padUp or 0 + self.inferenceOnly = inferenceOnly + self.maximalMemoryOverhead = maximalMemoryOverhead + self.reportLevel = 0 + + -- Allocate an underlying CuDNN + self.cudnnModuleInst = + cudnn.SpatialConvolution(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp):cuda() + + -- Take its tensors as my own + self.weight = self.cudnnModuleInst.weight + self.output = self.cudnnModuleInst.output + self.bias = self.cudnnModuleInst.bias + self.gradWeight = self.cudnnModuleInst.gradWeight + self.gradBias = self.cudnnModuleInst.gradBias +end + +function SpatialConvolution:setInferenceOnly(val) + assert(type(val) == 'boolean') + self.inferenceOnly = val +end + +function SpatialConvolution:setReuseWeights(val) + assert(self.bestModuleInst, 'Must tune before reusing weights') + if self.bestModuleInst.setReuseWeights then + self.bestModuleInst:setReuseWeights(val) + end +end + +-------------------------------------------------------------------------------- +-- Detail +-------------------------------------------------------------------------------- +local function _timeFunction( + fun, mod, arg1, arg2, arg3, arg4, arg5) + local numTrials = 3 + local time = 0 + cutorch.synchronize() + for i = 1, numTrials do + local timer = torch.Timer() + fun(mod, arg1, arg2, arg3, arg4, arg5) + cutorch.synchronize() + if i > 1 then + time = time + timer:time().real + end + end + time = time / (numTrials - 1) + return time * 1000 +end + +local runModule = argcheck { + { name = "mod", type = "table" }, + -- { name = "mod", type = "nn.Module" }, + -- { name = "mod", type = "nn.SpatialConvolutionFBFFT" }, + { name = "input", type = "torch.CudaTensor"}, + { name = "gradOutput", type = "torch.CudaTensor"}, + { name = "parameters", type = "table"}, + { name = "extraParameters", type = "table"}, + { name = "inferenceOnly", type = "boolean"}, + { name = "scale", type = "number"}, + call = function( + mod, input, gradOutput, parameters, extraParameters, inferenceOnly, scale) + local params = {} + for _, v in pairs(parameters) do + table.insert(params, v) + end + for _, v in pairs(extraParameters) do + table.insert(params, v) + end + + local inst = mod(unpack(params)):cuda() + + -- Setup autotuning behavior, unused in CuDNN + inst.printDebugLevel = -1 + if inst.printDebugLevel >= 3 then + print(inst, unpack(params)) + inst.cudnnDebug = true + if inst.printDebugLevel >= 4 then + input:fill(1.0) + inst.weight:fill(1.0) + gradOutput:fill(1.0) + else + input:normal() + inst.weight:normal() + gradOutput:normal() + end + end + inst.autotuningPass = true + inst.reportErrors = fbnn.SpatialConvolution.reportErrors or false + + local timing1, timing2, timing3 = 0, 0, 0 + timing1 = timing1 + + _timeFunction(inst.updateOutput, inst, input) + if not inst.success then + inst:cleanupBuffers() + return 1e32, 0, 0, nil + end + + if inferenceOnly then + return timing1, 0, 0, inst 
+ end + + timing2 = timing2 + + _timeFunction(inst.updateGradInput, inst, input, gradOutput) + if not inst.success then + inst:cleanupBuffers() + return 1e32, 0, 0, nil + end + + timing3 = timing3 + + _timeFunction(inst.accGradParameters, inst, input, gradOutput, scale) + if not inst.success then + inst:cleanupBuffers() + return 1e32, 0, 0, nil + end + + -- Unset autotuning behavior, unused in CuDNN + inst.autotuningPass = false + inst.reportErrors = true + + return timing1, timing2, timing3, inst + end +} + +function SpatialConvolution:_tune(batchSize, + iW, + iH, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + inferenceOnly) + -- Just compare cudnn to various FFT variants and pick the best + local timings = {} + local ps = {batchSize, nInputPlane, iH, iW} + local input = torch.Tensor(torch.LongStorage(ps)):cuda() + local ps = {batchSize, + nOutputPlane, + math.floor((iH - kH + 2 * padUp) / dH) + 1, + math.floor((iW - kW + 2 * padLeft) / dW) + 1} + local gradOutput = torch.Tensor(torch.LongStorage(ps)):cuda() + local scale = torch.random(100) / 100.0 + + local preFree = cutorch.getMemoryUsage() + local timing1, timing2, timing3 = 0, 0, 0 + timing1 = timing1 + _timeFunction(self.cudnnModuleInst.updateOutput, + self.cudnnModuleInst, + input) + if not inferenceOnly then + timing2 = timing2 + _timeFunction(self.cudnnModuleInst.updateGradInput, + self.cudnnModuleInst, + input, + gradOutput) + timing3 = timing3 + _timeFunction(self.cudnnModuleInst.accGradParameters, + self.cudnnModuleInst, + input, + gradOutput, + scale) + end + local postFree = cutorch.getMemoryUsage() + local cudnnTiming = timing1 + timing2 + timing3 + timings[self.cudnnModuleInst] = { + parameters = nil, + memoryConsumption = preFree - postFree, + timing1, + timing2, + timing3 + } + + -- Only investigate FFT for stride == 1 + local bestTiming = 1e32 + if dW == 1 and dH == 1 then + local bestModule = nil + self.bestModuleInst = nil + local modules + + if iW > 32 or iH > 32 then + -- Don't waste time on inefficient 64x64 or 128x128 convolutions atm + -- TODO: Fix 3 issues: + -- 1. implement fast 64 and 128, + -- 2. drop buffer malloced at each call + -- 3. 
tune FBMM for 64x64 and 128x128 + modules = { + -- requires explicit padding and is slow + -- nn.SpatialConvolutionCuFFT, + nn.SpatialConvolutionFFTTiledSync, + nn.SpatialConvolutionFFTTiledAsync, + -- too slow atm + -- nn.SpatialConvolutionFFTTiledIterated + } + else + modules = { + -- requires explicit padding and is slow + -- nn.SpatialConvolutionCuFFT, + nn.SpatialConvolutionFBFFT, + -- only activate if fbmm perf is suspiciously low + -- nn.SpatialConvolutionFBFFTGemm, activate if suspicious fbmm perf + nn.SpatialConvolutionFFTTiledSync, + nn.SpatialConvolutionFFTTiledAsync, + -- too slow atm + -- nn.SpatialConvolutionFFTTiledIterated + } + end + + for i_mod in pairs(modules) + do + local mod = modules[i_mod] + local extraParameters = {} + if mod == nn.SpatialConvolutionFBFFT or + mod == nn.SpatialConvolutionFBFFTGemm + then + extraParameters = { + -- reuse, streams + {nn.SpatialConvolutionFFT.memoryReuseAll, 16}, + {nn.SpatialConvolutionFFT.memoryReuseNone, 16} + } + elseif mod == nn.SpatialConvolutionFFTTiledSync + or mod == nn.SpatialConvolutionFFTTiledAsync + or mod == nn.SpatialConvolutionFFTTiledIterated + then + -- tileH, tileW, reuse + if kH <= 3 and kW <= 3 then + extraParameters = { + -- Only enable 8 x 8 manually, is often too expensive by default + -- {8, 8, nn.SpatialConvolutionFFT.memoryReuseNone}, + {16, 16, nn.SpatialConvolutionFFT.memoryReuseNone}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + -- {8, 8, nn.SpatialConvolutionFFT.memoryReuseAll}, + {16, 16, nn.SpatialConvolutionFFT.memoryReuseAll}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + } + elseif kH <= 9 and kW <= 9 then + extraParameters = { + {16, 16, nn.SpatialConvolutionFFT.memoryReuseNone}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {16, 16, nn.SpatialConvolutionFFT.memoryReuseAll}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + } + else + extraParameters = { + {32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + } + end + end + + for i_params in pairs(extraParameters) + do + local preFree = cutorch.getMemoryUsage() + local timing1, timing2, timing3, inst = + runModule(mod, + input, + gradOutput, + { nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp + }, + extraParameters[i_params], + inferenceOnly, + scale + ) + + local postFree = cutorch.getMemoryUsage() + local exceedsAdmissibleMemory = true + if inst then + timings[inst] = { + parameters = extraParameters[i_params], + memoryConsumption = preFree - postFree, + timing1, + timing2, + timing3 + } + exceedsAdmissibleMemory = + (self.maximalMemoryOverhead and + (timings[inst].memoryConsumption - + timings[self.cudnnModuleInst].memoryConsumption) > + self.maximalMemoryOverhead) + + end + + if timing1 + timing2 + timing3 < bestTiming and + not exceedsAdmissibleMemory + then + bestTiming = timing1 + timing2 + timing3 + bestModule = mod + if self.bestModuleInst and self.bestModuleInst.cleanupBuffers then + self.bestModuleInst:cleanupBuffers() + end + self.bestModuleInst = inst + elseif inst then + inst:cleanupBuffers() + end + inst = nil + collectgarbage() + collectgarbage() + end + end + + if self.reportLevel >= 3 then + print('Timings: ', timings) + end + if self.reportLevel >= 1 then + print('Best FFT: ', bestTiming, ' ', self.bestModuleInst) + print('cudnn : ', cudnnTiming, ' ', self.cudnnModuleInst) + end + if self.reportLevel >= 2 then + print('FFT detail ', timings[self.bestModuleInst]) + print('CuDNN detail ', timings[self.cudnnModuleInst]) + 
end + + -- Always run correctness check atm, move later to only run when FFT wins. + if bestModule ~= cudnn.SpatialConvolution and self.bestModuleInst then + -- Fail if check fails here, don't fallback to cudnn + self.bestModuleInst.autotuningPass = true + self.bestModuleInst.cudnnDebug = true + self.bestModuleInst.printDebugLevel = -1 + input:normal() + gradOutput:normal() + self.bestModuleInst:reset() + self.bestModuleInst:updateOutput(input) + if not inferenceOnly then + self.bestModuleInst:updateGradInput(input, gradOutput) + self.bestModuleInst:accGradParameters(input, gradOutput, scale) + end + assert(self.bestModuleInst.cudnnChecks) + self.bestModuleInst.autotuningPass = false + self.bestModuleInst.cudnnDebug = false + self.bestModuleInst.printDebugLevel = -1 + end + end + + if bestTiming > cudnnTiming then + self.bestModuleInst = self.cudnnModuleInst + self.bestModuleInst:resetWeightDescriptors() + end + + -- if self.bestModuleInst == self.cudnnModuleInst, just reduces the refcount + -- otherwise prepares for collection + self.cudnnModuleInst = nil + + -- Take as my own + self.weight = self.bestModuleInst.weight + self.output = self.bestModuleInst.output + self.bias = self.bestModuleInst.bias + self.gradWeight = self.bestModuleInst.gradWeight + self.gradBias = self.bestModuleInst.gradBias + + collectgarbage() + collectgarbage() +end + +-- Update output (i.e. forward prop) +function SpatialConvolution:updateOutput(input) + assert(#input:size() == 4, 'Only supports 4-D tensors atm') + + if not self.bestModuleInst then + -- used for tuning consistency + self.batchSize = input:size(1) + self.iH = input:size(3) + self.iW = input:size(4) + self:_tune(self.batchSize, + self.iW, + self.iH, + self.inputPlanes, + self.outputPlanes, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp, + self.inferenceOnly) + end + + assert(self.batchSize == input:size(1), + 'Batches tuned for: ' .. self.batchSize .. ' VS ' .. input:size(1)) + assert(self.inputPlanes == input:size(2), + 'InputPlanes tuned for: ' .. self.inputPlanes .. + ' VS ' .. input:size(2)) + assert(self.iH == input:size(3), + 'InputH tuned for: ' .. self.iH .. ' VS ' .. input:size(3)) + assert(self.iW == input:size(4), + 'InputW tuned for: ' .. self.iW .. ' VS ' .. input:size(4)) + + -- weights are updated each iteration, pass them on + self.bestModuleInst.weight = self.weight + self.output = self.bestModuleInst:updateOutput(input) + self.bias = self.bestModuleInst.bias + + assert(self.outputPlanes == self.output:size(2), + 'OutputPlanes tuned for: ' .. self.outputPlanes .. + ' VS ' .. 
self.output:size(2)) + + assert(self.bestModuleInst) + if torch.type(self.bestModuleInst) ~= 'cudnn.SpatialConvolution' then + assert(self.bestModuleInst.cudnnChecks) + end + + return self.output +end + + +function SpatialConvolution:updateGradInput(input, gradOutput) + assert(self.bestModuleInst, 'Must have been tuned in updateOutput already!') + assert(not self.inferenceOnly, 'Inference only specified => no gradInput ') + self.bestModuleInst.gradInput = + self.bestModuleInst:updateGradInput(input, gradOutput) + self.gradInput = self.bestModuleInst.gradInput + return self.gradInput +end + + +function SpatialConvolution:accGradParameters( + input, gradOutput, scale) + assert(self.bestModuleInst, 'Must have been tuned in updateOutput already!') + assert(not self.inferenceOnly, 'Inference only specified => no accGrads ') + -- gradWeight / gradBias are updated each iteration, pass them on + self.bestModuleInst.gradWeight = self.gradWeight + self.bestModuleInst.gradBias = self.gradBias + self.bestModuleInst:accGradParameters(input, gradOutput, scale) +end + + +function SpatialConvolution:cleanupBuffers() + if self.bestModuleInst and self.bestModuleInst.cleanupBuffers then + self.bestModuleInst:cleanupBuffers() + end + self.bestModuleInst = nil +end diff --git a/fbcunn/SpatialConvolutionCuFFT.lua b/fbcunn/SpatialConvolutionCuFFT.lua index 2de69d5..7031b6b 100644 --- a/fbcunn/SpatialConvolutionCuFFT.lua +++ b/fbcunn/SpatialConvolutionCuFFT.lua @@ -1,298 +1,889 @@ -- Copyright 2004-present Facebook. All Rights Reserved. -local mk = require('multikey') - --- Hoist this in a global buffer module -cudaTensorBuffers = {} -FFTConvolution = 'FFTConvolutionBuffer' -FFTConvolutionTranspose = 'FFTConvolutionTransposeBuffer' -FFTInputBufferType = 0 -FFTInputTransposeBufferType = 1 -FFTOutputBufferType = 2 -FFTOutputTransposeBufferType = 3 -FFTWeightBufferType = 4 -FFTWeightTransposeBufferType = 5 - --- Float assumed, 4 bytes -sizeOfElem = 4 - +require 'cudnn' +local List = require 'pl.List' +local thrift = require('fb.thrift') +local ffi = require 'ffi' +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +--[[ + Actual module +--]] local SpatialConvolutionCuFFT, parent = - torch.class('nn.SpatialConvolutionCuFFT', 'nn.Module') + torch.class('nn.SpatialConvolutionCuFFT', 'nn.SpatialConvolutionFFT') + +function SpatialConvolutionCuFFT:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + + parent.fftImplementation = 'cufft' + + assert(self.padUp == 0 and + self.padDown == 0 and + self.padLeft == 0 and + self.padRight == 0, "cufft does not support implicit padding!") + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + 
assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") -local precision = 0.00002 -local printDebug = false -local debug = false + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + -- Temporary buffers + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.inputPadded) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.outputPadded) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) + assert(not self.weightPadded) + + -- CuFFT plans + assert(not self.cufftPlanInputFFT) + assert(not self.cufftPlanWeightFFT) + assert(not self.cufftPlanOutputFFT) + assert(not self.cufftPlanInputIFFT) + assert(not self.cufftPlanWeightIFFT) + assert(not self.cufftPlanOutputIFFT) +end -function SpatialConvolutionCuFFT:__init(nInputPlane, nOutputPlane, - kW, kH, dW, dH) - parent.__init(self) +--[[ + Helper function to perform explicit padding + In the case of cufft, padding must be explicit with zeros on the + inputs of the algorithm. fbfft does not need this. +--]] +function SpatialConvolutionCuFFT:isOutputOfPass(pass, tensor) + assert(pass == nn.SpatialConvolutionFFT.ForwardFFTPass or + pass == nn.SpatialConvolutionFFT.BackwardFFTPass or + pass == nn.SpatialConvolutionFFT.AccGradientFFTPass) + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + tensor == self.output + then + return true + end + if pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + tensor == self.gradInput + then + return true + end + if pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + tensor == self.gradWeight + then + return true + end + return false +end - self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kW = kW - self.kH = kH - self.dW = dW or 1 - self.dH = dH or 1 +function SpatialConvolutionCuFFT:fftPadding(tensor, pass, inputTensor) + -- Always input, weight, output + local tensorList = {} + local paddedList = {} + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + tensorList = {tensor, self.weight, self.output} + paddedList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + tensorList = {self.gradInput, self.weight, tensor} + paddedList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + tensorList = {inputTensor, self.gradWeight, tensor} + paddedList = {self.inputPadded, self.weightPadded, self.outputPadded} + end - assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + for ind = 1, #tensorList do + -- If we have a non empty padded tensor + if paddedList[ind] and paddedList[ind]:nElement() > 0 then + local _orig = tensorList[ind] + local padded = paddedList[ind] + if not self:isOutputOfPass(pass, tensorList[ind]) then + local sizes = tensorList[ind]:size() + local paddedSizes = paddedList[ind]:size() + -- resize messes up strides, I want a fortran subarray here, + -- do it manually + padded:set(padded:storage(), + padded:storageOffset(), + sizes, + padded:stride()) + 
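+          -- Strided-view trick: the padded buffer now temporarily carries the
+          -- unpadded sizes but keeps its own (larger) strides, so the copy
+          -- below lands in the top-left window of the padded storage; the
+          -- second set restores paddedSizes and exposes the full buffer,
+          -- whose untouched elements supply the zero padding (assuming the
+          -- buffer was zero-filled when it was obtained).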
padded:copy(tensorList[ind]) + -- make tensor full again, it is now contiguous and zero padded + padded:set(padded:storage(), + padded:storageOffset(), + paddedSizes, padded:stride()) + end + end + end - self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) - self.bias = torch.Tensor(nOutputPlane) - self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) - self.gradBias = torch.Tensor(nOutputPlane) + -- swap original and padded tensors to be transparent for the + -- convolution pass + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + tensor, self.inputPadded = self.inputPadded, tensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + self.output, self.outputPadded = self.outputPadded, self.output + end + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + self.gradInput, self.inputPadded = self.inputPadded, self.gradInput + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + inputTensor, self.inputPadded = self.inputPadded, inputTensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.gradWeight, self.weightPadded = self.weightPadded, self.gradWeight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end + end - self:reset() + return tensor, inputTensor end -function SpatialConvolutionCuFFT:reset(stdv) - if stdv then - stdv = stdv * math.sqrt(3) - else - stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane) + +--[[ + Helper function to undo padding + In the case of cufft, padding must be explicit with zeros on the + inputs of the algorithm. fbfft does not need this. +--]] +function SpatialConvolutionCuFFT:fftUnpadding(tensor, pass, inputTensor) + -- Always input, weight, output + local tensorList = {} + local paddedList = {} + -- Here the paddedList and tensorList are reversed compared to fftPadding + -- Only true for those tensors that are actually padded (i.e. self. 
+ -- inputPadded both non nil and not empty) + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + paddedList = {tensor, self.weight, self.output} + tensorList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + paddedList = {self.gradInput, self.weight, tensor} + tensorList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + paddedList = {inputTensor, self.gradWeight, tensor} + tensorList = {self.inputPadded, self.weightPadded, self.outputPadded} end - if nn.oldSeed then - self.weight:apply(function() - return torch.uniform(-stdv, stdv) - end) - self.bias:apply(function() - return torch.uniform(-stdv, stdv) - end) - else - self.weight:uniform(-stdv, stdv) - self.bias:uniform(-stdv, stdv) + + for ind = 1, #tensorList do + -- If we have a non-empty padded tensor + if tensorList[ind] and tensorList[ind]:nElement() > 0 then + local orig = tensorList[ind] + local padded = paddedList[ind] + if self:isOutputOfPass(pass, paddedList[ind]) then + local sizes = tensorList[ind]:size() + local paddedSizes = paddedList[ind]:size() + -- resize messes up strides, I want a fortran subarray here, + -- do it manually + padded:set(padded:storage(), + padded:storageOffset(), + sizes, + padded:stride()) + orig:copy(padded) + -- make tensor full again, it is now contiguous and zero padded + padded:set(padded:storage(), + padded:storageOffset(), + paddedSizes, + padded:stride()) + end + end end -end -local function debugVSMM(pass, module, toTest, fun, param1, param2, param3) - local o = toTest:float():clone() - toTest:zero() - module.padding = 0 - module.finput = torch.CudaTensor() - module.fgradInput = torch.CudaTensor() - -- linearize weight for MM - module.gradWeight = - module.gradWeight:view(module.nOutputPlane, - module.nInputPlane * module.kH * module.kW) - module.weight = - module.weight:view(module.nOutputPlane, - module.nInputPlane * module.kH * module.kW) - local test = fun(module, param1, param2, param3) - -- reset layout of weight after MM - module.gradWeight = - module.gradWeight:view(module.nOutputPlane, - module.nInputPlane, - module.kH, - module.kW) - module.weight = - module.weight:view(module.nOutputPlane, - module.nInputPlane, - module.kH, - module.kW) - local norm = math.sqrt(test:float():dot(test:float()) + 1e-8) - if test:float():dist(o:float()) / norm > precision then - print('error ', pass, test:float():dist(o:float()) / norm, precision) - os.exit() - elseif printDebug then - print('debug vs MM check passes ', - pass, o:min(), o:max(), o:mean(), o:std(), o:sum()) + -- swap original and padded tensors to be transparent for the + -- convolution pass + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + tensor, self.inputPadded = self.inputPadded, tensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + self.output, self.outputPadded = self.outputPadded, self.output + end + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + self.gradInput, self.inputPadded = self.inputPadded, self.gradInput + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if 
self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + inputTensor, self.inputPadded = self.inputPadded, inputTensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.gradWeight, self.weightPadded = self.weightPadded, self.gradWeight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end end + + return tensor, inputTensor end -function SpatialConvolutionCuFFT:updateOutput(input) - self:prepareBuffers(input:size()) - input.nn.SpatialConvolutionCuFFT_updateOutput(self, input) +function SpatialConvolutionCuFFT:prepareSizeAndBuffers(i, w, o, metaData) + return self:prepareCuFFTSizeAndBuffers(i, w, o, metaData, metaData.pass) +end - if debug == true then - debugVSMM("updateOutput", - self, - self.output, - input.nn.SpatialConvolutionMM_updateOutput, - input) +--[[ + Update output +--]] +function SpatialConvolutionCuFFT:updateOutputFFTImpl(input, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.ForwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, self.output, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local weightFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- In cufft mode, we have explicit padding tensors + input = self:fftPadding(input, nn.SpatialConvolutionFFT.ForwardFFTPass) + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. FFT + transpose input and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) + then + cutorch.setStream(inputFFTStream) + self.cufftPlanInputFFT = + fftWrapper:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + inputFFTStream, -- stream + self.cufftPlanInputFFT) end + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + cutorch.setStream(weightFFTStream) + self.cufftPlanWeightFFT = + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream, -- stream + self.cufftPlanWeightFFT) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, input) + + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- a. 
multiple GEMMs on multiple streams + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'c', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, weightFFTStream}) + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {}, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'c', + 1.0 / norm) + end + + -- 3. transpose + IFFT output + cutorch.setStream(gemmStream) + self.cufftPlanOutputIFFT = + fftWrapper:transposeIFFT(self.output, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream, -- stream + self.cufftPlanOutputIFFT) + + -- ############################################## + -- Padding / unpadding perform copies on default stream, synchronize all + cutorch.streamBarrier(self.allStreams) + + -- 4. If cufft, needs resize + self:fftUnpadding(input, nn.SpatialConvolutionFFT.ForwardFFTPass) + + -- Synchronize all: Padding / unpadding perform copies on default stream + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 5. Finally, bias update + cutorch.setStream(gemmStream) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return self.output end -function SpatialConvolutionCuFFT:explorePerformance(input, batches, - inputs, planes, inputRows, inputCols, kernelRows, kernelCols) - input.nn.SpatialConvolutionCuFFT_explorePerformance(self, batches, - inputs, planes, inputRows, inputCols, kernelRows, kernelCols) -end +--[[ + Update input gradients +--]] + + +function SpatialConvolutionCuFFT:updateGradInputFFTImpl( + input, gradOutput, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.BackwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local weightFFTStream = 1 + local gradOutputFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- If cufft, we may have padding tensors into which to copy the data + gradOutput = self:fftPadding(gradOutput, + nn.SpatialConvolutionFFT.BackwardFFTPass) + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. 
FFT + transpose gradOutput and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) + then + cutorch.setStream(gradOutputFFTStream) + self.cufftPlanOutputFFT = + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gradOutputFFTStream, -- stream + self.cufftPlanOutputFFT) + end -function SpatialConvolutionCuFFT:cleanupBuffers(input) - input.nn.SpatialConvolutionCuFFT_cleanupBuffers() -end + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(weightFFTStream) + self.cufftPlanWeightFFT = + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream, -- stream + self.cufftPlanWeightFFT) + end -function SpatialConvolutionCuFFT:updateGradInput(input, gradOutput) - self:prepareBuffers(input:size()) - input.nn.SpatialConvolutionCuFFT_updateGradInput(self, gradOutput) - - if debug == true then - debugVSMM("updateGradInput", - self, - self.gradInput, - input.nn.SpatialConvolutionMM_updateGradInput, - input, - gradOutput) + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'n', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {weightFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'n', + 1.0 / norm) end - return self.gradInput -end + -- 3. 
transpose + IFFT gradInput + cutorch.setStream(gemmStream) + self.cufftPlanInputIFFT = + fftWrapper:transposeIFFT(self.gradInput, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream, -- stream + self.cufftPlanInputIFFT) -local -function wrapMM_accGradParameters_gradWeight(module, input, gradOutput, scale) - input.nn.SpatialConvolutionMM_accGradParameters( - module, input, gradOutput, scale) - return module.gradWeight -end + -- ############################################## + -- Padding / unpadding perform copies on default stream, synchronize all + cutorch.streamBarrier(self.allStreams) -local -function wrapMM_accGradParameters_gradBias(module, input, gradOutput, scale) - input.nn.SpatialConvolutionMM_accGradParameters( - module, input, gradOutput, scale) - return module.gradBias + -- 4. If cufft, needs resize + self:fftUnpadding(gradOutput, nn.SpatialConvolutionFFT.BackwardFFTPass) + + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 5. No bias operation + + return self.gradInput end -function SpatialConvolutionCuFFT:accGradParameters(input, gradOutput, scale) + +--[[ + Accumulate weight gradients +--]] +function SpatialConvolutionCuFFT:accGradParametersFFTImpl( + input, gradOutput, scale, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") scale = scale or 1 - self:prepareBuffers(input:size()) - input.nn.SpatialConvolutionCuFFT_accGradParameters( - self, input, gradOutput, scale) - - if debug == true then - self.gradBias:zero() -- zero first to avoid accumulation - debugVSMM("accGradParameters_gradWeight", - self, - self.gradWeight, - wrapMM_accGradParameters_gradWeight, - input, - gradOutput, - scale) - local saveBias = self.gradBias:float():clone() - self.gradWeight:zero() - self.gradBias:zero() - debugVSMM("accGradParameters_gradBias", - self, - saveBias, - wrapMM_accGradParameters_gradBias, - input, - gradOutput, - scale) + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.AccGradientFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.gradWeight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local gradOutputFFTStream = 2 + local gradBiasFFTStream = 3 + local gemmStream = 4 + assert(cutorch.getNumStreams() >= gemmStream) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- If cufft, we may have padding tensors into which to copy the data + gradOutput, input = self:fftPadding( + gradOutput, nn.SpatialConvolutionFFT.AccGradientFFTPass, input) + assert(self.gradWeight:size(3) == commonSize[3]) + assert(self.gradWeight:size(4) == commonSize[4]) + + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 0. gradBIas update is independent + cutorch.setStream(gradBiasFFTStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + -- 1. 
FFT + transpose gradOutput and weights + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(gradOutputFFTStream) + self.cufftPlanOutputFFT = + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, + gradOutputFFTStream, + self.cufftPlanOutputFFT) end -end --- Type: input/gradInput, output/gradOutput or weight/gradWeight --- Could lookup bit operations in lua and do in 1 line, just use a loop atm -local function nextPowerOf2(val) - for i = 1, 10 do - if (2 ^ i) >= val then - return (2 ^ i) - end + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) + then + cutorch.setStream(inputFFTStream) + self.cufftPlanInputFFT = + fftWrapper:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 2, + inputFFTStream, + self.cufftPlanInputFFT) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'c', + 'n', + (1.0 * scale) / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'c', + 'n', + (1.0 * scale) / norm) end - assert(false, 'Too large a convolution dimensions: ', val) + + -- 3. transpose + IFFT gradInput + cutorch.setStream(gemmStream) + self.cufftPlanWeightIFFT = + fftWrapper:transposeIFFT(self.gradWeight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream, -- stream + self.cufftPlanWeightIFFT) + + -- ############################################## + -- Padding / unpadding perform copies on default stream, synchronize all + cutorch.streamBarrier(self.allStreams) + + -- 4. 
If cufft, needs resize + self:fftUnpadding( + gradOutput, nn.SpatialConvolutionFFT.AccGradientFFTPass, input) + assert(self.gradWeight:size(3) == self.kH) + assert(self.gradWeight:size(4) == self.kW) + + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) end -function SpatialConvolutionCuFFT:prepareBuffers(inputSize) - self.inputBuffer = getBuffer(FFTConvolution, FFTInputBufferType, inputSize) - self.inputTransposeBuffer = getBuffer( - FFTConvolutionTranspose, FFTInputTransposeBufferType, inputSize) - - bufferSizesO = torch.LongStorage(4) - bufferSizesO[1] = inputSize[1] -- batch - bufferSizesO[2] = self.nOutputPlane -- output planes - bufferSizesO[3] = inputSize[3] -- input x is always max for buffer - bufferSizesO[4] = inputSize[4] -- input y is always max for buffer - self.outputBuffer = getBuffer(FFTConvolution, FFTOutputBufferType, bufferSizesO) - self.outputTransposeBuffer = getBuffer( - FFTConvolutionTranspose, FFTOutputTransposeBufferType, bufferSizesO) - - bufferSizesW = torch.LongStorage(4) - bufferSizesW[1] = self.nOutputPlane -- output planes - bufferSizesW[2] = self.nInputPlane -- input planes - bufferSizesW[3] = inputSize[3] -- input x is always max for buffer - bufferSizesW[4] = inputSize[4] -- input y is always max for buffer - self.weightBuffer = getBuffer(FFTConvolution, FFTWeightBufferType, bufferSizesW) - self.weightTransposeBuffer = getBuffer( - FFTConvolutionTranspose, FFTWeightTransposeBufferType, bufferSizesW) - - if self.inputBuffer and self.inputTransposeBuffer and - self.outputBuffer and self.outputTransposeBuffer and - self.weightBuffer and self.weightTransposeBuffer then +--[[ + -- Buffer creation and reuse given a size and a pass. + -- Different passes use different tensors as the 'output of the pass'. + -- nn.SpatialConvolutionFFT.ForwardFFTPass -> output + -- nn.SpatialConvolutionFFT.BackwardFFTPass -> input + -- nn.SpatialConvolutionFFT.AccGradientFFTPass -> weight + -- The buffers corresponding to the tensors that is the 'output of the pass' + -- must be properly transposed in order for the CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct properly. 
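+  -- Concretely, for cufft the function below requests six buffers per pass:
+  -- three explicit zero-padding buffers (input / output / weight) sized from
+  -- commonSize, {commonSize[1], nOutputPlane, commonSize[3], commonSize[4]}
+  -- and {nOutputPlane, nInputPlane, commonSize[3], commonSize[4]}
+  -- respectively, plus the three matching complex transpose buffers consumed
+  -- by the CGEMM (commonSize[3..4] being the FFT interpolation sizes,
+  -- presumably).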
+--]] +function SpatialConvolutionCuFFT:prepareBuffers(commonSize, pass, metaData) + assert(commonSize and pass and self.fftImplementation) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + if not parent.prepareBuffers(self, commonSize, pass, metaData) + then + return false + end + + local bufferSizesO = torch.LongStorage({ + commonSize[1], self.nOutputPlane, commonSize[3], commonSize[4]}) + local bufferSizesW = torch.LongStorage({ + self.nOutputPlane, self.nInputPlane, commonSize[3], commonSize[4]}) + + self.inputPadded = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTPaddedInputBuffer, + commonSize, + false, + metaData) + self.outputPadded = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTPaddedOutputBuffer, + bufferSizesO, + false, + metaData) + self.weightPadded = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTPaddedWeightBuffer, + bufferSizesW, + false, + metaData) + + self.inputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType, + commonSize, + true, + metaData) + self.outputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType, + bufferSizesO, + true, + metaData) + self.weightTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType, + bufferSizesW, + true, + metaData) + + if self.inputTransposeBuffer and self.inputPadded and + self.outputTransposeBuffer and self.outputPadded and + self.weightTransposeBuffer and self.weightPadded then return true end - -- From here on, we should find failsafe to another SpatialConvolution - self.inputBuffer = nil + print('Not enough memory for CuFFT buffers, need to fall back') + + -- TODO: From here on, we should failsafe to another SpatialConvolution + self:cleanupBuffers() + + return false +end + +function SpatialConvolutionCuFFT:cleanupBuffers() + parent.cleanupBuffers(self) + + -- Kill cufft plans references to trigger GC + self.cufftPlanInputFFT = nil + self.cufftPlanWeightFFT = nil + self.cufftPlanOutputFFT = nil + self.cufftPlanInputIFFT = nil + self.cufftPlanWeightIFFT = nil + self.cufftPlanOutputIFFT = nil + + -- Kill local references to global buffers self.inputTransposeBuffer = nil - self.outputBuffer = nil + self.inputPadded = nil self.outputTransposeBuffer = nil - self.weightBuffer = nil + self.outputPadded = nil self.weightTransposeBuffer = nil - freeBuffer(FFTConvolution, FFTInputBufferType, inputSize) - freeBuffer(FFTConvolutionTranspose, FFTInputTransposeBufferType, inputSize) - freeBuffer(FFTConvolution, FFTOutputBufferType, bufferSizesO) - freeBuffer(FFTConvolutionTranspose, FFTOutputTransposeBufferType, bufferSizesO) - freeBuffer(FFTConvolution, FFTWeightBufferType, bufferSizesW) - freeBuffer(FFTConvolutionTranspose, FFTWeightTransposeBufferType, bufferSizesW) + self.weightPadded = nil +end - collectgarbage() - collectgarbage() - return false -end + -- TODO: CuFFT is more flexible to allow for arbitrary FFT interpolation sizes. + -- When writing the autotuner, it is easy to get different interpolation sizes + -- for the FFTs in the 3 passes, perform best. + -- For correction of reuse, reuse should only work if interpolation sizes are + -- the same between 2 passes. + -- In practice this means supporting real producer / consumer semantics in the + -- tag space. In particular we need to match any read to a unique write and + -- ensure they occur in the proper order. 
+ -- For instance, there is no reason that updateGradInput occurs before + -- accGradParameters so we need to ensure the first one writes gradOutput and + -- the second one reads it +function SpatialConvolutionCuFFT:getBufferKey(BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + -- If no reuse, we hit into the buffers discrimianted by device and + -- BufferType. These buffers are shared with all FFT convolution modules + -- and do not allow reuse for long dependences (i.e. only gradOutput can + -- only be reused from a supporting backward implementation) + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + return parent.getBufferKeyGeneric(self, BufferType) + end -function getBuffer(OperationType, tensorType, tensorSizes) - d1 = tensorSizes[1] - d2 = tensorSizes[2] - -- Preemptively resize to d1 . d2 . 2^x . 2^y - d3 = math.max(nextPowerOf2(tensorSizes[3]), nextPowerOf2(tensorSizes[4])) - d4 = d3 - assert(d3 == d4, 'Squared fft convolution to support fbfft') - numElements = d1 * d2 * d3 * (d4 / 2 + 1) * 2 - - storage = torch.LongStorage(5) - - storage[1] = d1 - storage[2] = d2 - storage[3] = d3 - storage[4] = d4 / 2 + 1 - storage[5] = 2 - - -- Conservative max buffer size, always needed at least by fbfft - -- Handle memory bloat by tiled convolutions + inplace fft - if mk.get(cudaTensorBuffers, - OperationType, - tensorType, - cutorch.getDevice()) == nil then - local free_bytes, total_bytes = cutorch.getMemoryUsage() - if numElements * sizeOfElem > free_bytes then - return nil - end + if not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + assert(false, "unknown memory reuse policy " .. self.memoryReusePolicy) + end - mk.put(cudaTensorBuffers, OperationType, tensorType, cutorch.getDevice(), - torch.CudaTensor(storage)) - else - -- Storage already exists but may need resizing. 
- -- If resizing means expanding, make sure we have enough space - t = mk.get(cudaTensorBuffers, OperationType, tensorType, cutorch.getDevice()) - if numElements > t:nElement() then - -- Don't call cuda API unless really needed - local free_bytes, total_bytes = cutorch.getMemoryUsage() - if (numElements - t:nElement()) * sizeOfElem > free_bytes then - return nil - end + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the transposed complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + isOutputOfAlgorithm = true end - t:resize(storage) + md.isOutputOfAlgorithm = isOutputOfAlgorithm end - t = mk.get(cudaTensorBuffers, OperationType, tensorType, cutorch.getDevice()) - return t -end + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + md = nil + end -function freeBuffer(OperationType, tensorType, tensorSizes) - mk.put(cudaTensorBuffers, - OperationType, - tensorType, - cutorch.getDevice(), nil) + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res end diff --git a/fbcunn/SpatialConvolutionFBFFT.lua b/fbcunn/SpatialConvolutionFBFFT.lua new file mode 100644 index 0000000..015dfcf --- /dev/null +++ b/fbcunn/SpatialConvolutionFBFFT.lua @@ -0,0 +1,433 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
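+
+--[[
+   Usage sketch (illustrative values only; the constructor signature matches
+   the __init defined below, `input` is assumed to be a 4D
+   batch x plane x height x width CUDA tensor, and the module is assumed to
+   be driven through the usual nn forward/backward interface inherited from
+   the parent class):
+
+     local conv = nn.SpatialConvolutionFBFFT(
+        64, 128,   -- nInputPlane, nOutputPlane
+        3, 3,      -- kW, kH
+        1, 1,      -- dW, dH (FFT convolutions are stride-1 only)
+        1, 1)      -- padLeft, padUp (must be smaller than the kernel)
+     local output = conv:forward(input)
+--]]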
+ +require 'cudnn' +local thrift = require('fb.thrift') +local ffi = require 'ffi' + +local lib_name = 'torch_fb_fbcunn_mm' +local lib_path = package.searchpath(lib_name, package.cpath) +local FBMMFFI = ffi.load(lib_path and lib_path or lib_name) + +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +--[[ + Actual module +--]] +local SpatialConvolutionFBFFT, parent = + torch.class('nn.SpatialConvolutionFBFFT', 'nn.SpatialConvolutionFFT') + +-- memoryReusePolicy is one of: +-- SpatialConvolutionFFT.memoryReuseNone +-- SpatialConvolutionFFT.memoryReuseWeight +-- SpatialConvolutionFFT.memoryReuseInput +-- SpatialConvolutionFFT.memoryReuseOutput +function SpatialConvolutionFBFFT:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(torch.type(padLeft) == 'number') + assert(torch.type(padUp) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init( + self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padLeft, padUp, + memoryReusePolicy, numCudaStreams) + parent.fftImplementation = 'fbfft' + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + -- Temporary buffers + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.inputPadded) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.outputPadded) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) + assert(not self.weightPadded) + + -- FBFFT plans, useless for fbfft + assert(not self.cufftPlanInputFFT) + assert(not self.cufftPlanWeightFFT) + assert(not self.cufftPlanOutputFFT) + assert(not self.cufftPlanInputIFFT) + assert(not self.cufftPlanWeightIFFT) + assert(not self.cufftPlanOutputIFFT) + + assert(self.padUp < self.kH and self.padDown < self.kH and + self.padLeft < self.kW and self.padRight < self.kW, + "Padding must be smaller than kernel") +end + +function SpatialConvolutionFBFFT:prepareSizeAndBuffers(i, w, o, metaData) + return self:prepareFBFFTSizeAndBuffers(i, w, o, metaData) +end + +function SpatialConvolutionFBFFT:updateOutputFFTImpl(input, reuseList, metaData) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + if not metaData then + metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.ForwardFFTPass + end + + local 
commonSize = + self:prepareSizeAndBuffers(input, self.weight, self.output, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local weightFFTStream = 2 + local fbmmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. FFTs + if not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTInputBufferType) + then + -- Potentially reuse buffer if so told + -- Makes sense because we could asynchronously compute these AoT + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + cutorch.setStream(inputFFTStream) + fftWrapperPadded:fft(input, self.inputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {inputFFTStream}) + end + + if not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTWeightBufferType) + then + -- Potentially reuse buffer if so told + -- Makes sense because we could asynchronously compute these AoT + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(weightFFTStream) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {weightFFTStream}) + end + + -- 2. GEMM with in place transpose + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(fbmmStream) + local norm = self:getNormalizationFactor(commonSize, input) + FBMMFFI.transposeMMFFI(cutorch._state, + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + self.outputBuffer:cdata(), + 1.0 / norm, + false, + true, + false) + + -- 3. IFFT + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(fbmmStream) + fftWrapper:ffti(self.output, self.outputBuffer, cublasBatchDims) + + -- 4. Finally, bias update + if not metaData.skipBias then + cutorch.setStream(fbmmStream) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + end + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + + +--[[ + Update input gradients +--]] +function SpatialConvolutionFBFFT:updateGradInputFFTImpl( + input, gradOutput, reuseList, metaData) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + if not metaData then + metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.BackwardFFTPass + end + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local gradInputFFTStream = 1 + local gradOutputFFTStream = 2 + local fbmmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. 
FFTs + if (not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTOutputBufferType)) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fft(gradOutput, self.outputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {gradOutputFFTStream}) + end + + if (not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTWeightBufferType)) + and not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(gradInputFFTStream) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {gradInputFFTStream}) + end + + -- 2. GEMM with in place transpose + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(fbmmStream) + local norm = self:getNormalizationFactor(commonSize, gradOutput) + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.weightBuffer:cdata(), + self.inputBuffer:cdata(), + 1.0 / norm, + false, + false, + false) + + -- 3. IFFT + cutorch.setStream(fbmmStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:ffti(self.gradInput, self.inputBuffer, cublasBatchDims) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.gradInput +end + + +--[[ + Accumulate weight gradients +--]] +function SpatialConvolutionFBFFT:accGradParametersFFTImpl( + input, gradOutput, scale, reuseList, metaData) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local scale = scale or 1 + + if not metaData then + metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.AccGradientFFTPass + end + + local commonSize = + self:prepareSizeAndBuffers(input, self.gradWeight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local gradOutputFFTStream = 2 + local gradBiasFFTStream = 3 + local fbmmStream = 4 + assert(cutorch.getNumStreams() >= 4) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ######################################### + cutorch.streamBarrier(self.allStreams) + + -- 0. Bias update is independent + if not metaData.skipBias then + cutorch.setStream(gradBiasFFTStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + end + + -- 1. 
FFTs + if (not reuseList or not reuseList:contains( + nn.SpatialConvolutionFFT.FFTOutputBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fft(gradOutput, self.outputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {gradOutputFFTStream}) + end + + if (not reuseList or not reuseList:contains( + nn.SpatialConvolutionFFT.FFTInputBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) + then + -- Potentially reuse buffer if so told + cutorch.setStream(inputFFTStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fft(input, self.inputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {inputFFTStream}) + end + + -- 2. GEMM with in place transpose + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(fbmmStream) + local norm = self:getNormalizationFactor(commonSize, gradOutput) + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + (1.0 * scale) / norm, + true, + false, + false) + + -- 3. IFFT + cutorch.setStream(fbmmStream) + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:ffti(self.gradWeight, self.weightBuffer, cublasBatchDims) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ######################################### + cutorch.streamBarrier(self.allStreams) +end + + +function SpatialConvolutionFBFFT:getBufferKey(BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', + torch.type(metaData)) + + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) + then + return parent.getBufferKeyGeneric(self, BufferType) + end + + if not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + assert(false, "unknown memory reuse policy " .. 
self.memoryReusePolicy) + end + + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTOutputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTInputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTWeightBufferType) + then + isOutputOfAlgorithm = true + end + md.isOutputOfAlgorithm = isOutputOfAlgorithm + end + + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + md = nil + end + + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res +end + +function SpatialConvolutionFBFFT:cleanupBuffers() + parent.cleanupBuffers(self) +end diff --git a/fbcunn/SpatialConvolutionFBFFTGemm.lua b/fbcunn/SpatialConvolutionFBFFTGemm.lua new file mode 100644 index 0000000..af73204 --- /dev/null +++ b/fbcunn/SpatialConvolutionFBFFTGemm.lua @@ -0,0 +1,599 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. + +require 'cudnn' +local ffi = require 'ffi' +-- TODO: @soumith, any better way than this fully convoluted path ? 
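+-- One possibility, already used by SpatialConvolutionCuFFT.lua and
+-- SpatialConvolutionFBFFT.lua in this patch, is to resolve the library via
+-- package.searchpath and fall back to the bare name:
+--
+--   local lib_name = 'torch_fb_fbcunn_convolution_bias'
+--   local lib_path = package.searchpath(lib_name, package.cpath)
+--   local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name)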
+local ConvolutionBiasFFI = ffi.load('torch_fb_fbcunn_convolution_bias') +local thrift = require('fb.thrift') + +ffi.cdef[[ + void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); + void accGradParametersBiasFFI( + THCState*, THCudaTensor*, THCudaTensor*, float scale); +]] + +--[[ + Actual module +--]] +local SpatialConvolutionFBFFTGemm, parent = + torch.class('nn.SpatialConvolutionFBFFTGemm', 'nn.SpatialConvolutionFFT') + +function SpatialConvolutionFBFFTGemm:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + + parent.fftImplementation = 'fbfft' + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + -- Temporary buffers + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) +end + +function SpatialConvolutionFBFFTGemm:prepareSizeAndBuffers(i, w, o, metaData) + return self:prepareFBFFTGemmSizeAndBuffers(i, w, o, metaData, metaData.pass) +end + +--[[ + Update output +--]] +function SpatialConvolutionFBFFTGemm:updateOutputFFTImpl(input, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.ForwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, self.output, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local weightFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- 1. 
FFT + transpose input and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) + then + cutorch.setStream(inputFFTStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + inputFFTStream -- stream + ) + end + + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + cutorch.setStream(weightFFTStream) + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream -- stream + ) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, input) + + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- a. multiple GEMMs on multiple streams + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'c', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, weightFFTStream}) + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {}, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'c', + 1.0 / norm) + end + + -- 3. transpose + IFFT output + cutorch.setStream(gemmStream) + fftWrapper:transposeIFFT(self.output, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream -- stream + ) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 4. 
Finally, bias update + cutorch.setStream(gemmStream) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + +--[[ + Update input gradients +--]] + + +function SpatialConvolutionFBFFTGemm:updateGradInputFFTImpl( + input, gradOutput, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.BackwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local weightFFTStream = 1 + local gradOutputFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + + -- 1. FFT + transpose gradOutput and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) + then + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gradOutputFFTStream -- stream + ) + end + + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(weightFFTStream) + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream -- stream + ) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'n', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {weightFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'n', + 1.0 / norm) + end + + -- 3. 
transpose + IFFT gradInput + cutorch.setStream(gemmStream) + + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:transposeIFFT(self.gradInput, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream -- stream + ) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.gradInput +end + + +--[[ + Accumulate weight gradients +--]] +function SpatialConvolutionFBFFTGemm:accGradParametersFFTImpl( + input, gradOutput, scale, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + scale = scale or 1 + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.AccGradientFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.gradWeight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local gradOutputFFTStream = 2 + local gradBiasFFTStream = 3 + local gemmStream = 4 + assert(cutorch.getNumStreams() >= gemmStream) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + + -- 0. gradBIas update is independent + cutorch.setStream(gradBiasFFTStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + -- 1. FFT + transpose gradOutput and weights + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, + gradOutputFFTStream) + end + + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) + then + cutorch.setStream(inputFFTStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 2, + inputFFTStream) + end + + -- 2. 
CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'c', + 'n', + (1.0 * scale) / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'c', + 'n', + (1.0 * scale) / norm) + end + + -- 3. transpose + IFFT gradInput + cutorch.setStream(gemmStream) + fftWrapper:transposeIFFT(self.gradWeight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream -- stream + ) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) +end + + +--[[ + -- Buffer creation and reuse given a size and a pass. + -- Different passes use different tensors as the 'output of the pass'. + -- nn.SpatialConvolutionFFT.ForwardFFTPass -> output + -- nn.SpatialConvolutionFFT.BackwardFFTPass -> input + -- nn.SpatialConvolutionFFT.AccGradientFFTPass -> weight + -- The buffers corresponding to the tensors that is the 'output of the pass' + -- must be properly transposed in order for the CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct properly. 
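+  -- For instance, in updateOutput the CGEMM writes its result into the output
+  -- transpose buffer, which transposeIFFT then turns back into self.output;
+  -- updateGradInput and accGradParameters play the same role for gradInput
+  -- and gradWeight respectively.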
+--]] +function SpatialConvolutionFBFFTGemm:prepareBuffers(commonSize, pass, metaData) + assert(commonSize and pass and self.fftImplementation) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + if not parent.prepareBuffers(self, commonSize, pass, metaData) + then + return false + end + + local bufferSizesO = torch.LongStorage({ + commonSize[1], self.nOutputPlane, commonSize[3], commonSize[4]}) + local bufferSizesW = torch.LongStorage({ + self.nOutputPlane, self.nInputPlane, commonSize[3], commonSize[4]}) + + self.inputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType, + commonSize, + true, + metaData) + self.outputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType, + bufferSizesO, + true, + metaData) + self.weightTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType, + bufferSizesW, + true, + metaData) + + if self.inputTransposeBuffer and + self.outputTransposeBuffer and + self.weightTransposeBuffer then + return true + end + + print('Not enough memory for FBFFTGemm buffers, need to fall back') + + -- TODO: From here on, we should failsafe to another SpatialConvolution + self:cleanupBuffers() + + assert(false, 'Out of memory!') +end + +function SpatialConvolutionFBFFTGemm:cleanupBuffers() + parent.cleanupBuffers(self) + + -- Kill local references to global buffers + self.inputTransposeBuffer = nil + self.outputTransposeBuffer = nil + self.weightTransposeBuffer = nil +end + + +function SpatialConvolutionFBFFTGemm:getBufferKey( + BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + -- If no reuse, we hit into the buffers discrimianted by device and + -- BufferType. These buffers are shared with all FFT convolution modules + -- and do not allow reuse for long dependences (i.e. only gradOutput can + -- only be reused from a supporting backward implementation) + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + return parent.getBufferKeyGeneric(self, BufferType) + end + + if not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + assert(false, "unknown memory reuse policy " .. 
self.memoryReusePolicy) + end + + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the transposed complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + isOutputOfAlgorithm = true + end + md.isOutputOfAlgorithm = isOutputOfAlgorithm + end + + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + md = nil + end + + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res +end diff --git a/fbcunn/SpatialConvolutionFFT.lua b/fbcunn/SpatialConvolutionFFT.lua new file mode 100644 index 0000000..1aa195e --- /dev/null +++ b/fbcunn/SpatialConvolutionFFT.lua @@ -0,0 +1,1012 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
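+-- A minimal usage sketch (illustrative only, kept inside a comment so it is
+-- not executed; the tensor sizes and the choice of subclass are hypothetical).
+-- The concrete subclasses, e.g. nn.SpatialConvolutionFBFFT, behave like
+-- ordinary nn convolution modules:
+--[[
+   local conv = nn.SpatialConvolutionFBFFT(
+      16, 32,    -- nInputPlane, nOutputPlane
+      3, 3,      -- kW, kH
+      1, 1,      -- dW, dH (FFT convolutions only support stride 1)
+      1, 1)      -- padLeft, padUp
+   conv = conv:cuda()
+   local input = torch.CudaTensor(8, 16, 27, 27):normal() -- batch x plane x H x W
+   local output = conv:forward(input)
+   local gradInput = conv:backward(input, output:clone():normal())
+--]]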
+ +-- TODO: Catch errors in general +-- TODO: Catch errors on cufft plan creation and cleanupBuffers +-- TODO: Cleanup buffers and make them independent of tasks +-- TODO: Auto-tuning + +require 'cudnn' +local List = require 'pl.List' +local thrift = require('fb.thrift') + +-- Float assumed, 4 bytes +local sizeOfElem = 4 + +local prec = 0.00002 + +local function isnan(n) return tostring(n) == tostring((-1)^.5) end + +-- Module + +local SpatialConvolutionFFT, parent = + torch.class('nn.SpatialConvolutionFFT', 'nn.Module') + +-- multi-key map indexed by {BufferType, deviceId, [size], [metaData]} +SpatialConvolutionFFT.cudaTensorBuffers = {} +SpatialConvolutionFFT.bufferMap = {} + +-- BufferType +SpatialConvolutionFFT.FFTInputBufferType = + "FFTInputBufferType" +SpatialConvolutionFFT.FFTOutputBufferType = + "FFTOutputBufferType" +SpatialConvolutionFFT.FFTWeightBufferType = + "FFTWeightBufferType" +SpatialConvolutionFFT.CuFFTInputTransposeBufferType = + "CuFFTInputTransposeBufferType" +SpatialConvolutionFFT.CuFFTOutputTransposeBufferType = + "CuFFTOutputTransposeBufferType" +SpatialConvolutionFFT.CuFFTWeightTransposeBufferType = + "CuFFTWeightTransposeBufferType" +SpatialConvolutionFFT.CuFFTPaddedInputBuffer = + "CuFFTPaddedInputBuffer" +SpatialConvolutionFFT.CuFFTPaddedWeightBuffer = + "CuFFTPaddedWeightBuffer" +SpatialConvolutionFFT.CuFFTPaddedOutputBuffer = + "CuFFTPaddedOutputBuffer" + +-- Convenience lists +SpatialConvolutionFFT.cudaRealBufferTypes = List{ + SpatialConvolutionFFT.CuFFTPaddedInputBuffer, + SpatialConvolutionFFT.CuFFTPaddedWeightBuffer, + SpatialConvolutionFFT.CuFFTPaddedOutputBuffer} +SpatialConvolutionFFT.cudaPaddedBufferTypes = List{ + SpatialConvolutionFFT.CuFFTPaddedInputBuffer, + SpatialConvolutionFFT.CuFFTPaddedWeightBuffer, + SpatialConvolutionFFT.CuFFTPaddedOutputBuffer} + +-- Memory reuse policy +SpatialConvolutionFFT.memoryReuseNone = "none" +SpatialConvolutionFFT.memoryReuseInput = "input" +SpatialConvolutionFFT.memoryReuseOutput = "output" +SpatialConvolutionFFT.memoryReuseWeight = "weight" +SpatialConvolutionFFT.memoryReuseAll = "all" + +-- Use to uniquely identify steps of this module and to properly track +-- producer-consumer dependences in the tagspace. 
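+-- Each instance bumps this counter in __init and derives a unique moduleUID
+-- from it; when memory reuse is enabled the moduleUID is folded into the
+-- buffer keys so that different module instances do not clobber each other's
+-- cached FFT results.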
+-- TODO: increment atomically in a multi-threaded environment +SpatialConvolutionFFT.moduleInstance = 0 + +-- Debug helper functions +local function wrapCUDNN_accGradParameters_gradWeight( + module, input, gradOutput, scale) + -- Needed to initialize all cudnn state properly + module:updateOutput(input) + module.gradBias:zero() + module.gradWeight:zero() + module:accGradParameters(input, gradOutput, scale) + return module.gradWeight +end + +local function wrapCUDNN_accGradParameters_gradBias( + module, input, gradOutput, scale) + -- Needed to initialize all cudnn state properly + module:updateOutput(input) + module.gradBias:zero() + module.gradWeight:zero() + module:accGradParameters(input, gradOutput, scale) + return module.gradBias +end + +function SpatialConvolutionFFT:debugVSCUDNN( + pass, module, selfModule, toTest, fun, param1, param2, param3) + local fftRes = toTest:float():clone() + + module.weight = selfModule.weight:clone() + module.bias = selfModule.bias:clone() + module.gradWeight = selfModule.gradWeight:clone() + module.gradBias = selfModule.gradBias:clone() + module.output = selfModule.output:clone() + module.gradInput = selfModule.gradInput:clone() + + local p1 = param1:contiguous() + local p2 + if param2 then + p2 = param2:contiguous() + end + local p3 = param3 + local cudnnRes = fun(module, p1, p2, p3) + + if self.printDebugLevel >= 2 then + print('FFTRES', {fftRes}, 'CUDNN', {cudnnRes}) + end + + local norm = math.sqrt(cudnnRes:float():dot(cudnnRes:float()) + 1e-8) + if isnan(fftRes:sum()) or + cudnnRes:float():dist(fftRes:float()) / norm > prec then + print(torch.type(self), ' error', pass, + cudnnRes:float():dist(fftRes:float()) / norm, prec) + print(torch.type(self), ' error', pass, + fftRes:min(), fftRes:max(), fftRes:mean(), fftRes:sum()) + if self.printDebugLevel >= 2 then + local diff = fftRes:float() - cudnnRes:float() + print('Expected\n', cudnnRes:float()) + print('Actual\n', fftRes:float()) + print('DIFFTENSOR\n', diff) + end + return false + elseif self.printDebugLevel >= 0 then + print(torch.type(self), ' debug vs CUDNN check passes ', + pass, fftRes:min(), fftRes:max(), fftRes:mean(), fftRes:sum()) + end + return true +end + +function SpatialConvolutionFFT:initCudaResources(numHandles, numStreams) + -- Init streams, handles and synchronization groups + cutorch.reserveBlasHandles(numHandles) + cutorch.reserveStreams(numStreams) + local allStreams = {} + for stream = 0, numStreams do + table.insert(allStreams, stream) + end + local allStreamsButDefault = {} + for stream = 1, numStreams do + table.insert(allStreamsButDefault, stream) + end + return allStreams, allStreamsButDefault +end + +function SpatialConvolutionFFT:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + parent.__init(self) + + self.printDebugLevel = -1 -- override manually + self.cudnnDebug = false -- override manually + self.nInputPlane = nInputPlane + self.nOutputPlane = nOutputPlane + self.kW = kW + self.kH = kH + self.dW = dW or 1 + self.dH = dH or 1 + + self.padLeft = padLeft or 0 + self.padUp = padUp or 0 + self.padRight = self.padLeft + self.padDown = self.padUp + + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) + self.bias = torch.Tensor(nOutputPlane) + self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) + self.gradBias = torch.Tensor(nOutputPlane) + + -- Temporary buffers, would be nice to reduce code size here + 
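+  -- Three buffer families per tensor (input / output / weight):
+  --   *Buffer          : frequency-domain (complex) representation
+  --   *TransposeBuffer : same data, transposed so that the per-frequency
+  --                      (batch x plane) complex matrices are contiguous
+  --                      for the CGEMM
+  --   *Padded          : zero-padded real copy, only used by the cufft path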
self.inputBuffer = nil + self.inputTransposeBuffer = nil + self.inputPadded = nil + self.outputBuffer = nil + self.outputTransposeBuffer = nil + self.outputPadded = nil + self.weightBuffer = nil + self.weightTransposeBuffer = nil + self.weightPadded = nil + + -- CuFFT plans, useless for fbfft + self.cufftPlanInputFFT = nil + self.cufftPlanWeightFFT = nil + self.cufftPlanOutputFFT = nil + self.cufftPlanInputIFFT = nil + self.cufftPlanWeightIFFT = nil + self.cufftPlanOutputIFFT = nil + + self:reset() + + self.numCudaStreams = numCudaStreams or 16 + self.numCublasHandles = self.numCudaStreams + self.allStreams = nil + self.allStreamsButDefault = nil + self.allStreams, self.allStreamsButDefault = + self:initCudaResources(self.numCublasHandles, self.numCudaStreams) + + -- List of buffers into multikey that we need to free + self.bufferKeys = List{} + + -- Memory reuse strategy + if not memoryReusePolicy or + memoryReusePolicy == nn.SpatialConvolutionFFT.memoryReuseNone + then + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseNone} + elseif memoryReusePolicy == nn.SpatialConvolutionFFT.memoryReuseAll + then + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseWeight} + elseif torch.type(self.memoryReusePolicy) == 'table' + then + if memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseAll) + then + self.memoryReusePolicy = + List{nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseWeight} + else + self.memoryReusePolicy = memoryReusePolicy + end + else + self.memoryReusePolicy = List{memoryReusePolicy} + end + + -- Use to uniquely identify steps of this module and to properly track + -- producer-consumer dependences in the tagspace. + SpatialConvolutionFFT.moduleInstance = + SpatialConvolutionFFT.moduleInstance + 1 -- TODO: increment atomically + -- Must be a unique name + self.moduleUID = + torch.type(self) .. "--instance=" .. SpatialConvolutionFFT.moduleInstance + -- set once at the beginning of every operation to keep track of the + -- 'timestep' + self.timeSteps = + { updateOutput = 0, updateGradInput = 0, accGradParameters = 0 } + + if self.printDebugLevel >= 0 then + print('Post init ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + -- List of fallback modules, one for each function (updateOutput, + -- updateGradInput, accGradParameters) + -- When they are set, just use the specified fallback for each pass. + self.fallbackModules = nil + self.recoverFromError = true + + -- Check vs reference result + self.cudnnChecks = true + + -- Support for tuned SpatialConvolution.lua + self.success = true + self.autotuningPass = false + self.reportErrors = true +end + +function SpatialConvolutionFFT:reset(stdv) + if stdv then + stdv = stdv * math.sqrt(3) + else + stdv = 1/math.sqrt(self.kW * self.kH * self.nInputPlane) + end + + if nn.oldSeed then + self.weight:apply(function() + return torch.uniform(-stdv, stdv) + end) + self.bias:apply(function() + return torch.uniform(-stdv, stdv) + end) + else + self.weight:uniform(-stdv, stdv) + self.bias:uniform(-stdv, stdv) + end +end + +-- Update output (i.e. 
forward prop) +function SpatialConvolutionFFT:updateOutput(input) + self.timeSteps.updateOutput = self.timeSteps.updateOutput + 1 + self.originalStream = cutorch.getStream() + local res = self:wrapFallback(self.updateOutputFFT, input) + cutorch.setStream(self.originalStream) + return res +end + +function SpatialConvolutionFFT:updateGradInput(input, gradOutput) + self.timeSteps.updateGradInput = self.timeSteps.updateGradInput + 1 + self.originalStream = cutorch.getStream() + local res = self:wrapFallback(self.updateGradInputFFT, input, gradOutput) + cutorch.setStream(self.originalStream) + return res +end + +function SpatialConvolutionFFT:accGradParameters( + input, gradOutput, scale) + self.timeSteps.accGradParameters = self.timeSteps.accGradParameters + 1 + self.originalStream = cutorch.getStream() + self:wrapFallback( + self.accGradParametersFFT, input, gradOutput, scale) + cutorch.setStream(self.originalStream) +end + +-- This function wraps calls to updateOutput, updateGradInput and +-- accGradParameters. If any error is encountered it cleans after itself and +-- calls the corresponding cudnn function. This acts as a failsafe mechanism in +-- case FFT runs out of memory which is not a trivial thing to determine +-- beforehand. The overhead is only paid on the first invocations, all +-- subsequent ones will default to cudnn after the first failure. +function SpatialConvolutionFFT:wrapFallback( + fun, input, gradOutput, scale, reuseList) + + if not self.fallbackModules then + local ok, res = pcall(fun, self, input, gradOutput, scale, reuseList) + if ok then + return res + end + if not self.recoverFromError then + error(res) + end + + if self.reportErrors then + print("Error: " .. res .. " -> fallback to cudnn") + end + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + if self.reportErrors then + print('Using tuned SpatialConvolution: found an error, early exit') + end + return nil + end + end + + -- This path is the fallback path where cudnn is subsituted for our module + -- This is becoming obsolete as everyone should now use + -- tuned SpatialConvolution.lua + if not self.collectedGarbage then + self:cleanupBuffers() + collectgarbage() + collectgarbage() + self.collectedGarbage = true + end + + self.fallbackModules = {} + if not self.fallbackModules[fun] then + cutorch.synchronize() + self.fallbackModules[fun] = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + -- run updateOutput once to initialize + self.fallbackModules[fun]:updateOutput(input) + end + + -- Pass along to cudnn module + self.fallbackModules[fun].weight = self.weight + self.fallbackModules[fun].bias = self.bias + self.fallbackModules[fun].gradWeight = self.gradWeight + self.fallbackModules[fun].gradBias = self.gradBias + local res = nil + if fun == self.updateOutputFFT then + res = self.fallbackModules[fun]:updateOutput(input) + self.output = res + elseif fun == self.updateGradInputFFT then + res = self.fallbackModules[fun]:updateGradInput(input, gradOutput) + self.gradInput = res + elseif fun == self.accGradParametersFFT then + self.fallbackModules[fun]:accGradParameters(input, gradOutput, scale) + self.gradWeight = self.fallbackModules[fun].gradWeight + self.gradBias = self.fallbackModules[fun].gradBias + else + error('Unknown call ' .. 
fun) + end + return res +end + +function SpatialConvolutionFFT:getNormalizationFactor(commonSizes, input) + if self.fftImplementation == 'fbfft' then + return commonSizes[3] * commonSizes[4] + elseif self.fftImplementation then + return (input:size(3) + self.padUp + self.padDown) * + (input:size(4) + self.padLeft + self.padRight) + end + error("Unknown fftImpl: " .. self.fftImplementation) +end + +function SpatialConvolutionFFT:backward(input, gradOutput, scale) + self.originalStream = cutorch.getStream() + scale = scale or 1 + self:updateGradInput(input, gradOutput) + self:wrapFallback(self.accGradParametersFFT, + input, + gradOutput, + scale, + List{self.outputTransposeBuffer}) + cutorch.setStream(self.originalStream) + return self.gradInput +end + +function SpatialConvolutionFFT:updateOutputFFTImpl() + assert(false, 'This is an abstract class, must use a derived implementation') +end + +function SpatialConvolutionFFT:updateGradInputFFTImpl() + assert(false, 'This is an abstract class, must use a derived implementation') +end + +function SpatialConvolutionFFT:accGradParametersFFTImpl() + assert(false, 'This is an abstract class, must use a derived implementation') +end + +function SpatialConvolutionFFT:updateOutputFFT(input, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + local nBatches = input:size(1) + + -- Allocate the output for this module, only once + if not self.output or self.output:nElement() == 0 then + self.output = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nOutputPlane, + input:size(3) + self.padUp + self.padDown - self.kH + 1, + input:size(4) + self.padLeft + self.padRight - self.kW + 1})) + end + + if self.printDebugLevel >= 2 then + print('PAD ', self.padUp, 'x', self.padLeft) + print('ORIGINAL INPUT', {input}) + print('ORIGINAL WEIGHT', {self.weight}) + self.output:zero() + print('ORIGINAL OUTPUT', {self.output}) + end + + -- Call the proper Impl + self:updateOutputFFTImpl(input, reuseList) + + if self.printDebugLevel >= 0 then + print('Post updateOutput ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + if self.printDebugLevel >= 2 then + print('FINAL INPUT', {input}) + print('COMPLEX INPUT POST FFT', {self.inputBuffer}) + print('COMPLEX INPUT POST TRANSPOSE', {self.inputTransposeBuffer}) + print('ORIGINAL WEIGHT', {self.weight}) + print('COMPLEX WEIGHT POST FFT', {self.weightBuffer}) + print('COMPLEX WEIGHT POST TRANSPOSE', {self.weightTransposeBuffer}) + print('OUTPUT CPLX TRANSPOSE POST MM', {self.outputTransposeBuffer}) + print('OUTPUT COMPLEX POST TRANSPOSE', {self.outputBuffer}) + print('OUTPUT REAL', {self.output}) + end + + if self.cudnnDebug then + local sp = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("updateOutput", + sp, + self, + self.output, + sp.updateOutput, + input) + sp = nil + collectgarbage() + collectgarbage() + end + + return self.output +end + + +-- Update input gradients +function SpatialConvolutionFFT:updateGradInputFFT(input, gradOutput, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + local nBatches = input:size(1) + -- Allocate the gradInput for this module, only once + if not self.gradInput or self.gradInput:nElement() == 0 then + self.gradInput = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nInputPlane, + input:size(3), + input:size(4)})) + end + + if 
self.printDebugLevel >= 2 then + print('PAD ', self.padUp, 'x', self.padLeft) + print('ORIGINAL gradOutput', gradOutput) + print('ORIGINAL WEIGHT', self.weight) + print('ORIGINAL GRADINPUT', self.gradInput) + end + + -- Call the proper Impl + self:updateGradInputFFTImpl(input, gradOutput, reuseList) + + if self.printDebugLevel >= 0 then + print('Post updateGradInput ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + if self.printDebugLevel >= 2 then + print('COMPLEX WEIGHT POST FFT', self.weightBuffer) + print('COMPLEX WEIGHT POST TRANSPOSE', self.weightTransposeBuffer) + print('COMPLEX GRADOUTPUT POST FFT', self.outputBuffer) + print('COMPLEX GRADOUTPUT POST TRANSPOSE', self.outputTransposeBuffer) + print('GRADINPUT COMPLEX POST MM', self.inputTransposeBuffer) + print('GRADINPUT COMPLEX PRE IFFT', self.inputBuffer) + print('REAL GRADINPUT', self.gradInput) + print('REAL GRADINPUT PADDED (cufft only)', self.inputPadded) + end + + if self.cudnnDebug then + local sp = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("updateGradInput", + sp, + self, + self.gradInput, + sp.updateGradInput, + input, + gradOutput) + sp = nil + collectgarbage() + collectgarbage() + end + + return self.gradInput +end + + +-- Accumulate weight gradients +function SpatialConvolutionFFT:accGradParametersFFT( + input, gradOutput, scale, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + if not self.gradWeight or self.gradWeight:nElement() == 0 then + assert(false, "GradWeight must already be allocated at module creation") + end + + if self.printDebugLevel >= 2 then + print('PAD ', self.padUp, 'x', self.padLeft) + print('ORIGINAL INPUT', {input}) + print('ORIGINAL OUTPUT', {gradOutput}) + print('ORIGINAL WEIGHT', {self.gradWeight}) + end + + -- Call the proper Impl + self:accGradParametersFFTImpl(input, gradOutput, scale, reuseList) + + if self.printDebugLevel >= 0 then + print('Post accGradParameters ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + if self.printDebugLevel >= 2 then + print('OUTPUT COMPLEX POST TRANSPOSE', {self.outputBuffer}) + print('OUTPUT CPLX TRANSPOSE POST MM', {self.outputTransposeBuffer}) + print('COMPLEX INPUT POST TRANSPOSE', {self.inputTransposeBuffer}) + print('COMPLEX INPUT POST FFT', {self.inputBuffer}) + print('COMPLEX WEIGHT POST FFT', {self.weightBuffer}) + print('COMPLEX WEIGHT POST TRANSPOSE', {self.weightTransposeBuffer}) + print('REAL GRADWEIGHT', {self.weightPadded}) + print('REAL GRADWEIGHT', {self.gradWeight}) + print("SCALE: " .. scale) + end + + if self.cudnnDebug then + local saveBias = self.gradBias:float():clone() + local sp = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("accGradParameters_gradWeight", + sp, + self, + self.gradWeight, + wrapCUDNN_accGradParameters_gradWeight, + input, + gradOutput, + scale) + + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("accGradParameters_gradBias", + sp, + self, + saveBias, + wrapCUDNN_accGradParameters_gradBias, + input, + gradOutput, + scale) + sp = nil + collectgarbage() + collectgarbage() + end +end + + +-- Buffer creation and reuse given a size and a pass. 
+-- Different passes use different tensors as the 'output of the pass'. +-- SpatialConvolutionFFT.ForwardFFTPass -> output +-- SpatialConvolutionFFT.BackwardFFTPass -> input +-- SpatialConvolutionFFT.AccGradientFFTPass -> weight +-- The buffers corresponding to the tensors that is the 'output of the pass' +-- must be properly transposed in order for the CGemm call to be consistent. +-- This is a simple metadata transposition, might as well construct properly. +-- +-- This function contains the least common denominator of buffers needed for +-- all implementations. + +SpatialConvolutionFFT.ForwardFFTPass = 1 +SpatialConvolutionFFT.BackwardFFTPass = 2 +SpatialConvolutionFFT.AccGradientFFTPass = 3 + +-- Meta-data is user specific metadata which influences the lifetime of the +-- buffers. Atm this is SpatialConvolutionFFTTiled-specific but if the network +-- is not too large, especially with parallel containers, this is a good +-- opportunity to reuse FFT computations. +function SpatialConvolutionFFT:prepareBuffers(commonSize, pass, metaData) + assert(commonSize and self.fftImplementation) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + local bufferSizesO = torch.LongStorage({ + commonSize[1], self.nOutputPlane, commonSize[3], commonSize[4]}) + local bufferSizesW = torch.LongStorage({ + self.nOutputPlane, self.nInputPlane, commonSize[3], commonSize[4]}) + + self.inputBuffer = + self:getBuffer( + SpatialConvolutionFFT.FFTInputBufferType, -- buffer type + commonSize, -- buffer size + false, -- transposeLayout + metaData) -- SpatialConvolutionFFTTiled-specific + self.outputBuffer = + self:getBuffer( + SpatialConvolutionFFT.FFTOutputBufferType, + bufferSizesO, + false, + metaData) -- SpatialConvolutionFFTTiled-specific + self.weightBuffer = + self:getBuffer( + SpatialConvolutionFFT.FFTWeightBufferType, + bufferSizesW, + false, + metaData) -- SpatialConvolutionFFTTiled-specific + + if self.inputBuffer and self.outputBuffer and self.weightBuffer then + return true + end + + -- TODO: From here on, we should failsafe to another SpatialConvolution + self:cleanupBuffers() + + error('Not enough memory for FFT buffers, need to fall back') +end + + +-- Returns nil if it cannot allocate a new buffer (for error recovery cases) +function SpatialConvolutionFFT:getBuffer( + BufferType, tensorSizes, transposedLayout, metaData) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + local d1 = tensorSizes[1] + local d2 = tensorSizes[2] + local d3 = tensorSizes[3] + local d4 = tensorSizes[4] + + local numElements = 0 + local sizes = torch.LongStorage({0}) + local isRealBuffer = SpatialConvolutionFFT.cudaRealBufferTypes:contains( + BufferType) + local isComplexBuffer = not isRealBuffer + + if isComplexBuffer then + -- fbfft and cufft have different layouts + assert(self.fftImplementation) + if self.fftImplementation == 'fbfft' then + numElements = d1 * d2 * (d3 / 2 + 1) * d4 * 2 + if transposedLayout then + -- The buffers corresponding to the tensors that is the + -- 'output of the pass' must be properly transposed in order for the + -- CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct + -- properly. 
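+        -- fbfft keeps the hermitian-symmetric half along d3 (hence d3/2 + 1),
+        -- while cufft (below) halves the last dimension d4. The transposed
+        -- layout moves the two frequency dimensions to the front so that each
+        -- frequency bin exposes a contiguous (d1 x d2) complex matrix to the
+        -- batched CGEMM.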
+ sizes = torch.LongStorage({d3 / 2 + 1, d4, d1, d2, 2}) + else + sizes = torch.LongStorage({d1, d2, d3 / 2 + 1, d4, 2}) + end + else + numElements = d1 * d2 * d3 * (d4 / 2 + 1) * 2 + if transposedLayout then + -- The buffers corresponding to the tensors that is the + -- 'output of the pass' must be properly transposed in order for the + -- CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct + -- properly. + sizes = torch.LongStorage({d3, d4 / 2 + 1, d1, d2, 2}) + else + sizes = torch.LongStorage({d1, d2, d3, d4 / 2 + 1, 2}) + end + end + else + -- Real buffers, for padding purposes in first approx + if self.fftImplementation == 'cufft' and + SpatialConvolutionFFT.cudaPaddedBufferTypes:contains(BufferType) then + numElements = d1 * d2 * d3 * d4 + -- TODO: potentially wasteful if original tensor is already of + -- tensorSizes. Could clean this up but requires knowing the original + -- tensor as a model for which we pad. + sizes = torch.LongStorage({d1, d2, d3, d4}) + end + -- else allocate an empty tensor, nil is reserved for errors + end + + assert(sizes and #sizes > 0) + + -- Conservative max buffer size, always needed at least by fbfft + -- Handle memory bloat by tiled convolutions + inplace fft + local bufferKey = self:getBufferKey(BufferType, sizes, metaData) + if SpatialConvolutionFFT.bufferMap[bufferKey] == nil then + local free_bytes = cutorch.getMemoryUsage() + if numElements * sizeOfElem > free_bytes then + return nil + end + + local before = cutorch.getMemoryUsage() + SpatialConvolutionFFT.bufferMap[bufferKey] = torch.CudaTensor(sizes) + local after = cutorch.getMemoryUsage() + if self.printDebugLevel >= 1 then + print('FFT Buffer Create Allocated ', before - after) + end + else + -- Storage already exists but may need resizing. + -- If resizing means expanding, make sure we have enough space + local t = SpatialConvolutionFFT.bufferMap[bufferKey] + if numElements > t:nElement() then + -- Don't call cuda API unless really needed + local free_bytes = cutorch.getMemoryUsage() + -- Resize is not in place, need to hold both in memory at some point + -- The subsequent resize cannot fail in cuda land or we're hosed and + -- cudaGetLastError will be 2. + if (numElements + t:nElement()) * sizeOfElem > free_bytes then + assert(false, 'Out of memory: cannot hold both tensors for resize') + end + local before = cutorch.getMemoryUsage() + t:resize(sizes) + local after = cutorch.getMemoryUsage() + if self.printDebugLevel >= 1 then + print('FFT Buffer Resize Allocated ', before - after) + end + else + -- Still need to resize to make the sizes / strides as expected but + -- this does cost extra memory + t:resize(sizes) + end + end + + local t = SpatialConvolutionFFT.bufferMap[bufferKey] + assert(t, 'Tensor buffer improperly set') + + for d = 1, t:nDimension() do + if (sizes[d] ~= t:size(d)) then + print("Put / get buffer dimension mismatch! d = ", d, " expected = ", + sizes, " actual = ", {t}) + assert(sizes[d] == t:size(d)) + end + end + + return t +end + +function SpatialConvolutionFFT:freeBuffer(bufferKey) + local tensor = SpatialConvolutionFFT.bufferMap[bufferKey] + if tensor then + SpatialConvolutionFFT.bufferMap[bufferKey] = nil + end +end + +-- Returns a string key, not hashed atm. +-- For instance, in SpatialConvolutionFFTTiled, this helps the creation of +-- different buffers for various tile tensorSize, tileSize and tileIndices. +-- This is important in order to reuse frequency domain representation +-- of tiled pieces of the tensors. 
+-- This allows trading off reuse for memory consumption. +-- +-- In FBFFT and CuFFT however, memory consumption can grow quickly so one should +-- only use a single buffer per BufferType. +-- If we had some user information that the buffers remain small enough, we +-- could have per module persistent buffers that would allow reuse. +function SpatialConvolutionFFT:getBufferKey(BufferType, bufferSizes, metaData) + assert(false, "getBufferKey controls buffers lifetime: must be overridden") +end + + +-- This implementation reuses buffers and keeps memory consumption minimal +-- (but this can still be a lot). +-- In particular, we only discriminate buffers by deviceId and type of buffer +-- by default. +-- This means we only have 1 copy of each type of buffer per device. +-- The same buffers are reused across any call of any module so the only +-- possible reuse is the reuse of gradOutput in the backward function. +-- This requires that backward be properly implemented in container modules +-- to allow such reuse. +-- For more advanced reuses, a proper getBufferKey function needs to be +-- implemented, tradeoffs will be made between reuse and memory consumption. +function SpatialConvolutionFFT:getBufferKeyGeneric(BufferType) + local bufferKey = { + SpatialConvolutionFFT.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res +end + +function SpatialConvolutionFFT:cleanupBuffers() + -- release all local result tensors and all buffers + self.output = nil + self.gradInput = nil + + -- Kill local references to global buffers + self.inputBuffer = nil + self.outputBuffer = nil + self.weightBuffer = nil + + -- Free all buffers + local len = self.bufferKeys:len() + for i = 1, len do + self:freeBuffer(self.bufferKeys:pop()) + end + + self.fallbackModules = {} + SpatialConvolutionFFT.cudaTensorBuffers = {} +end + + +-- Type: input/gradInput, output/gradOutput or weight/gradWeight +-- Could lookup bit operations in lua and do in 1 line, just use a loop atm +local function nextPowerOf2(val) + for i = 1, 10 do + if (2 ^ i) >= val then + return (2 ^ i) + end + end + assert(false, 'Too large a convolution dimensions: ', val) +end + +function SpatialConvolutionFFT:prepareCuFFTSizeAndBuffers( + i, w, o, metaData, pass) + local commonSize = i:size() + -- If we use cufft we should use rectangular regions where the width is a + -- power of 2. This is usually good enough approximation between FFT + -- efficiency and avoiding spurious work. 
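+  -- Worked example (hypothetical sizes): a 27x27 input with 3x3 kernels and
+  -- 1 pixel of padding on each side gives an interpolation height of
+  -- max(27 + 1 + 1, 3, 27) = 29, while the width is rounded up to
+  -- nextPowerOf2(29) = 32, so the cufft buffers are 29 x 32.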
+ commonSize[3] = + math.max(i:size(3) + self.padUp + self.padDown, + w:size(3), + o:size(3)) + commonSize[4] = + nextPowerOf2(math.max(i:size(4) + self.padLeft + self.padRight, + w:size(4), + o:size(4))) + self:prepareBuffers(commonSize, pass, metaData) + + assert(self.fftImplementation == "cufft", + "CuFFT convolution module expected!") + assert(self.inputPadded and self.weightPadded and self.outputPadded, + "CuFFT requires padded input, weight and output") + + if o == self.output then + self.inputPadded:zero() + self.weightPadded:zero() + elseif w == self.weight then + self.weightPadded:zero() + self.outputPadded:zero() + else + self.inputPadded:zero() + self.outputPadded:zero() + end + + return commonSize -- needed for normalization factor +end + +function SpatialConvolutionFFT:prepareFBFFTGemmSizeAndBuffers( + i, w, o, metaData, pass) + local commonSize = i:size() + -- If we use cufft we should use rectangular regions where the width is a + -- power of 2. This is usually good enough approximation between FFT + -- efficiency and avoiding spurious work. + commonSize[3] = + nextPowerOf2(math.max(i:size(3) + self.padUp + self.padDown, + i:size(4) + self.padLeft + self.padRight, + w:size(3), + w:size(4), + o:size(3), + o:size(4))) + commonSize[4] = commonSize[3] + self:prepareBuffers(commonSize, pass, metaData) + + assert(self.fftImplementation == "fbfft", + "FBFFT convolution module expected!") + assert(not self.inputPadded and not self.weightPadded and + not self.outputPadded, + "CuFFT requires padded input, weight and output") + + return commonSize -- needed for normalization factor +end + +local NO_TRANSPOSE = nil + +-- Makes or reuses square FFT buffers up to the next power of 2 +function SpatialConvolutionFFT:prepareFBFFTSizeAndBuffers(i, w, o, metaData) + local commonSize = i:size() + commonSize[3] = + nextPowerOf2(math.max(i:size(3) + self.padUp + self.padDown, + i:size(4) + self.padLeft + self.padRight, + w:size(3), + w:size(4), + o:size(3), + o:size(4))) + commonSize[4] = commonSize[3] + self:prepareBuffers(commonSize, NO_TRANSPOSE, metaData) + assert(self.fftImplementation == "fbfft", + "FBFFT convolution module expected!") + assert(not self.inputPadded and not self.weightPadded and + not self.outputPadded, + "FBFFT does not expect padded input, weight and output") + return commonSize -- needed for normalization factor +end + +function SpatialConvolutionFFT:setReuseInputs(val) + assert(type(val) == 'boolean') + self:_setReuse(val, nn.SpatialConvolutionFFT.memoryReuseInput) +end + +function SpatialConvolutionFFT:setReuseOutputs(val) + assert(type(val) == 'boolean') + self:_setReuse(val, nn.SpatialConvolutionFFT.memoryReuseOutput) +end + +function SpatialConvolutionFFT:setReuseWeights(val) + assert(type(val) == 'boolean') + self:_setReuse(val, nn.SpatialConvolutionFFT.memoryReuseWeight) +end + +function SpatialConvolutionFFT:_setReuse(val, toReuse) + assert(type(val) == 'boolean') + assert(toReuse == nn.SpatialConvolutionFFT.memoryReuseInput or + toReuse == nn.SpatialConvolutionFFT.memoryReuseOutput or + toReuse == nn.SpatialConvolutionFFT.memoryReuseWeight, + toReuse) + + if val then + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) then + -- Override + self.memoryReusePolicy = List{toReuse} + elseif self.memoryReusePolicy:contains(toReuse) then + -- Do nothing + return + else + self.memoryReusePolicy:append(toReuse) + end + else + if self.memoryReusePolicy:contains(toReuse) then + self.memoryReusePolicy:remove_value(toReuse) + -- Set at least 
"none" + if self.memoryReusePolicy:len() == 0 then + self.memoryReusePolicy:append( + nn.SpatialConvolutionFFT.memoryReuseNone) + end + else + -- Do nothing + return + end + end +end diff --git a/fbcunn/SpatialConvolutionFFTTiled.lua b/fbcunn/SpatialConvolutionFFTTiled.lua new file mode 100644 index 0000000..bf69280 --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiled.lua @@ -0,0 +1,924 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. + +require 'cudnn' +local List = require 'pl.List' +local thrift = require('fb.thrift') + +local function errorIf(cond, msg) + if cond then + error(msg) + end +end + +local function errorIfNot(cond, msg) + errorIf(not cond, msg) +end + +--[[ + Move to Tensor.lua + + This helper funtion returns a pl.List of 2-D tiled views into the tensor + passed in input, corresponding to tiling by the specified tiles sizes, with + specfied step sizes and implicit padding sizes. + Tiling is performed on the innermost 2 dimensions so tensor:nDimension must + be >= 2. + + -- TileDescriptor "declaration" + local TiledTensorDescriptor = {} + -- Original tile sizes asked for for proper Fourier basis decomposition + TiledTensorDescriptor.tileSizeH = tileSizeH + TiledTensorDescriptor.tileSizeW = tileSizeW + -- Index of the tile in tile space + TiledTensorDescriptor.tileIndexH = tileIndexH + TiledTensorDescriptor.tileIndexW = tileIndexW + -- Actual tensor size, full tiles have tensorSize == tileSize + TiledTensorDescriptor.tensorSizeH = tensorSizeH + TiledTensorDescriptor.tensorSizeW = tensorSizeW + -- Up and Left padding for up and left boundary tile. + -- Down and Right are obtained by implicit zero padding up to + -- original tile size + TiledTensorDescriptor.padUp = padUp + TiledTensorDescriptor.padLeft = padLeft + -- The view in the original tensor + TiledTensorDescriptor.tensor = torch.Tensor() + + By default tiling returns all the subtensors, including partial tensors on + the boundaries, that have at least one element when traversed by + tileSizeH x tileSizeW with stride stepH x stepW. + When performing convolutions, tiling semantics may not be sufficient. + For consistency, the tiling of the tensor written into, informs how many + tiles we should obtain from the tensor read from; this information is + conveyed by numTilesH x numTilesW. + The consistency check is that the tiling of the tensor read from, must always + cover the full read tensor. +--]] +local function TiledView2D(tensor, + tileSizeH, + tileSizeW, + stepH, + stepW, + padLeft, + padUp, + padRight, + padDown, + numTilesH, + numTilesW) + -- Initialization + local stepH = stepH or tileSizeH + local stepW = stepW or tileSizeW + local padUp = padUp or 0 + local padLeft = padLeft or 0 + local padDown = padDown or 0 + local padRight = padRight or 0 + local dimIndexH = tensor:nDimension() - 1 + local dimIndexW = tensor:nDimension() + local numTilesH = numTilesH or 1e100 -- maxint would be nice + local numTilesW = numTilesW or 1e100 -- maxint would be nice + + local printDebugLevel = -1 + if printDebugLevel >= 1 then + print("Tile ", tensor:size(), " by ", tileSizeH, "x", tileSizeW, + " with step ", stepH, "x", stepW, " and pad ", + padUp, "x", padLeft, "x", padDown, "x", padRight) + end + + -- Input validation, reject padding larger than tile size or kernel size + assert(tensor:nDimension() >= 2) + assert(tileSizeH and tileSizeW, 'both tile sizes must be specified') + assert(padUp >= 0 and padUp < tileSizeH, "padUp = " .. padUp .. + " >= (incompatible with) tileSizeH = " .. 
tileSizeH) + assert(padLeft >= 0 and padLeft < tileSizeW, "padLeft = " .. padLeft .. + " >= (incompatible with) with tileSizeW = " .. tileSizeW) + assert(padDown >= 0 and padDown < tileSizeH, "padDown = " .. padDown .. + " >= (incompatible with) with tileSizeH = " .. tileSizeH) + assert(padRight >= 0 and padRight < tileSizeW, "padRight = " .. padRight .. + " >= (incompatible with) with tileSizeW = " .. tileSizeW) + assert(tileSizeW > 0 and tileSizeH > 0, "") + assert(stepH > 0 and stepW > 0, + "Step sizes " .. stepH .. " x " .. stepW .. " both expected > 1. " .. + "Otherwise, tileSize <= kernel size which should not occur") + assert(padUp >= 0 and padDown >= 0 and padLeft >= 0 and padRight >= 0) + errorIfNot(tileSizeH < tensor:size(dimIndexH), + "Tiling must be smaller than tensor size !") + errorIfNot(tileSizeW < tensor:size(dimIndexW), + "Tiling must be smaller than tensor size !") + assert(#tensor:size() == dimIndexW and #tensor:stride() == dimIndexW) + + -- TileDescriptor generating loop + local maxTileIndexH = 0 + local maxTileIndexW = 0 + local tensors = List{} + local tensorStrideH = tensor:stride(dimIndexH) + local tensorStrideW = tensor:stride(dimIndexW) + local tileIndexH = 0 + for y = -padUp + 1, tensor:size(dimIndexH), stepH do + + -- Continue would be nice here to avoid level of nesting ! + if tileIndexH < numTilesH then + + local tileIndexW = 0 + for x = -padLeft + 1, tensor:size(dimIndexW), stepW do + + -- Continue would be nice here to avoid level of nesting ! + if tileIndexW < numTilesW then + + -- Descriptor for each tiled tensor + local TiledTensorDescriptor = {} + + -- Handle special boundary case for partial tile along y + local tensorSizeH = 0 + if y <= 0 then + tensorSizeH = tileSizeH + (y-1) + TiledTensorDescriptor.padUp = -(y-1) -- padUp + else + -- If we generate a tile, make sure its size does not overflow + tensorSizeH = math.max( + 1, math.min(tileSizeH, tensor:size(dimIndexH) - (y-1))) + TiledTensorDescriptor.padUp = 0 + end + TiledTensorDescriptor.tensorSizeH = tensorSizeH + TiledTensorDescriptor.tileIndexH = tileIndexH + + -- Handle special boundary case for partial tile along x + local tensorSizeW = 0 + if x <= 0 then + tensorSizeW = tileSizeW + (x-1) + TiledTensorDescriptor.padLeft = -(x-1) -- padLeft + else + -- If we generate a tile, make sure its size does not overflow + tensorSizeW = math.max( + 1, math.min(tileSizeW, tensor:size(dimIndexW) - (x-1))) + TiledTensorDescriptor.padLeft = 0 + end + TiledTensorDescriptor.tensorSizeW = tensorSizeW + TiledTensorDescriptor.tileIndexW = tileIndexW + + -- Allocate tensor with partial or full size and full stride + -- for proper wraparound + local sizes = + torch.LongStorage(tensor:nDimension()):copy(tensor:size()) + sizes[#sizes - 1] = tensorSizeH + sizes[#sizes] = tensorSizeW + local tensorTiled = torch.Tensor():typeAs(tensor) + tensorTiled:set( + tensor:storage(), + tensor:storageOffset() + + math.max((y-1), 0) * tensorStrideH + + math.max((x-1), 0) * tensorStrideW, + sizes, + tensor:stride()) + + TiledTensorDescriptor.tileSizeH = tileSizeH + TiledTensorDescriptor.tileSizeW = tileSizeW + TiledTensorDescriptor.tensor = tensorTiled + + -- Handling partial til on the bottom and right sides + -- Important to get interpolation right in frequency domain + tensors:append(TiledTensorDescriptor) + + if printDebugLevel >= 1 then + print('y = ' .. y .. ' x = ' .. x .. + ' tile index = ' .. tileIndexH .. ' x '.. 
tileIndexW) + print(TiledTensorDescriptor) + if printDebugLevel >= 2 then + print(TiledTensorDescriptor.tensor) + end + end + + assert(tensor:size(dimIndexH) + padUp + padDown - + tileIndexH * stepH > 0, "Error tileIndexH = " .. + tileIndexH .. " stepH = " .. stepH) + assert(tensor:size(dimIndexW) + padLeft + padRight - + tileIndexW * stepW > 0, "Error tileIndexW = " .. + tileIndexW .. " stepW = " .. stepW) + assert(tensorSizeH > 0, 'tensorSizeH = ' .. tensorSizeH) + assert(tensorSizeW > 0, 'tensorSizeW = ' .. tensorSizeW) + assert(y <= tensor:size(dimIndexH), 'Overflow y = ' .. y .. + ' > size = ' .. tensor:size(dimIndexH)) + assert(x <= tensor:size(dimIndexW), 'Overflow x = ' .. x .. + ' > size = ' .. tensor:size(dimIndexW)) + + + if maxTileIndexW < tileIndexW then + maxTileIndexW = tileIndexW + end + tileIndexW = tileIndexW + 1 + else -- if tileIndexW < numTilesW + assert(x + tileSizeW - stepW >= tensor:size(dimIndexW)) + end -- if tileIndexW < numTilesW + end -- for x + + if maxTileIndexH < tileIndexH then + maxTileIndexH = tileIndexH + end + tileIndexH = tileIndexH + 1 + else -- if not tileIndexH < numTilesH + assert(y + tileSizeH - stepH >= tensor:size(dimIndexH)) + end -- if tileIndexH < numTilesH + end -- for y + + return tensors, maxTileIndexH, maxTileIndexW +end + +-- Not really a string but I want to print this structure +local function TiledTensorDescriptorToString(TiledTensorDescriptor) + local toPrint = {} + toPrint.td = TiledTensorDescriptor + toPrint.tensorAddress = TiledTensorDescriptor.tensor:cdata() + toPrint.storageAddress = TiledTensorDescriptor.tensor:storage():cdata() + toPrint.storageOffset = TiledTensorDescriptor.tensor:storageOffset() + return toPrint +end + +local function _printDebugAndAssert( + debugLevel, index, inputTensorList, outputTensorList) + if debugLevel == 1 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index])) + elseif debugLevel >= 2 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + inputTensorList[index].tensor, + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index]), + outputTensorList[index].tensor) + end + + -- Assert tiles are traversed in the same order otherwise + -- you can forget about correctness + assert(outputTensorList[index].tileIndexH == + inputTensorList[index].tileIndexH) + assert(outputTensorList[index].tileIndexW == + inputTensorList[index].tileIndexW) +end + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiled, parent = + torch.class('nn.SpatialConvolutionFFTTiled', 'nn.SpatialConvolutionFBFFT') + +function SpatialConvolutionFFTTiled:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(padLeft == nil or torch.type(padLeft) == 'number') + assert(padUp == nil or torch.type(padUp) == 'number') + + assert(tileSizeW == nil or torch.type(tileSizeW) == 'number') + 
assert(tileSizeH == nil or torch.type(tileSizeH) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + assert(self.padLeft == padLeft or self.padLeft == 0) + assert(self.padUp == padUp or self.padUp == 0) + assert(self.padRight == self.padLeft) + assert(self.padDown == self.padUp) + + assert(self.fftImplementation == 'fbfft') + + assert(self.padUp < self.kH and self.padDown < self.kH and + self.padLeft < self.kW and self.padRight < self.kW, + "Padding must be smaller than kernel") + + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + + -- Temporary buffers, would be nice to reduce code size here + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.inputPadded) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.outputPadded) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) + assert(not self.weightPadded) + + -- CuFFT plans, useless for fbfft + assert(not self.cufftPlanInputFFT) + assert(not self.cufftPlanWeightFFT) + assert(not self.cufftPlanOutputFFT) + assert(not self.cufftPlanInputIFFT) + assert(not self.cufftPlanWeightIFFT) + assert(not self.cufftPlanOutputIFFT) + + self:reset() + + -- Tiling metadata + self.tileSizeH = tileSizeH or 16 + self.tileSizeW = tileSizeW or 16 + -- updateOutput + self.inputTensorList = nil + self.outputTensorList = nil + -- updateGradInput + self.gradInputTensorList = nil + self.gradOutputTensorList = nil + -- accGradParameters + self.inputTensorList2 = nil + self.gradOutputTensorList2 = nil +end + + +local function printDebugAndAssert( + debugLevel, index, inputTensorList, outputTensorList) + if debugLevel == 1 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index])) + elseif debugLevel >= 2 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + inputTensorList[index].tensor, + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index]), + outputTensorList[index].tensor) + end + + -- Assert tiles are traversed in the same order otherwise + -- you can forget about correctness + assert(outputTensorList[index].tileIndexH == + inputTensorList[index].tileIndexH) + assert(outputTensorList[index].tileIndexW == + inputTensorList[index].tileIndexW) +end + + +function SpatialConvolutionFFTTiled:pushPadding(index, tensorList) + 
local savePadUp, savePadLeft, savePadDown, savePadRight + savePadUp, self.padUp = self.padUp, tensorList[index].padUp + savePadLeft, self.padLeft = self.padLeft, tensorList[index].padLeft + -- Complete padding up to tile size so that interpolation + -- occurs in the right Fourier basis + savePadDown, self.padDown = + self.padDown, math.max( + 0, tensorList[index].tileSizeH - + (self.padUp + tensorList[index].tensorSizeH)) + savePadRight, self.padRight = + self.padRight, math.max( + 0, tensorList[index].tileSizeW - + (self.padLeft + tensorList[index].tensorSizeW)) + + return savePadUp, savePadLeft, savePadDown, savePadRight +end + + +function SpatialConvolutionFFTTiled:pushPaddingWithCircularSymmetry( + index) + local savePadUp, savePadLeft, savePadDown, savePadRight + -- Fun with padding and circular symmetry in Fourier domain + -- This acts upon shifting the IFFT result into the proper position + -- into gradInput + savePadUp, self.padUp = + self.padUp, self.kH - 1 + self.gradInputTensorList[index].padUp - + self.gradOutputTensorList[index].padUp + savePadLeft, self.padLeft = + self.padLeft, self.kW - 1 + self.gradInputTensorList[index].padLeft - + self.gradOutputTensorList[index].padLeft + -- Complete padding up to tile size so that interpolation + -- occurs in the right Fourier basis. + -- The invariant is that the size of gradOutput and gradInput should + -- always be padded up to the tiling size. In the particular case + -- of gradInput, we must additionally consider input padding. + + assert(self.gradOutputTensorList[index].tensorSizeH) + assert(self.gradInputTensorList[index].tensorSizeH) + + savePadDown, self.padDown = + self.padDown, + math.max(0, self.tileSizeH - math.max( + self.gradOutputTensorList[index].tensorSizeH, + self.gradInputTensorList[index].tensorSizeH + self.padUp)) + savePadRight, self.padRight = + self.padRight, + math.max(0, self.tileSizeW - math.max( + self.gradOutputTensorList[index].tensorSizeW, + self.gradInputTensorList[index].tensorSizeW + self.padLeft)) + return savePadUp, savePadLeft, savePadDown, savePadRight +end + +function SpatialConvolutionFFTTiled:updateOutputFFTImpl(input) + local ok, res = + pcall(SpatialConvolutionFFTTiled.abstractUpdateOutputFFTImpl, self, input) + if ok then + return res + end + self.success = false + if self.reportErrors then + print(res .. " -> updateOutput fallback to untiled FBFFT") + end + + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + error('Using tuned SpatialConvolution and found an error, early exit') + end + + error("Bug in fallback form Tiled to FBFFT on updateOutput" .. 
+ " Drop back higher up in the food chain") + -- This path is becoming obsolete + -- Safety barrier and no reuse for error recovery + self.memoryReusePolicy = List{ + nn.SpatialConvolutionFFT.memoryReuseNone} + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return parent.updateOutputFFTImpl(self, input) +end + + +function SpatialConvolutionFFTTiled:instUpdateOutputFFTImpl( + input, gradOutput) + assert(false, "Do not call the abstract class directly!") +end + + +function SpatialConvolutionFFTTiled:abstractUpdateOutputFFTImpl(input) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local nBatches = input:size(1) + -- Allocate the output for this module, only once + if not self.output or self.output:nElement() == 0 then + self.output = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nOutputPlane, + input:size(3) + self.padUp + self.padDown - self.kH + 1, + input:size(4) + self.padLeft + self.padRight - self.kW + 1})) + end + + errorIf(self.tileSizeH > self.output:size(3) or + self.tileSizeW > self.output:size(4), + 'Tile size too large (' .. self.tileSizeH .. 'x' .. self.tileSizeW .. + ') for output (' .. self.output:size(3) .. 'x' .. + self.output:size(4) .. ')') + + -- Perform tiling on meta-tensor list + if not self.inputTensorList or + not self.outputTensorList or + not self.metaDataListUpdateOutput + then + self.inputTensorList = nil + self.outputTensorList = nil + self.metaDataListUpdateOutput = nil + local maxTileIndexH + local maxTileIndexW + -- In updateOutputTiled, the tiling of output is without overlap + -- and without padding. It informs how the tiling on padded input + -- should be performed + self.outputTensorList, maxTileIndexH, maxTileIndexW = + TiledView2D(self.output, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1) + self.inputTensorList = TiledView2D(input, + self.tileSizeH, + self.tileSizeW, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.padLeft, + self.padUp, + self.padRight, + self.padDown, + maxTileIndexH + 1, + maxTileIndexW + 1) + + self.metaDataListUpdateOutput = List{} + for i = 1, self.inputTensorList:len() do + local metaData = self:makeMetaData( + nn.SpatialConvolutionFFT.ForwardFFTPass, + self.inputTensorList[i].tileIndexW, + self.inputTensorList[i].tileIndexH, + self.outputTensorList[i].tileIndexW, + self.outputTensorList[i].tileIndexH) + -- By default skip bias when offloading computation to FBFFT + -- and do it at the very end + metaData.skipBias = true + self.metaDataListUpdateOutput:append(metaData) + end + end + + errorIfNot(self.outputTensorList:len() == self.inputTensorList:len(), + "Error in tile metadata: not the same sizes input = " .. + self.inputTensorList:len() .. " VS output = " .. + self.outputTensorList:len()) + + -- At this point tiles / metadata for buffer management / reuse are available + -- in self.xyz just call the actual instantiation + + return self:instUpdateOutputFFTImpl(input) +end + + +function SpatialConvolutionFFTTiled:updateGradInputFFTImpl(input, gradOutput) + local ok, res = + pcall(SpatialConvolutionFFTTiled.abstractUpdateGradInputFFTImpl, + self, + input, + gradOutput) + if ok then + return res + end + self.success = false + if self.reportErrors then + print(res .. 
" -> updateGradInput fallback to untiled FBFFT") + end + + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + error('Using tuned SpatialConvolution and found an error, early exit') + end + + error("Bug in fallback form Tiled to FBFFT on updateGradInput" .. + " Drop back higher up in the food chain") + -- Safety barrier and no reuse for error recovery + self.memoryReusePolicy = List{ + nn.SpatialConvolutionFFT.memoryReuseNone} + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return parent.updateGradInputFFTImpl(self, input, gradOutput) +end + +function SpatialConvolutionFFTTiled:instUpdateGradInputFFTImpl( + input, gradOutput) + assert(false, "Do not call the abstract class directly!") +end + +function SpatialConvolutionFFTTiled:abstractUpdateGradInputFFTImpl( + input, gradOutput) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local nBatches = input:size(1) + + -- Allocate the gradInput for this module, only once + if not self.gradInput or self.gradInput:nElement() == 0 then + self.gradInput = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nInputPlane, + input:size(3), + input:size(4)})) + else + errorIfNot(self.gradInput:size(1) == input:size(1)) + errorIfNot(self.gradInput:size(2) == input:size(2)) + errorIfNot(self.gradInput:size(3) == input:size(3)) + errorIfNot(self.gradInput:size(4) == input:size(4)) + end + + errorIf(self.tileSizeH > gradOutput:size(3) or + self.tileSizeW > gradOutput:size(4), + 'Tile size too large (' .. self.tileSizeH .. 'x' .. self.tileSizeW .. + ') for gradOutput (' .. gradOutput:size(3) .. 'x' .. + gradOutput:size(4) .. ')') + + -- Perform tiling on meta-tensor list + if not self.gradOutputTensorList or + not self.gradInputTensorList or + not self.metaDataListUpdateGradInput + then + self.gradOutputTensorList = nil + self.gradInputTensorList = nil + self.metaDataListUpdateGradInput = nil + local maxTileIndexH + local maxTileIndexW + -- In updateGradInputTiled, the tiling of gradInput is without overlap + -- and with padding. It informs how the tiling on padded gradOutput + -- should be performed. + self.gradInputTensorList, maxTileIndexH, maxTileIndexW = + TiledView2D(self.gradInput, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.padLeft, + self.padUp, + self.padRight, + self.padDown) + self.gradOutputTensorList = TiledView2D(gradOutput, + self.tileSizeH, + self.tileSizeW, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.kW - 1, + self.kH - 1, + self.kW - 1, + self.kH - 1, + maxTileIndexH + 1, + maxTileIndexW + 1) + self.metaDataListUpdateGradInput = List{} + for i = 1, self.gradInputTensorList:len() do + local metaData = self:makeMetaData( + nn.SpatialConvolutionFFT.BackwardFFTPass, + self.gradInputTensorList[i].tileIndexW, + self.gradInputTensorList[i].tileIndexH, + self.gradOutputTensorList[i].tileIndexW, + self.gradOutputTensorList[i].tileIndexH) + self.metaDataListUpdateGradInput:append(metaData) + end + end + + errorIfNot(self.gradInputTensorList:len() == self.gradOutputTensorList:len(), + "Not the same sizes input = " .. self.gradOutputTensorList:len() .. + " VS output = " .. 
self.gradInputTensorList:len()) + + + -- At this point tiles / metadata for buffer management / reuse are available + -- in self.xyz just call the actual instantiation + + return self:instUpdateGradInputFFTImpl(input, gradOutput) +end + + +function SpatialConvolutionFFTTiled:accGradParametersFFTImpl( + input, gradOutput, scale) + local ok, res = + pcall(SpatialConvolutionFFTTiled.abstractAccGradParametersFFTImpl, + self, + input, + gradOutput, + scale) + if ok then + return res + end + self.success = false + if self.reportErrors then + print(res .. " -> accGradParameters fallback to untiled FBFFT") + end + + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + error('Using tuned SpatialConvolution and found an error, early exit') + end + + error("Bug in fallback form Tiled to FBFFT on accGradParametersFFTImpl" .. + " Drop back higher up in the food chain") + -- Safety barrier and no reuse for error recovery + self.memoryReusePolicy = List{ + nn.SpatialConvolutionFFT.memoryReuseNone} + -- ############################################## + cutorch.streamBarrier(self.allStreams) + parent.accGradParametersFFTImpl(self, input, gradOutput, scale) +end + + +function SpatialConvolutionFFTTiled:instAccGradParametersFFTImpl( + input, gradOutput) + assert(false, "Do not call the abstract class directly!") +end + + +function SpatialConvolutionFFTTiled:abstractAccGradParametersFFTImpl( + input, gradOutput, scale) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local scale = scale or 1 + local nBatches = input:size(1) + + -- Allocate the gradWeight for this module, only once + if not self.gradWeight or self.gradWeight:nElement() == 0 then + errorIfNot(false, + "GradWeight must already be allocated at module creation") + self.gradWeight = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nInputPlane, + self.kH, + self.kW})) + end + + errorIf(self.tileSizeH > gradOutput:size(3) or + self.tileSizeW > gradOutput:size(4), + 'Tile size too large (' .. self.tileSizeH .. 'x' .. self.tileSizeW .. + ') for gradOutput (' .. gradOutput:size(3) .. 'x' .. + gradOutput:size(4) .. ')') + + -- Perform tiling on meta-tensor list + if not self.gradOutputTensorList2 or + not self.inputTensorList2 or + not self.metaDataListAccGrad then + self.gradOutputTensorList2 = nil + self.inputTensorList2 = nil + self.metaDataListAccGrad = nil + local maxTileIndexH + local maxTileIndexW + errorIfNot(self.tileSizeH >= self.kH, + 'Tiling cannot be smaller than kernel !') + errorIfNot(self.tileSizeW >= self.kW, + 'Tiling cannot be smaller than kernel !') + -- In updateGradInputTiled, the tiling of gradOutput is without overlap + -- and without padding. It informs how the tiling on padded input + -- should be performed. 
+ self.gradOutputTensorList2, maxTileIndexH, maxTileIndexW = + TiledView2D(gradOutput, + self.tileSizeH - (self.kH - 1), + self.tileSizeW - (self.kW - 1), + self.tileSizeH - (self.kH - 1), + self.tileSizeW - (self.kW - 1)) + self.inputTensorList2 = TiledView2D(input, + self.tileSizeH, + self.tileSizeW, + self.tileSizeH - (self.kH - 1), + self.tileSizeW - (self.kW - 1), + self.padLeft, + self.padUp, + self.padRight, + self.padDown, + maxTileIndexH + 1, + maxTileIndexW + 1) + + self.metaDataListAccGrad = List{} + for i = 1, self.inputTensorList2:len() do + local metaData = self:makeMetaData( + nn.SpatialConvolutionFFT.AccGradientFFTPass, + self.inputTensorList2[i].tileIndexW, + self.inputTensorList2[i].tileIndexH, + self.gradOutputTensorList2[i].tileIndexW, + self.gradOutputTensorList2[i].tileIndexH) + self.metaDataListAccGrad:append(metaData) + end + end + + errorIfNot(self.inputTensorList2:len() == self.gradOutputTensorList2:len(), + "Not the same sizes input = " .. self.gradOutputTensorList2:len() .. + " VS output = " .. self.inputTensorList2:len()) + + -- At this point tiles / metadata for buffer management / reuse are available + + self:instAccGradParametersFFTImpl(input, gradOutput, scale) +end + +-- Makes or reuses square FFT buffers up to the next power of 2 +function SpatialConvolutionFFTTiled:prepareSizeAndBuffers(i, w, o, metaData) + return parent.prepareSizeAndBuffers(self, i, w, o, metaData) +end + +function SpatialConvolutionFFTTiled:makeMetaData( + pass, + inputTileIndexW, inputTileIndexH, + outputTileIndexW, outputTileIndexH, + weightTileIndexW, weightTileIndexH) + local metaData = {} + metaData.pass = pass + metaData.input = {} + metaData.input.tileIndexH = inputTileIndexH + metaData.input.tileIndexW = inputTileIndexW + metaData.output = {} + metaData.output.tileIndexH = outputTileIndexH + metaData.output.tileIndexW = outputTileIndexW + metaData.weight = {} + metaData.weight.tileIndexH = weightTileIndexH + metaData.weight.tileIndexW = weightTileIndexW + return metaData +end + +-- Discriminated buffers based on bufferType, bufferSize, tileIndex and +-- whether it is an input or an output "of the algorithm" +function SpatialConvolutionFFTTiled:getBufferKey( + BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', + torch.type(metaData)) + + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + if BufferType == nn.SpatialConvolutionFFT.FFTInputBufferType then + md.tileIndices = metaData.input + elseif BufferType == nn.SpatialConvolutionFFT.FFTOutputBufferType then + md.tileIndices = metaData.output + else + md.tileIndices = metaData.weight + end + + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the tiled complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTOutputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTInputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == 
nn.SpatialConvolutionFFT.FFTWeightBufferType) + then + isOutputOfAlgorithm = true + end + md.isOutputOfAlgorithm = isOutputOfAlgorithm + end + + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + if torch.type(self) ~= "nn.SpatialConvolutionFFTTiledAsync" then + -- if we run async we must have multiple tiles live at the same time, + -- just let all tiles be live at the same time + md = nil + end + end + + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + + if self.printDebugLevel >= 3 then + print("BufferKey: ", bufferKey) + print("Serialized to : ", res) + end + + return res +end + +function SpatialConvolutionFFTTiled:cleanupBuffers() + parent.cleanupBuffers(self) + + -- Tiling metadata + -- updateOutput + self.inputTensorList = nil + self.outputTensorList = nil + self.metaDataListUpdateOutput = nil + -- updateGradInput + self.gradInputTensorList = nil + self.gradOutputTensorList = nil + self.metaDataListUpdateGradInput = nil + -- accGradParameters + self.inputTensorList2 = nil + self.gradOutputTensorList2 = nil + self.metaDataListAccGrad = nil + +end diff --git a/fbcunn/SpatialConvolutionFFTTiledAsync.lua b/fbcunn/SpatialConvolutionFFTTiledAsync.lua new file mode 100644 index 0000000..55f6198 --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiledAsync.lua @@ -0,0 +1,369 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
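The asynchronous variant defined in this new file pipelines one tile per CUDA stream: each tile's FFTs, the transposed complex GEMM, and the IFFT are queued on the current stream, the stream index is rotated round-robin, and stream barriers join everything before the bias is applied. Below is a minimal sketch of just that scheduling skeleton, assuming a few extra streams have been reserved beforehand; `forEachTileAsync`, `workOnTile` and `allStreams` are placeholder names for illustration, not identifiers from this patch.

-- Round-robin stream scheduling sketch (illustration only).
local function forEachTileAsync(tiles, workOnTile)
   -- assumes extra streams were reserved at startup, e.g. cutorch.reserveStreams(4)
   local allStreams = {}
   for s = 0, cutorch.getNumStreams() do
      table.insert(allStreams, s)              -- stream 0 is the default stream
   end
   cutorch.streamBarrier(allStreams)           -- start from a clean slate
   local currentStream = 1
   for i = 1, #tiles do
      cutorch.setStream(currentStream)         -- kernels issued below land on this stream
      workOnTile(tiles[i])                     -- e.g. FFT + transposeMM + IFFT for one tile
      currentStream = currentStream % cutorch.getNumStreams() + 1
   end
   cutorch.streamBarrier(allStreams)           -- join before reading any result
   cutorch.setStream(0)                        -- back to the default stream
end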
+ +require 'cudnn' +local List = require 'pl.List' +local ffi = require 'ffi' + +local lib_name = 'torch_fb_fbcunn_mm' +local lib_path = package.searchpath(lib_name, package.cpath) +local FBMMFFI = ffi.load(lib_path and lib_path or lib_name) + +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +local function errorIf(cond, msg) + if cond then + error(msg) + end +end + +local function errorIfNot(cond, msg) + errorIf(not cond, msg) +end + +local function equalsTiledTensorDescriptor(td1, td2) + local res = true + if td1.tileSizeH ~= td2.tileSizeH then + res = res and false + end + if td1.tileSizeW ~= td2.tileSizeW then + res = res and false + end + if td1.tileIndexH ~= td2.tileIndexH then + res = res and false + end + if td1.tileIndexW ~= td2.tileIndexW then + res = res and false + end + if td1.tensorSizeH ~= td2.tensorSizeH then + res = res and false + end + if td1.tensorSizeW ~= td2.tensorSizeW then + res = res and false + end + if td1.padUp ~= td2.padUp then + res = res and false + end + if td1.padLeft ~= td2.padLeft then + res = res and false + end + if td1.tensor:storage() ~= td2.tensor:storage() then + res = res and false + end + if td1.tensor:storageOffset() ~= td2.tensor:storageOffset() then + res = res and false + end + return res +end + + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiledAsync, parent = + torch.class('nn.SpatialConvolutionFFTTiledAsync', + 'nn.SpatialConvolutionFFTTiled') + +function SpatialConvolutionFFTTiledAsync:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) +end + + +function SpatialConvolutionFFTTiledAsync:instUpdateOutputFFTImpl(input) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList) + assert(self.outputTensorList) + + local currentStream = 1 + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + for i = 1, self.outputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.inputTensorList[i].tensor:size() .. " vs " .. 
+ #input:size()) + errorIfNot(#self.outputTensorList[i].tensor:size() == #self.output:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList) + + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + -- Whatever the memory reuse policy, when tiling, we can reuse + -- the computed FFT(weight), this is one of the points of tiling + reuseList:append(self.FFTWeightBufferType) + end + local inputLocal = self.inputTensorList[i].tensor + local outputLocal = self.outputTensorList[i].tensor + local metaData = self.metaDataListUpdateOutput[i] + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#inputLocal:size() == cublasBatchDims + 2) + + local commonSize = self:prepareSizeAndBuffers( + inputLocal, self.weight, outputLocal, metaData) + + -- Run all under this currentStream + cutorch.setStream(currentStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fft(inputLocal, self.inputBuffer, cublasBatchDims) + if not reuseList or not reuseList:contains(self.FFTWeightBufferType) then + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + -- Since we're running async, everyone must wait on my mighty buffers + -- ############################################## + cutorch.streamBarrier(self.allStreams) + end + local norm = self:getNormalizationFactor(commonSize, inputLocal) + FBMMFFI.transposeMMFFI(cutorch._state, + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + self.outputBuffer:cdata(), + 1.0 / norm, + false, + true, + false) + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:ffti(outputLocal, self.outputBuffer, cublasBatchDims) + currentStream = currentStream % cutorch.getNumStreams() + 1 + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + cutorch.setStream(1) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + + +function SpatialConvolutionFFTTiledAsync:instUpdateGradInputFFTImpl( + input, gradOutput) + -- Make sure tiling information has been precomputed + assert(self.gradInputTensorList) + assert(self.gradOutputTensorList) + + local currentStream = 1 + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + for i = 1, self.gradInputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.gradInputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.gradInputTensorList[i].tensor:size() .. + " vs " .. 
#self.gradInput:size()) + errorIfNot( + #self.gradOutputTensorList[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + -- Need additional padding for circular symmetry in Fourier domain + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPaddingWithCircularSymmetry(i, self.tileSizeH, self.tileSizeW) + + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + -- Whatever the memory reuse policy, when tiling, we can reuse + -- the computed FFT(weight), this is one of the points of tiling + reuseList:append(self.FFTWeightBufferType) + end + + local inputLocal = self.gradInputTensorList[i].tensor + local outputLocal = self.gradOutputTensorList[i].tensor + local metaData = self.metaDataListUpdateGradInput[i] + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#inputLocal:size() == cublasBatchDims + 2) + + local commonSize = self:prepareSizeAndBuffers( + inputLocal, self.weight, outputLocal, metaData) + + -- Run all under this currentStream + cutorch.setStream(currentStream) + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(outputLocal, self.outputBuffer, cublasBatchDims) + if not reuseList or not reuseList:contains(self.FFTWeightBufferType) then + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + -- Since we're running async, everyone must wait on my mighty buffers + -- ############################################## + cutorch.streamBarrier(self.allStreams) + end + local norm = self:getNormalizationFactor(commonSize, outputLocal) + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.weightBuffer:cdata(), + self.inputBuffer:cdata(), + 1.0 / norm, + false, + false, + false) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:ffti(inputLocal, self.inputBuffer, cublasBatchDims) + currentStream = currentStream % cutorch.getNumStreams() + 1 + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.gradInput +end + + +function SpatialConvolutionFFTTiledAsync:instAccGradParametersFFTImpl( + input, gradOutput, scale) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList2) + assert(self.gradOutputTensorList2) + + -- At this point tiles / metadata for buffer management / reuse are available + local previousStream + local currentStream = 1 + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- Run ahead + cutorch.setStream(currentStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + previousStream, currentStream = + currentStream, currentStream % cutorch.getNumStreams() + 1 + + for i = 1, self.inputTensorList2:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList2[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.inputTensorList2[i].tensor:size() .. + " vs " .. 
#input:size()) + errorIfNot( + #self.gradOutputTensorList2[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList2) + + local firstWrite = (i == 1) + local lastWrite = (i == self.inputTensorList2:len()) + -- Interestingly, tiled input is reusable but has a long liveness + -- If we don't reuse it we can reclaim the memory for something else + -- This is all controlled by the bufferKey + -- local reuseList = List{} + local reuseList = List{} + if self.inputTensorList and -- not cleaned earlier -> may want to reuse + equalsTiledTensorDescriptor(self.inputTensorList[i], + self.inputTensorList2[i]) then + reuseList:append(self.FFTInputBufferType) + end + + local inputLocal = self.inputTensorList2[i].tensor + local outputLocal = self.gradOutputTensorList2[i].tensor + local metaData = self.metaDataListAccGrad[i] + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#inputLocal:size() == cublasBatchDims + 2) + + local commonSize = self:prepareSizeAndBuffers( + inputLocal, self.gradWeight, outputLocal, metaData) + + -- Run all under this currentStream + cutorch.setStream(currentStream) + if not reuseList or not reuseList:contains(self.FFTOutputBufferType) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(outputLocal, self.outputBuffer, cublasBatchDims) + else + error('UpdateGradInput and AccGradParameter tiled padded ' .. + 'gradOuput cannot be shared atm') + end + if not reuseList or not reuseList:contains(self.FFTInputBufferType) + then + -- Potentially reuse buffer if so told + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fft( + inputLocal, self.inputBuffer, cublasBatchDims) + end + + -- Because we accumulate into C, we must synchronize with the + -- previous transposeMMFFI call. We statically know by construction + -- that it leaves on previousStream and by transitivity of + -- dependences we're good to go + cutorch.streamWaitFor(currentStream, {previousStream}) + local lastWriteNorm = 1.0 + if lastWrite then + local norm = self:getNormalizationFactor(commonSize, outputLocal) + lastWriteNorm = (1.0 * scale) / norm + end + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + lastWriteNorm, + true, -- conjugate A + false, -- B + not firstWrite) -- accumulate into C + + -- 3. Accumulate in the frequency domain, IFFT on last write + if lastWrite then + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:ffti( + self.gradWeight, self.weightBuffer, cublasBatchDims) + end + + if self.printDebugLevel >= 3 then + print('Step ASYNC gradWeight: ', self.gradWeight) + end + previousStream, currentStream = + currentStream, currentStream % cutorch.getNumStreams() + 1 + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) +end diff --git a/fbcunn/SpatialConvolutionFFTTiledIterated.lua b/fbcunn/SpatialConvolutionFFTTiledIterated.lua new file mode 100644 index 0000000..77d1cca --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiledIterated.lua @@ -0,0 +1,231 @@ +-- Copyright 2004-present Facebook. 
All Rights Reserved. + +require 'cudnn' +local List = require 'pl.List' +local ffi = require 'ffi' + +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +local lib_name = 'torch_fb_fbcunn_FFTIteratedConvolution' +local lib_path = package.searchpath(lib_name, package.cpath) +local FFTIteratedConvolution = ffi.load(lib_path and lib_path or lib_name) + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiledIterated, parent = + torch.class('nn.SpatialConvolutionFFTTiledIterated', + 'nn.SpatialConvolutionFFTTiled') + +function SpatialConvolutionFFTTiledIterated:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + + -- Override any memory reuse scheme: just no reuse + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseNone} +end + +-- Adjustment needed for updateGradInput since we don't do circular +-- shifts in the Fourier domain, just shift in time. +local function buildTiledDeviceTensorFFI( + inputTensorList, outputTensorList, adjustInputShiftW, adjustInputShiftH) + local adjustInputShiftW = adjustInputShiftW or 0 + local adjustInputShiftH = adjustInputShiftH or 0 + local size = inputTensorList:len() + assert(outputTensorList:len() == size) + local inputTiledDeviceTensorFFI = + ffi.new("TiledDeviceTensorFFI[?]", size) + local outputTiledDeviceTensorFFI = + ffi.new("TiledDeviceTensorFFI[?]", size) + for i = 1, size do + inputTiledDeviceTensorFFI[i - 1].tensor = + inputTensorList[i].tensor:cdata() + inputTiledDeviceTensorFFI[i - 1].padL = + inputTensorList[i].padLeft + adjustInputShiftW + inputTiledDeviceTensorFFI[i - 1].padU = + inputTensorList[i].padUp + adjustInputShiftH + outputTiledDeviceTensorFFI[i - 1].tensor = + outputTensorList[i].tensor:cdata() + outputTiledDeviceTensorFFI[i - 1].padL = outputTensorList[i].padLeft + outputTiledDeviceTensorFFI[i - 1].padU = outputTensorList[i].padUp + end + return inputTiledDeviceTensorFFI, outputTiledDeviceTensorFFI, size +end + +function SpatialConvolutionFFTTiledIterated:instUpdateOutputFFTImpl(input) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList) + assert(self.outputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + local inputTiledDeviceTensorFFI, outputTiledDeviceTensorFFI, numTiles = + buildTiledDeviceTensorFFI(self.inputTensorList, self.outputTensorList) + + + for _, actualTileSize in ipairs({8, 16, 32}) do + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) and + self.tileSizeH <= actualTileSize and self.tileSizeW <= actualTileSize + then + -- Only do iterated convolutions if there is no reuse + self.output:zero() + -- ############################################## + cutorch.streamBarrier(self.allStreams) + local convolutionPassFFI = + ffi.new("FFTConvolutionPassFFI") + convolutionPassFFI.pass = convolutionPassFFI.FFT_UpdateOutput + + FFTIteratedConvolution.convolveIteratedFFI( + cutorch._state, + inputTiledDeviceTensorFFI, + self.weight:cdata(), + 
outputTiledDeviceTensorFFI, + numTiles, + actualTileSize, + convolutionPassFFI, + 1.0) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return self.output + end + end + + error('updateOutputIterated tiling by ' .. self.tileSizeW .. 'x' .. + self.tileSizeH .. ' not supported') +end + + + +function SpatialConvolutionFFTTiledIterated:instUpdateGradInputFFTImpl( + input, gradOutput) + -- Make sure tiling information has been precomputed + assert(self.gradInputTensorList) + assert(self.gradOutputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + local gradInputTiledDeviceTensorFFI, + gradOutputTiledDeviceTensorFFI, + numTiles = + buildTiledDeviceTensorFFI(self.gradInputTensorList, + self.gradOutputTensorList, + -- Adjust for no circular rotation in + -- Fourier domain + self.kW - 1, + self.kH - 1 + ) + + for _, actualTileSize in ipairs({8, 16, 32}) do + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) and + self.tileSizeH <= actualTileSize and self.tileSizeW <= actualTileSize + then + -- Only do iterated convolutions if there is not reuse + self.gradInput:zero() + -- ############################################## + cutorch.streamBarrier(self.allStreams) + local convolutionPassFFI = + ffi.new("FFTConvolutionPassFFI") + convolutionPassFFI.pass = convolutionPassFFI.FFT_UpdateGradInput + FFTIteratedConvolution.convolveIteratedFFI( + cutorch._state, + gradInputTiledDeviceTensorFFI, + self.weight:cdata(), + gradOutputTiledDeviceTensorFFI, + numTiles, + actualTileSize, + convolutionPassFFI, + 1.0) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return self.gradInput + end + end + + error('updateGradInputIterated tiling by ' .. self.tileSizeW .. 'x' .. + self.tileSizeH .. 
' not supported') +end + + +function SpatialConvolutionFFTTiledIterated:instAccGradParametersFFTImpl( + input, gradOutput, scale) + + -- Make sure tiling information has been precomputed + assert(self.inputTensorList2) + assert(self.gradOutputTensorList2) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + local inputTiledDeviceTensorFFI, + gradOutputTiledDeviceTensorFFI, + numTiles = + buildTiledDeviceTensorFFI(self.inputTensorList2, + self.gradOutputTensorList2) + + for _, actualTileSize in ipairs({8, 16, 32}) do + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) and + self.tileSizeH <= actualTileSize and self.tileSizeW <= actualTileSize + then + -- Only do iterated convolutions if there is no reuse + self.gradWeight:zero() + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- Run ahead + cutorch.setStream(1) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + cutorch.setStream(2) + local convolutionPassFFI = + ffi.new("FFTConvolutionPassFFI") + convolutionPassFFI.pass = convolutionPassFFI.FFT_AccGradParameters + FFTIteratedConvolution.convolveIteratedFFI( + cutorch._state, + inputTiledDeviceTensorFFI, + self.gradWeight:cdata(), + gradOutputTiledDeviceTensorFFI, + numTiles, + actualTileSize, + convolutionPassFFI, + scale) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return + end + end + + error('accGradParametersIterated tiling by ' .. self.tileSizeW .. 'x' .. + self.tileSizeH .. ' not supported') +end diff --git a/fbcunn/SpatialConvolutionFFTTiledSync.lua b/fbcunn/SpatialConvolutionFFTTiledSync.lua new file mode 100644 index 0000000..decac43 --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiledSync.lua @@ -0,0 +1,247 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
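All of these tiled modules (synchronous, asynchronous and iterated) share the same overlap-save geometry: for a stride-1 convolution with a kernel of size k, a tile of size tileSize yields tileSize - k + 1 valid outputs, so input tiles advance by that step and overlap by k - 1, while output tiles are disjoint, exactly as in the TiledView2D calls above. The small helper below spells out that arithmetic; the function name is illustrative and not part of the patch.

-- Overlap-save tile geometry along one dimension, stride-1 convolution.
-- inputSize, padBefore, padAfter : logical input extent and its zero padding
-- tileSize, k                    : FFT tile size and kernel size (tileSize >= k)
local function tileGeometry1D(inputSize, padBefore, padAfter, tileSize, k)
   assert(tileSize >= k, "tile must be at least as large as the kernel")
   local step      = tileSize - k + 1            -- valid outputs produced per tile
   local overlap   = k - 1                       -- input overlap between adjacent tiles
   local outputLen = inputSize + padBefore + padAfter - k + 1
   local numTiles  = math.ceil(outputLen / step) -- disjoint output tiles covering outputLen
   return step, overlap, numTiles
end

-- e.g. a 128-wide map, 3x3 kernel, 16-point tiles: step 14, overlap 2, 10 tiles
-- print(tileGeometry1D(128, 1, 1, 16, 3))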
+ +require 'cudnn' +local List = require 'pl.List' +local ffi = require 'ffi' +local ConvolutionBiasFFI = ffi.load('torch_fb_fbcunn_convolution_bias') + +local function errorIf(cond, msg) + if cond then + error(msg) + end +end + +local function errorIfNot(cond, msg) + errorIf(not cond, msg) +end + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiledSync, parent = + torch.class('nn.SpatialConvolutionFFTTiledSync', + 'nn.SpatialConvolutionFFTTiled') + +function SpatialConvolutionFFTTiledSync:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + + -- Override any memory reuse scheme: just no reuse + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseNone} +end + +function SpatialConvolutionFFTTiledSync:instUpdateOutputFFTImpl(input) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList) + assert(self.outputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + -- Push / pop the local tensor, we're calling a parent in sync mode + local saveOutput = self.output + for i = 1, self.outputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.inputTensorList[i].tensor:size() .. " vs " .. + #input:size()) + errorIfNot(#self.outputTensorList[i].tensor:size() == #self.output:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList) + + -- Even in the absence of reuse we can compute the weight buffers only + -- once. 
This is one of the points of tiling in the first place + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + reuseList:append(self.FFTWeightBufferType) + end + self.output = self.outputTensorList[i].tensor + -- Go up 2 levels, 'cast' as SpatialConvolutionFBFFT + nn.SpatialConvolutionFBFFT.updateOutputFFTImpl( + self, + self.inputTensorList[i].tensor, + reuseList, + self.metaDataListUpdateOutput[i]) + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + self.output = saveOutput + -- ############################################## + cutorch.streamBarrier(self.allStreams) + cutorch.setStream(1) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + + + +function SpatialConvolutionFFTTiledSync:instUpdateGradInputFFTImpl( + input, gradOutput) + -- Make sure tiling information has been precomputed + assert(self.gradInputTensorList) + assert(self.gradOutputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + -- Push / pop the local tensor, we're calling a parent in sync mode + local saveGradInput = self.gradInput + for i = 1, self.gradInputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.gradInputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.gradInputTensorList[i].tensor:size() .. + " vs " .. #self.gradInput:size()) + errorIfNot( + #self.gradOutputTensorList[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + -- Need additional padding for circular symmetry in Fourier domain + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPaddingWithCircularSymmetry(i, self.tileSizeH, self.tileSizeW) + + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + reuseList:append(self.FFTWeightBufferType) + end + + self.gradInput = self.gradInputTensorList[i].tensor + -- Go up 2 levels, 'cast' as SpatialConvolutionFBFFT + nn.SpatialConvolutionFBFFT.updateGradInputFFTImpl( + self, + self.gradInput, -- used only as model + self.gradOutputTensorList[i].tensor, + -- weight buffers can always be reused + -- since we enforce that tiles are larger + -- than weights + reuseList, + self.metaDataListUpdateGradInput[i]) + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + self.gradInput = saveGradInput + return self.gradInput +end + + +function SpatialConvolutionFFTTiledSync:instAccGradParametersFFTImpl( + input, gradOutput, scale) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList2) + assert(self.gradOutputTensorList2) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- Run ahead + local currentStream = 0 + cutorch.setStream(currentStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + for i = 1, self.inputTensorList2:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList2[i].tensor:size() == #input:size(), + 
"Tensor size mismatch: " .. + #self.inputTensorList2[i].tensor:size() .. + " vs " .. #input:size()) + errorIfNot( + #self.gradOutputTensorList2[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList2) + + local firstWrite = (i == 1) + local lastWrite = (i == self.inputTensorList2:len()) + + -- We accumulate in this thing, make sure it is zero + self.gradWeight:zero() + self.gradBias:zero() + + if firstWrite then + self.gradWeightAcc = self.gradWeight:clone() + self.gradBiasAcc = self.gradBias:clone() + end + + -- Can't reuse tiled gradOutput without extra work + errorIf(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput), + "Reuse output in tiled accGradParameters is not supproted") + + if self.printDebugLevel >= 3 then + print('Pre step synchronous gradWeight @', + self.gradWeight:cdata(), ': ', self.gradWeight:float()) + end + + -- Go up 2 levels, 'cast' as SpatialConvolutionFBFFT + nn.SpatialConvolutionFBFFT.accGradParametersFFTImpl( + self, + self.inputTensorList2[i].tensor, + self.gradOutputTensorList2[i].tensor, + scale, + List{}, -- reuseList + self.metaDataListAccGrad[i]) + + -- Super heavy, need to clear this up + -- ############################################## + cutorch.streamBarrier(self.allStreams) + self.gradWeightAcc:add(self.gradWeight) + self.gradBiasAcc:add(self.gradBias) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + if self.printDebugLevel >= 3 then + print('Step synchronous gradWeight @', + self.gradWeight:cdata(), ': ', self.gradWeight:float()) + end + + if lastWrite then + self.gradWeight:copy(self.gradWeightAcc) + self.gradBias:copy(self.gradBiasAcc) + end + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) +end diff --git a/fbcunn/TemporalKMaxPooling.lua b/fbcunn/TemporalKMaxPooling.lua index 2fe82ff..34fb88d 100644 --- a/fbcunn/TemporalKMaxPooling.lua +++ b/fbcunn/TemporalKMaxPooling.lua @@ -1,5 +1,11 @@ -- Copyright 2004-present Facebook. All Rights Reserved. +-- TemporalKmaxPooling +-- Input : (bsize x) width x height +-- Output : (bisze x) k_out x height +-- with k_out = max(k_out_prop, inputSeqLen) +-- where k_out_prop = max(k, ceil(k_dynamic*inputSeqLen)) + require 'cutorch' require 'nn' @@ -10,8 +16,21 @@ function TemporalKMaxPooling:__init(k, k_dynamic) parent.__init(self) self.k = k + if k_dynamic then + assert(k_dynamic <= 1 and k_dynamic >=0, + 'k_dynamic must be between 0 and 1') + end self.k_dynamic = k_dynamic or -1 + -- k_dynamic is an optional scalar parameter between 0 and 1 + -- that makes k a fraction of the input sequence size. + + -- To follow Kalchbrenner et al's architecture on Dynamic k-Max Pooling: + -- Use (k = k_top, kDynamic = (L - l)/L), with + -- L : total number of conv layers, + -- l : current convolutional layer to which the pooling is applied, + -- k_top : fixed pooling parameter for the topmost convolutional layer. 
+ self.output = torch.CudaTensor() self.gradInput = torch.CudaTensor() self.indices = torch.CudaTensor() diff --git a/fbcunn/init.lua b/fbcunn/init.lua index 67ab302..913cae9 100644 --- a/fbcunn/init.lua +++ b/fbcunn/init.lua @@ -2,20 +2,31 @@ require 'nn' require 'fbnn' require 'cunn' require 'libfbcunn' -require 'libfbcunnlayers' +require 'fbcunn.cuda_ext' include('AbstractParallel.lua') +include('BatchNormalization.lua') include('CuBLASWrapper.lua') include('DataParallel.lua') include('FeatureLPPooling.lua') include('FFTWrapper.lua') --- include('HalfPrecision.lua') +include('HalfPrecision.lua') include('LookupTableGPU.lua') include('ModelParallel.lua') include('OneBitDataParallel.lua') include('OneBitQuantization.lua') include('OneBitSGD.lua') -include('SpatialConvolutionCuFFT.lua') +include('FFTCDefs.lua') +include('SpatialBatchNormalization.lua') +-- include('SpatialConvolutionFFT.lua') +-- include('SpatialConvolutionCuFFT.lua') +-- include('SpatialConvolutionFBFFT.lua') +-- include('SpatialConvolutionFBFFTGemm.lua') +-- include('SpatialConvolutionFFTTiled.lua') +-- include('SpatialConvolutionFFTTiledSync.lua') +-- include('SpatialConvolutionFFTTiledAsync.lua') +-- include('SpatialConvolutionFFTTiledIterated.lua') +-- include('SpatialConvolution.lua') include('TemporalConvolutionFB.lua') include('TemporalKMaxPooling.lua') @@ -65,11 +76,11 @@ function nn.Module:getParametersByDevice() return nil end if dev == 0 then - return nn.Module._gather(params), nn.Module._gather(grads) + return nn.Module.flatten(params), nn.Module.flatten(grads) end return cutorch.withDevice(dev, - function() return nn.Module._gather(params), - nn.Module._gather(grads) + function() return nn.Module.flatten(params), + nn.Module.flatten(grads) end) end diff --git a/src/BLASParameters.cpp b/src/BLASParameters.cpp index f6b4c7b..6b4e164 100644 --- a/src/BLASParameters.cpp +++ b/src/BLASParameters.cpp @@ -16,8 +16,11 @@ std::ostream& operator<<(ostream& os, const BLASParameters& params) { os << " batchStepC = " << params.batchStepC; os << " #handles = " << params.handles.size(); os << " #streams = " << params.streams.size(); - os << " transposeA = " << (params.transposeA == CUBLAS_OP_T); - os << " transposeB = " << (params.transposeB == CUBLAS_OP_T); + os << " transposeA = " << ((params.transposeA == CUBLAS_OP_T) ? "t " : + (params.transposeA == CUBLAS_OP_C) ? "c " : "n"); + os << " transposeB = " << ((params.transposeB == CUBLAS_OP_T) ? "t " : + (params.transposeB == CUBLAS_OP_C) ? "c " : "n"); + os << " scale = (" << params.scaleRe << ", " << params.scaleIm << ")"; return os; } diff --git a/src/BLASParameters.h b/src/BLASParameters.h index b9890c4..abe06b6 100644 --- a/src/BLASParameters.h +++ b/src/BLASParameters.h @@ -33,12 +33,14 @@ struct BLASParameters { iterDims = i; return *this; } + // After iterDims, remaining outermost dimensions to be treated as batch // dimensions, for instance, in a gemmbatched call. BLASParameters& withBatchDims(int i) { batchDims = i; return *this; } + // Force running on a particular handle / stream index in the handle / // stream vectors. The actual handle / stream we will end up running on is // recovered by modulo indexing into the vector, default handle / stream if @@ -47,6 +49,7 @@ struct BLASParameters { resourceIndex = i; return *this; } + // Distance between two batches of A, used in batched mode, in case we want // to compute one entry every k. Step of zerom means the same matrix A will // be read over and over again. 
@@ -54,6 +57,7 @@ struct BLASParameters { batchStepA = i; return *this; } + // Distance between two batches of B, used in batched mode, in case we want // to compute one entry every k. Step of zerom means the same matrix B will // be read over and over again. @@ -61,6 +65,7 @@ struct BLASParameters { batchStepB = i; return *this; } + // Distance between two batches of C, used in batched mode, in case we want // to compute one entry every k. Step of zerom means the same matrix C will // be written over and over again. @@ -68,47 +73,69 @@ struct BLASParameters { batchStepC = i; return *this; } + // Sets real scale in C += alpha * C + scale * A * B BLASParameters& withScaleReal(float f) { scaleRe = f; return *this; } + // Sets imaginary scale in C += alpha * C + scale * A * B BLASParameters& withScaleImaginary(float f) { scaleIm = f; return *this; } + // Use cgemm instead of sgemm BLASParameters& withComplex(bool b) { asComplex = b; return *this; } + // If true, computes C += scale * A * B. Default is C = scale * A * B. BLASParameters& withAccumulate(bool b) { accumulate = b; return *this; } + // Set vector of handle resources BLASParameters& withHandles(const std::vector& h) { handles = h; return *this; } + // Set vector of stream resources BLASParameters& withStreams(const std::vector& s) { streams = s; return *this; } + // Transpose A BLASParameters& withTransposeA(cublasOperation_t t) { transposeA = t; return *this; } + // Transpose B BLASParameters& withTransposeB(cublasOperation_t t) { transposeB = t; return *this; } + // Transpose A + BLASParameters& withTransposeA(char c) { + transposeA = (c == 't') ? CUBLAS_OP_T : + ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N); + return *this; + } + + // Transpose B + BLASParameters& withTransposeB(char c) { + transposeB = (c == 't') ? CUBLAS_OP_T : + ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N); + return *this; + } + unsigned int iterDims; unsigned int batchDims; unsigned int resourceIndex; diff --git a/src/BatchNormalization.cu b/src/BatchNormalization.cu new file mode 100644 index 0000000..2a40d7d --- /dev/null +++ b/src/BatchNormalization.cu @@ -0,0 +1,460 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
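In training mode, the kernels in this new file compute, per feature: the batch mean, the (biased) inverse standard deviation 1 / sqrt(var + epsilon), exponential running estimates of both, and an optionally affine-scaled normalized output. The plain-Torch sketch below, written for a 2-D batch x feature input, is an editorial reference for reading the CUDA code that follows; the function and variable names are illustrative and not part of this patch.

-- CPU reference for the training path of BatchNormalizationUpdateOutput (2-D input).
local function batchNormUpdateOutputRef(input, runningMean, runningStd,
                                        weight, bias, epsilon, momentum)
   local mean     = input:mean(1)                            -- 1 x nFeature
   local centered = input - mean:expandAs(input)
   local var      = torch.cmul(centered, centered):mean(1)   -- biased variance (1/N)
   local invStd   = torch.pow(var + epsilon, -0.5)           -- the kernel stores 1 / stddev
   runningMean:mul(1 - momentum):add(momentum, mean:squeeze())
   runningStd:mul(1 - momentum):add(momentum, invStd:squeeze())
   local output = torch.cmul(centered, invStd:expandAs(input))
   if weight and bias then                                    -- affine == true
      output:cmul(weight:view(1, -1):expandAs(output))
      output:add(bias:view(1, -1):expandAs(output))
   end
   return output
end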
+ +#include "src/DeviceTensorUtils.h" +#include "THCTensor.h" + +#include "cuda/CudaUtils.cuh" +#include "cuda/DeviceTensor.cuh" +#include "cuda/MemoryAccess.cuh" +#include "cuda/util/CachedDeviceProperties.h" + +#define ENABLE_CUDA_DEBUG +#include "cuda/CudaDebugUtils.cuh" + +#include +#include + +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +#define LOG_TARGET VLOG(1) // LOG(INFO) + +template +__global__ void BatchNormalizationUpdateOutputInferenceUnrolled_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias) { + + static_assert(std::is_same::value , "type"); + + auto batch = blockIdx.y; + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= input.getSize(1)) { + return; + } + + // stddev is actually 1 / stddev + ComputeT stddev = runningStddev[x].ldg(); + ComputeT mean = runningMean[x].ldg(); + ComputeT inp = input[batch][x].ldg(); + if (affine) { + // multiply with gamma and add beta + // TODO: everyone pulling this, optimize by reusing better + ComputeT beta = bias[x].ldg(); + ComputeT gamma = weight[x].ldg(); + output[batch][x] = gamma * (inp - mean) * (stddev) + beta; + } else { + output[batch][x] = (inp - mean) * (stddev); + } +} + +template +__global__ void BatchNormalizationUpdateOutput_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum) { + + static_assert(std::is_same::value , "type"); + + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= output.getSize(1)) { + return; + } + + ComputeT norm = (ComputeT)1 / input.getSize(0); + + ComputeT batchMean = (ComputeT)0; + for (auto batch = 0; batch < output.getSize(0); ++batch) { + ComputeT b = input[batch][x].ldg(); + batchMean += b; + } + batchMean *= norm; + runningMean[x] = (1 - momentum) * runningMean[x] + momentum * batchMean; + + ComputeT stdMean = (ComputeT)0; + for (auto batch = 0; batch < output.getSize(0); ++batch) { + ComputeT inp = input[batch][x].ldg() ; + centered[batch][x] = inp - batchMean; + stdMean += (inp - batchMean) * (inp - batchMean); + } + stdMean = 1 / sqrt(stdMean * norm + epsilon); + + std[x] = stdMean; + runningStddev[x] = (1 - momentum) * runningStddev[x] + momentum * stdMean; + + for (auto batch = 0; batch < output.getSize(0); ++batch) { + output[batch][x] = centered[batch][x] * stdMean; + normalized[batch][x] = centered[batch][x] * stdMean; + if (affine) { + ComputeT beta = bias[x]; + ComputeT gamma = weight[x]; + output[batch][x] = gamma * output[batch][x] + beta; + } + } +} + + +template +void BatchNormalizationUpdateOutput( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + static_assert(ImageDims == 0, "ImageDims == 0 only atm"); + + dim3 threads(128); + // auto prop = getCurrentDeviceProperties(); + if (!train) { + dim3 blocks(ceil(input.getSize(1), 128), input.getSize(0)); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << 
threads.z; + BatchNormalizationUpdateOutputInferenceUnrolled_kernel + + <<>> + (input, output, runningMean, runningStddev, weight, bias); + } else { + dim3 blocks(ceil(input.getSize(1), 128)); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + BatchNormalizationUpdateOutput_kernel + <<>>(input, + output, + centered, + std, + normalized, + runningMean, + runningStddev, + weight, + bias, + epsilon, + momentum); + } + +} + +extern "C" void BatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine) +{ + // The BatchNormalization lua module is designed for + // 2-D only: batch, plane + constexpr int BatchDims = 2; + constexpr int ImageDims = 0; + typedef double ComputeT; + if (!train) { + if (!affine) { + // Collapse + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + // Collapse + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } else { + if (!affine) { + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void BatchNormalizationUpdateGradInput_kernel( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight) { + + static_assert(std::is_same::value , "type"); + + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= gradOutput.getSize(1)) { + return; + } + + ComputeT norm = (ComputeT)1 / gradInput.getSize(0); + ComputeT gradMean = (ComputeT)0; + ComputeT centeredGradMean = (ComputeT)0; + for (auto batch = 0; batch < gradOutput.getSize(0); ++batch) { + ComputeT g = gradOutput[batch][x].ldg(); + ComputeT c = centered[batch][x].ldg(); + gradMean += g; + centeredGradMean += c * g; + } + gradMean *= norm; + centeredGradMean 
*= norm; + + ComputeT stdVal = std[x]; + ComputeT weightVal = (ComputeT)0; + if (affine) { + weightVal = weight[x]; + } + for (auto batch = 0; batch < gradOutput.getSize(0); ++batch) { + if (affine) { + gradInput[batch][x] = + ( + - centeredGradMean * centered[batch][x] * stdVal * stdVal + + gradOutput[batch][x] + - gradMean + ) * stdVal * weightVal; + } else { + gradInput[batch][x] = + ( + - centeredGradMean * centered[batch][x] * stdVal * stdVal + + gradOutput[batch][x] + - gradMean + ) * stdVal; + } + } +} + +template +void BatchNormalizationUpdateGradInput( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + static_assert(ImageDims == 0, "ImageDims == 0 only atm"); + + dim3 blocks(ceil(gradOutput.getSize(1), 128)); + dim3 threads(128); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + BatchNormalizationUpdateGradInput_kernel + <<>>(gradInput, + gradOutput, + centered, + std, + weight); +} + +extern "C" void BatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine) { + + // The BatchNormalization lua module is designed for + // 2-D only: batch, plane + constexpr int BatchDims = 2; + constexpr int ImageDims = 0; + typedef double ComputeT; + if (!affine) { + // Collapse + BatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + DeviceTensor(), + THCState_getCurrentStream(state) + ); + } else { + // Collapse + BatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, weight), + THCState_getCurrentStream(state) + ); + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void BatchNormalizationAccGradParameters_kernel( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale) +{ + + static_assert(std::is_same::value , "type"); + + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= gradOutput.getSize(1)) { + return; + } + + ComputeT gradMean = (ComputeT)0; + ComputeT normalizedGradMean = (ComputeT)0; + for (auto batch = 0; batch < gradOutput.getSize(0); ++batch) { + ComputeT g = gradOutput[batch][x].ldg(); + ComputeT n = normalized[batch][x].ldg(); + gradMean += g; + normalizedGradMean += n * g; + } + gradBias[x] += scale * gradMean; + gradWeight[x] += scale * normalizedGradMean; +} + +template +void BatchNormalizationAccGradParameters( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + static_assert(ImageDims == 0, "ImageDims == 0 only atm"); + + dim3 blocks(ceil(gradOutput.getSize(1), 128)); + dim3 threads(128); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + BatchNormalizationAccGradParameters_kernel + <<>>(gradOutput, + normalized, + gradWeight, + gradBias, + scale); + +} + 
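For reference (an editorial gloss, not part of the patch): with m the batch size, g = gradOutput, and std holding the reciprocal standard deviation s = 1/sqrt(sigma^2 + epsilon) as the comments above note, the kernels in this new file implement the standard batch-norm equations:

  \hat{x}_i = (x_i - \mu)\,s, \qquad y_i = \gamma\,\hat{x}_i + \beta \quad (\text{or } y_i = \hat{x}_i \text{ when affine is false})

  \frac{\partial L}{\partial x_i} = \gamma\,s\Big(g_i - \tfrac{1}{m}\sum_j g_j - \hat{x}_i\,\tfrac{1}{m}\sum_j g_j\,\hat{x}_j\Big)

  \Delta\gamma \mathrel{+}= \mathrm{scale}\sum_i g_i\,\hat{x}_i, \qquad \Delta\beta \mathrel{+}= \mathrm{scale}\sum_i g_i

with the gamma factor dropped from the gradInput expression in the non-affine case, and running statistics blended as (1 - momentum) * running + momentum * batch (note that runningStddev stores the reciprocal standard deviation).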
+extern "C" void BatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale) { + // The BatchNormalization lua module is designed for + // 2-D only: batch, plane + constexpr int BatchDims = 2; + constexpr int ImageDims = 0; + typedef double ComputeT; + // Collapse + BatchNormalizationAccGradParameters + + ( + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, gradWeight), + torchToDeviceTensor(state, gradBias), + scale, + THCState_getCurrentStream(state) + ); + + THCudaCheck(cudaGetLastError()); +} + + +}}} diff --git a/src/ConvolutionBias.cu b/src/ConvolutionBias.cu index f738b9d..37ebb6e 100644 --- a/src/ConvolutionBias.cu +++ b/src/ConvolutionBias.cu @@ -1,19 +1,19 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "ConvolutionBias.cuh" +#include "src/ConvolutionBias.cuh" #include "cuda/ComputeCapabilities.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" #include "cuda/WarpReductions.cuh" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" #include #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; // This layer computes the following: // @@ -252,4 +252,31 @@ accGradParametersTemporalBias(THCState* state, 0, THCState_getCurrentStream(state)>>>(gradBias, output, biasScale); } + +extern "C" void updateOutputBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* biasTH) { + updateOutputBias(state, outputTH, biasTH); +} + +extern "C" void updateOutputTemporalBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* biasTH) { + updateOutputTemporalBias(state, outputTH, biasTH); +} + +extern "C" void accGradParametersBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* gradBiasTH, + float biasScale) { + accGradParametersBias(state, outputTH, gradBiasTH, biasScale); +} + +extern "C" void accGradParametersTemporalBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* gradBiasTH, + float biasScale) { + accGradParametersTemporalBias(state, outputTH, gradBiasTH, biasScale); +} + } } } } // namespace diff --git a/src/CrossMapNormalization.cu b/src/CrossMapNormalization.cu index 9a69f7d..db53866 100644 --- a/src/CrossMapNormalization.cu +++ b/src/CrossMapNormalization.cu @@ -3,7 +3,7 @@ * @author Tudor Bosman (tudorb@fb.com) */ -#include "CrossMapNormalization.cuh" +#include "src/CrossMapNormalization.cuh" namespace facebook { namespace deeplearning { namespace torch { diff --git a/src/CrossMapNormalizationHost.cpp b/src/CrossMapNormalizationHost.cpp index 535b1b6..43ea2d4 100644 --- a/src/CrossMapNormalizationHost.cpp +++ b/src/CrossMapNormalizationHost.cpp @@ -4,8 +4,8 @@ */ #include "THC.h" -#include "CrossMapNormalization.cuh" -#include "Utils.h" +#include "src/CrossMapNormalization.cuh" +#include "src/Utils.h" #include #include diff --git a/src/CuBLASWrapper.cpp b/src/CuBLASWrapper.cpp index 4339eb2..f5f0065 100644 --- a/src/CuBLASWrapper.cpp +++ b/src/CuBLASWrapper.cpp @@ -1,10 +1,10 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "CuBLASWrapper.h" +#include "src/CuBLASWrapper.h" #include "cuda/DeviceTensor.cuh" #include "THCTensor.h" -#include "BLASParameters.h" +#include "src/BLASParameters.h" #include #include @@ -33,8 +33,9 @@ const cuFloatComplex kOneComplex = make_cuComplex(1.0f, 0.0f); template void transpose(const DeviceTensor& in, DeviceTensor& out, - int sep, + int separator, bool asComplex, + bool transposeMetaData, cublasHandle_t handle, cudaStream_t stream) { cublasHandle_t localHandle; @@ -55,18 +56,35 @@ void transpose(const DeviceTensor& in, CHECK_EQ(true, in.isContiguousDim(i)) << "Not contiguous dim = " << i; CHECK_EQ(true, out.isContiguousDim(i)) << "Not contiguous dim = " << i; } - for (int i = 0; i < Dim; ++i) { - CHECK_EQ(in.getSize(i), out.getSize(i)) << - "Not eq dim = " << i << " in = " << in << " out = " << out; + if (transposeMetaData) { + for (int i = 0; i < Dim; ++i) { + CHECK_EQ(in.getSize(i), out.getSize(i)) << + "Not eq dim = " << i << " in = " << in << " out = " << out; + } + } else { + auto upper = (asComplex) ? Dim - 2 : Dim - 1; + if (!asComplex) { + for (int i = 0; i < separator; ++i) { + CHECK_EQ(in.getSize(i), out.getSize(Dim - separator + i)) << + "Not eq dim, in(" << i << ") = " << in << " out(" << + (Dim - separator + i) << ") = " << out; + } + for (int i = separator; i < upper; ++i) { + CHECK_EQ(in.getSize(i), out.getSize(i - separator)) << + "Not eq dim, in(" << i << ") = " << in << " out(" << + (i - separator) << ") = " << out; + } + } } + int rows = 1; - for (int i = 0; i < sep; ++i) { + for (int i = 0; i < separator; ++i) { rows *= in.getSize(i); } int cols = 1; - for (int i = sep; i < Dim; ++i) { + for (int i = separator; i < Dim; ++i) { cols *= in.getSize(i); } @@ -122,32 +140,33 @@ void transpose(const DeviceTensor& in, } CHECK_EQ(CUBLAS_STATUS_SUCCESS, res); - // Permute the sizes to keep the CudaTensor consistent. - // This only works because all dims are contiguous. - std::vector permDims; - permDims.reserve(Dim); - if (!asComplex) { - // Non-complex case is easy - for (int i = sep; i < Dim; ++i) { - permDims.push_back(i); + if (transposeMetaData) { + // Permute the sizes to keep the CudaTensor consistent. + // This only works because all dims are contiguous. 
+ std::vector permDims; + permDims.reserve(Dim); + if (!asComplex) { + // Non-complex case is easy + for (int i = separator; i < Dim; ++i) { + permDims.push_back(i); + } + for (int i = 0; i < separator; ++i) { + permDims.push_back(i); + } + } else { + // Complex case is trickier since it is float[2] that must stay in + // horizontal order whatever happens + for (int i = separator; i < Dim - 1; ++i) { + permDims.push_back(i); + } + for (int i = 0; i < separator; ++i) { + permDims.push_back(i); + } + permDims.push_back(Dim - 1); } - for (int i = 0; i < sep; ++i) { - permDims.push_back(i); - } - } else { - // Complex case is trickier since it is float[2] that must stay in - // horizontal order whatever happens - for (int i = sep; i < Dim - 1; ++i) { - permDims.push_back(i); - } - for (int i = 0; i < sep; ++i) { - permDims.push_back(i); - } - permDims.push_back(Dim - 1); + out.permuteDims(permDims); } - out.permuteDims(permDims); - THCudaCheck(cudaGetLastError()); CHECK_EQ(CUBLAS_STATUS_SUCCESS, res); } @@ -155,25 +174,28 @@ void transpose(const DeviceTensor& in, template void transposeAsComplex(const DeviceTensor& in, DeviceTensor& out, - int sep, + int separator, + bool transposeMetaData, cublasHandle_t handle, cudaStream_t stream) { - transpose(in, out, sep, true, handle, stream); + transpose(in, out, separator, true, transposeMetaData, handle, stream); } #define TRANSPOSE_INSTANTIATION(DIM) \ template void transpose(const DeviceTensor& in, \ DeviceTensor& out, \ - int sep, \ + int separator, \ bool asComplex, \ + bool transposeMetaData, \ cublasHandle_t handle, \ cudaStream_t stream); -#define TRANSPOSE_AS_COMPLEX_INSTANTIATION(DIM) \ +#define TRANSPOSE_AS_COMPLEX_INSTANTIATION(DIM) \ template void transposeAsComplex(const DeviceTensor& in, \ - DeviceTensor& out, \ - int sep, \ - cublasHandle_t handle, \ + DeviceTensor& out, \ + int separator, \ + bool transposeMetaData, \ + cublasHandle_t handle, \ cudaStream_t stream); TRANSPOSE_INSTANTIATION(2); @@ -516,11 +538,11 @@ struct matmultBatchedStruct { #define BATCHEDMM_TAIL_INSTANTIATION(DIM1, DIM2) \ template <> \ struct matmultBatchedStruct { \ - void run(DeviceTensor& C, \ - DeviceTensor& A, \ - DeviceTensor& B, \ + void run(DeviceTensor& C, \ + DeviceTensor& A, \ + DeviceTensor& B, \ const BLASParameters& params) { \ - throw invalid_argument("BatchedMM needs at least 3 dimensions"); \ + THError("BatchedMM needs at least 3 dimensions"); \ } \ } \ @@ -586,7 +608,7 @@ void matmultBatched(DeviceTensor& C, matmultBatchedStruct().run(C, A, B, params); break; default: - throw invalid_argument("At most 2 outer sequential dimensions supported"); + THError("At most 2 outer sequential dimensions supported"); }; } @@ -628,15 +650,15 @@ struct matmultIterStruct { } }; -#define ITERATEDMM_TAIL_INSTANTIATION(DIM1, DIM2) \ - template <> \ - struct matmultIterStruct { \ +#define ITERATEDMM_TAIL_INSTANTIATION(DIM1, DIM2) \ + template <> \ + struct matmultIterStruct { \ void run(DeviceTensor& C, \ DeviceTensor& A, \ DeviceTensor& B, \ - const BLASParameters& params) { \ - CHECK(false) << "Should not be here"; \ - } \ + const BLASParameters& params) { \ + CHECK(false) << "Should not be here"; \ + } \ }; ITERATEDMM_TAIL_INSTANTIATION(3, 1); @@ -676,15 +698,14 @@ void matmultIter(DeviceTensor& C, break; default: - throw invalid_argument( - "At most 2 outer sequential and 2 batch dimensions supported"); + THError("At most 2 outer sequential and 2 batch dimensions supported"); }; } #define MATMULT_ITER_INSTANTIATION(DIM) \ - template void 
matmultIter(DeviceTensor& C, \ - DeviceTensor& A, \ - DeviceTensor& B, \ + template void matmultIter(DeviceTensor& C, \ + DeviceTensor& A, \ + DeviceTensor& B, \ const BLASParameters& params); MATMULT_ITER_INSTANTIATION(2); diff --git a/src/CuBLASWrapper.h b/src/CuBLASWrapper.h index 8df3763..434251c 100644 --- a/src/CuBLASWrapper.h +++ b/src/CuBLASWrapper.h @@ -3,7 +3,7 @@ #include "cuda/DeviceTensor.cuh" -#include "BLASParameters.h" +#include "src/BLASParameters.h" #include "cublas_v2.h" #include @@ -15,9 +15,10 @@ namespace facebook { namespace deeplearning { namespace torch { // // This transposition wrapper implements quick device-side transpositions. // Consider tensor dimensions are collapsed into a 2-D 'y'-by-'x'. -// The wrapper takes a sep integer and considers dimensions (0 .. sep - 1) as -// being collapsed to form the 'y' dimension. Dimensions (sep .. Dim - 1) -// are collapsed to form the 'x' dimension. +// The wrapper takes a separator integer and considers dimensions +// (0 .. separator - 1) as being collapsed to form the 'y' +// dimension. Dimensions (separator .. Dim - 1) are collapsed to form the 'x' +// dimension. // // The complex case is a bit trickier since Torch does not natively support // complex numbers, we emulate them with float[2]. In that case, 'x' is @@ -33,15 +34,17 @@ namespace facebook { namespace deeplearning { namespace torch { template void transpose(const cuda::DeviceTensor& in, cuda::DeviceTensor& out, - int sep, + int separator, bool asComplex = false, + bool transposeMetaData = true, cublasHandle_t handle = NULL, cudaStream_t stream = NULL); template void transposeAsComplex(const cuda::DeviceTensor& in, cuda::DeviceTensor& out, - int sep, + int separator, + bool transposeMetaData = true, cublasHandle_t handle = NULL, cudaStream_t stream = NULL); diff --git a/src/CuBLASWrapperLua.cpp b/src/CuBLASWrapperLua.cpp index 8133ee3..f544b1f 100644 --- a/src/CuBLASWrapperLua.cpp +++ b/src/CuBLASWrapperLua.cpp @@ -1,12 +1,12 @@ // Copyright 2014 Facebook #include "cuda/KernelTimer.h" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" #include "THCTensor.h" -#include "CuBLASWrapper.h" -#include "util/Misc.h" +#include "src/CuBLASWrapper.h" #include #include @@ -17,30 +17,33 @@ #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; using namespace std; namespace facebook { namespace deeplearning { namespace torch { namespace { -#define MATMULT_CASE(DIM) \ - case DIM: \ - CHECK_EQ(DIM, iterDims + batchDims + 2); \ - { \ - DeviceTensor A = torchToDeviceTensor(state, thA); \ - DeviceTensor B = torchToDeviceTensor(state, thB); \ - DeviceTensor C = torchToDeviceTensor(state, thC); \ - matmultIter(C, A, B, params); \ - } \ +#define LOG_TARGET VLOG(3) + +#define MATMULT_CASE(DIM) \ + case DIM: \ + CHECK_EQ(DIM, iterDims + batchDims + 2 + ((asComplex) ? 
1 : 0)); \ + { \ + DeviceTensor A = torchToDeviceTensor(state, thA); \ + DeviceTensor B = torchToDeviceTensor(state, thB); \ + DeviceTensor C = torchToDeviceTensor(state, thC); \ + matmultIter(C, A, B, params); \ + } \ break; int matmult(lua_State* L, bool asComplex = false) { THCState* state = getCutorchState(L); + auto transA = luaT_getfieldcheckstring(L, 1, "transA"); + auto transB = luaT_getfieldcheckstring(L, 1, "transB"); auto iterDims = luaT_getfieldcheckint(L, 1, "iterDims"); auto batchDims = luaT_getfieldcheckint(L, 1, "batchDims"); - auto numHandles = luaT_getfieldcheckint(L, 1, "handles"); - auto numStreams = luaT_getfieldcheckint(L, 1, "streams"); + auto scale = luaT_getfieldchecknumber(L, 1, "scale"); + auto timed = luaT_getfieldcheckboolean(L, 1, "timed"); auto thA = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); auto thB = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); auto thC = (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); @@ -52,26 +55,28 @@ int matmult(lua_State* L, bool asComplex = false) { CHECK_EQ(THCudaTensor_nDimension(state, thC), THCudaTensor_nDimension(state, thB)); + int device; + THCudaCheck(cudaGetDevice(&device)); + std::vector handles; - for (auto i = 0; i < numHandles; ++i) { - handles.push_back(cublasHandle_t()); - cublasCreate(&(handles.back())); + // Skip NULL handle + for (auto i = 1; i <= THCState_getNumBlasHandles(state); ++i) { + handles.push_back(THCState_getDeviceBlasHandle(state, device, i)); } std::vector streams; - for (auto i = 0; i < numStreams; ++i) { - streams.push_back(cudaStream_t()); - cudaStreamCreate(&(streams.back())); + // Skip default stream + for (auto i = 1; i <= THCState_getNumStreams(state); ++i) { + streams.push_back(THCState_getDeviceStream(state, device, i)); } - auto time = 0.0f; - constexpr long kNumTrials = 5; int dims = THCudaTensor_nDimension(state, thA); BLASParameters p; auto& params = p.withIterDims(iterDims).withBatchDims(batchDims). - withComplex(asComplex).withHandles(handles).withStreams(streams); - for (int i = 0; i < kNumTrials; ++i) { - cuda::KernelTimer timer; + withComplex(asComplex).withHandles(handles).withStreams(streams). 
+ withTransposeA(transA[0]).withTransposeB(transB[0]).withScaleReal(scale); + + if (!timed) { switch (dims) { MATMULT_CASE(2); MATMULT_CASE(3); @@ -79,39 +84,54 @@ int matmult(lua_State* L, bool asComplex = false) { MATMULT_CASE(5); MATMULT_CASE(6); default: - throw invalid_argument("Unsupported dims"); + THError("GEMM Unsupported dims"); }; - auto timeMS = timer.stop(); - if (i > 0) { - time += timeMS; + } else { + auto time = 0.0f; + constexpr long kNumTrials = 5; + for (int i = 0; i < kNumTrials; ++i) { + cuda::KernelTimer timer; + switch (dims) { + MATMULT_CASE(2); + MATMULT_CASE(3); + MATMULT_CASE(4); + MATMULT_CASE(5); + MATMULT_CASE(6); + default: + THError("GEMM Unsupported dims"); + }; + auto timeMS = timer.stop(); + if (i > 0) { + time += timeMS; + } } - } - time /= kNumTrials - 1; + time /= kNumTrials - 1; - long iters = 1; - for (int i = 0; i < iterDims; ++i) { - iters *= THCudaTensor_size(state, thA, i); - } - long batch = 1; - for (int i = iterDims; i < iterDims + batchDims; ++i) { - batch *= THCudaTensor_size(state, thA, i); - } + long iters = 1; + for (int i = 0; i < iterDims; ++i) { + iters *= THCudaTensor_size(state, thA, i); + } + long batch = 1; + for (int i = iterDims; i < iterDims + batchDims; ++i) { + batch *= THCudaTensor_size(state, thA, i); + } - auto GOut = (THCudaTensor_size(state, thC, 0) * - THCudaTensor_stride(state, thC, 0) * - THCudaTensor_size(state, thA, dims - 1)) / - 1e9; - LOG(INFO) << folly::format( - " Running mxm ({}x{}x{}): {} iterations (parallel over streams)," \ - " {} batches, GReductions(virtual fmas)/s = {:.5f}" \ - " time = {:.2f}ms", - THCudaTensor_size(state, thC, dims - 2), - THCudaTensor_size(state, thC, dims - 1), - THCudaTensor_size(state, thA, dims - 1), - iters, - batch, - (GOut / time) * 1e3, - time).str(); + auto GOut = (THCudaTensor_size(state, thC, 0) * + THCudaTensor_stride(state, thC, 0) * + THCudaTensor_size(state, thA, dims - 1)) / + 1e9; + LOG_TARGET << folly::format( + " Running mxm ({}x{}x{}): {} iterations (parallel over streams)," \ + " {} batches, GReductions(virtual fmas)/s = {:.5f}" \ + " time = {:.2f}ms", + THCudaTensor_size(state, thC, (asComplex) ? dims - 3 : dims - 2), + THCudaTensor_size(state, thC, (asComplex) ? dims - 2 : dims - 1), + THCudaTensor_size(state, thA, (asComplex) ? 
dims - 2 : dims - 1), + iters, + batch, + (GOut / time) * 1e3, + time).str(); + } return 0; } @@ -124,9 +144,64 @@ int matmultComplex(lua_State* L) { return matmult(L, true); } +#define TRANSPOSE_CASE(DIM) \ + if (dim == DIM) { \ + DeviceTensor A = torchToDeviceTensor(state, thA); \ + DeviceTensor tA = torchToDeviceTensor(state, thB); \ + facebook::deeplearning::torch::transpose( \ + A, tA, separator, asComplex, transposeMetaData, handle, stream); \ + if (transposeMetaData) { \ + /* Also transpose the metadata */ \ + for (auto i = 0; i < dim; ++i) { \ + thB->size[i] = tA.getSize(i); \ + thB->stride[i] = tA.getStride(i); \ + } \ + } \ + done = true; \ + } + +int transpose(lua_State* L, bool asComplex = false) { + THCState* state = getCutorchState(L); + auto separator = luaT_getfieldcheckint(L, 1, "separator"); + auto transposeMetaData = luaT_getfieldcheckboolean(L, 1, "transposeMetaData"); + auto handleIndex = luaT_getfieldcheckint(L, 1, "handle"); + auto streamIndex = luaT_getfieldcheckint(L, 1, "stream"); + auto thA = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); + auto thB = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); + int dim = THCudaTensor_nDimension(state, thA); + + CHECK_EQ(THCudaTensor_nDimension(state, thA), + THCudaTensor_nDimension(state, thB)); + + int device; + THCudaCheck(cudaGetDevice(&device)); + + auto handle = THCState_getDeviceBlasHandle(state, device, handleIndex); + auto stream = THCState_getDeviceStream(state, device, streamIndex); + + auto done = false; + TRANSPOSE_CASE(2); + TRANSPOSE_CASE(3); + TRANSPOSE_CASE(4); + TRANSPOSE_CASE(5); + if (!done) { THError("Transpose Unsupported dims"); } + + return 0; +} + +int transpose(lua_State* L) { + return transpose(L, false); +} + +int transposeComplex(lua_State* L) { + return transpose(L, true); +} + const luaL_Reg functions[] = { {"CuBLASWrapper_matmult", matmult}, {"CuBLASWrapper_matmultComplex", matmultComplex}, + {"CuBLASWrapper_transpose", transpose}, + {"CuBLASWrapper_transposeComplex", transposeComplex}, {nullptr, nullptr}, }; diff --git a/src/CudaTensorUtils.cpp b/src/CudaTensorUtils.cpp index 295f722..0edaa4b 100644 --- a/src/CudaTensorUtils.cpp +++ b/src/CudaTensorUtils.cpp @@ -1,5 +1,5 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
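The transpose wrapper and its new Lua bindings above collapse dimensions [0, separator) into rows and [separator, Dim) into columns, then hand a single 2-D transpose to cuBLAS; in the real (non-complex) case with transposeMetaData == false, the destination tensor is expected to already carry the swapped sizes. A standalone sketch of just that collapsing arithmetic, with made-up shapes:

  #include <array>
  #include <cstdio>

  int main() {
    const std::array<int, 4> sizes = {2, 3, 4, 5};  // hypothetical 4-D tensor
    const int separator = 2;
    int rows = 1, cols = 1;
    for (int i = 0; i < separator; ++i) rows *= sizes[i];  // 2 * 3 = 6
    for (int i = separator; i < 4; ++i) cols *= sizes[i];  // 4 * 5 = 20
    // The wrapper transposes this 6 x 20 view, so the result is laid out as
    // 20 x 6, i.e. a tensor of sizes {4, 5, 2, 3}.
    std::printf("%d x %d -> %d x %d\n", rows, cols, cols, rows);
    return 0;
  }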
-#include "CudaTensorUtils.h" +#include "src/CudaTensorUtils.h" #include "THC.h" using namespace std; @@ -107,10 +107,10 @@ makeAliasedTHCudaTensorFull(THCState* state, } auto szTH = LongStorage::wrap( - makeMutable(LongRange(sizesTH, sizes.size()))).moveAsTH(); + folly::Range(sizesTH, sizes.size())).moveAsTH(); SCOPE_EXIT { THLongStorage_free(szTH); }; auto strTH = LongStorage::wrap( - makeMutable(LongRange(stridesTH, sizes.size()))).moveAsTH(); + folly::Range(stridesTH, sizes.size())).moveAsTH(); SCOPE_EXIT { THLongStorage_free(strTH); }; auto tensor = THCudaTensor_newWithStorage( @@ -148,9 +148,9 @@ Tensor copyFromCuda(THCState* state, const THCudaTensor* ctensor) { return Tensor( Storage(dataTH), tensor->storageOffset, LongStorage::wrap( - makeMutable(LongRange(tensor->size, tensor->nDimension))), + folly::Range(tensor->size, tensor->nDimension)), LongStorage::wrap( - makeMutable(LongRange(tensor->stride, tensor->nDimension)))); + folly::Range(tensor->stride, tensor->nDimension))); } unique_ptr diff --git a/src/CudaTensorUtils.h b/src/CudaTensorUtils.h index c3b8ec8..016c0cc 100644 --- a/src/CudaTensorUtils.h +++ b/src/CudaTensorUtils.h @@ -2,7 +2,7 @@ #pragma once #include "THCTensor.h" -#include "folly/Optional.h" +#include #include "thpp/Tensor.h" #include diff --git a/src/DeviceTensorUtils.h b/src/DeviceTensorUtils.h index c74f15b..e4207f3 100644 --- a/src/DeviceTensorUtils.h +++ b/src/DeviceTensorUtils.h @@ -51,4 +51,4 @@ torchToDeviceTensorCast(THCState* state, THCudaTensor* t) { } } } // namespace -#include "DeviceTensorUtils-inl.h" +#include "src/DeviceTensorUtils-inl.h" diff --git a/src/FeatureLPPooling.cu b/src/FeatureLPPooling.cu index 02806b3..73e1480 100644 --- a/src/FeatureLPPooling.cu +++ b/src/FeatureLPPooling.cu @@ -1,11 +1,11 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "FeatureLPPooling.cuh" +#include "src/FeatureLPPooling.cuh" #include "cuda/DeviceTensor.cuh" #include "cuda/CudaStaticAssert.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/RegisterUtils.cuh" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" #include "THC.h" #include @@ -349,7 +349,7 @@ runFeatureLPPoolingUpdateOutput(cudaStream_t stream, DeviceTensor& output, float power, int width, int stride) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); const int outputFeatures = ((input.getSize(1) - width) / stride) + 1; assert(input.getSize(0) == output.getSize(0)); @@ -442,7 +442,7 @@ runFeatureLPPoolingUpdateGradInput(cudaStream_t stream, DeviceTensor& gradInput, float power, int width, int stride) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); for (int i = 0; i < 4; ++i) { assert(gradOutput.getSize(i) == output.getSize(i)); @@ -463,7 +463,7 @@ runFeatureLPPoolingUpdateGradInput(cudaStream_t stream, // Different threads are potentially adding into overlapping input // points, so we must clear out gradInput before continuing. - gradInput.fillAsync(0.0f, stream); + gradInput.zero(); // Split non-features among threads and grid x int totalNonFeatureSize = input.getSize(2) * input.getSize(3); diff --git a/src/FeatureLPPoolingHost.cpp b/src/FeatureLPPoolingHost.cpp index e1d3a30..f7fb493 100644 --- a/src/FeatureLPPoolingHost.cpp +++ b/src/FeatureLPPoolingHost.cpp @@ -1,10 +1,10 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
#include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" -#include "FeatureLPPooling.cuh" +#include "src/FeatureLPPooling.cuh" #include #include @@ -264,7 +264,7 @@ int featureLPPooling_updateGradInput(lua_State *L) { gradOutput = *gradOutputUpcast; output = *outputUpcast; - if (!output.isSameSizeAndStride(gradOutput)) { + if (!output.isSameSize(gradOutput)) { luaL_error(L, "output and gradOutput sizes do not match"); } diff --git a/src/HSMHost.cpp b/src/HSMHost.cpp index 671fab3..2537e43 100644 --- a/src/HSMHost.cpp +++ b/src/HSMHost.cpp @@ -3,7 +3,7 @@ * @author Michael Mathieu (myrhev@fb.com) */ -#include "Utils.h" +#include "src/Utils.h" #include #include #include "THC.h" diff --git a/src/HalfPrec.cpp b/src/HalfPrec.cpp index 673119e..bd6a802 100644 --- a/src/HalfPrec.cpp +++ b/src/HalfPrec.cpp @@ -1,13 +1,13 @@ // Copyright 2004-, Facebook, Inc. All Rights Reserved. -#include "HalfPrec.h" +#include "src/HalfPrec.h" #include #include #include -#include "Utils.h" -#include "Tensor.h" -#include "LuaUtils.h" +#include "src/Utils.h" +#include "src/Tensor.h" +#include "src/LuaUtils.h" #include "THC.h" using namespace std; diff --git a/src/HalfPrecKernels.cu b/src/HalfPrecKernels.cu index 8c13c6e..9c03878 100644 --- a/src/HalfPrecKernels.cu +++ b/src/HalfPrecKernels.cu @@ -4,10 +4,10 @@ #include #include -#include "HalfPrec.h" -#include "util/Transform.cuh" +#include "src/HalfPrec.h" +#include "src/util/Transform.cuh" -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; void halfprec_ToHalf(cudaStream_t stream, const float* input, half_t* output, diff --git a/src/HalfPrecTest.cpp b/src/HalfPrecTest.cpp index 18155c1..c2c2899 100644 --- a/src/HalfPrecTest.cpp +++ b/src/HalfPrecTest.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/src/InitCuda.cpp b/src/InitCuda.cpp index d8246e9..99bdd68 100644 --- a/src/InitCuda.cpp +++ b/src/InitCuda.cpp @@ -5,6 +5,11 @@ #include +#ifdef FB_INTERNAL +#else +#define LUAOPEN(x) luaopen_fbcunn_cuda_ext(x) +#endif + namespace facebook { namespace deeplearning { namespace torch { void initCrossMapNormalizationCuda(lua_State* L); @@ -17,14 +22,15 @@ void initOneBitQuantizationCuda(lua_State* L); void initSparseNLLCriterionCuda(lua_State* L); void initFeatureLPPoolingCuda(lua_State* L); void initCuBLASWrapper(lua_State *L); -void initFFTWrapper(lua_State *L); -void initSpatialConvolutionCuFFT(lua_State *L); +// void initFFTWrapper(lua_State *L); +// void initSpatialConvolutionCuFFT(lua_State *L); +void initWeightedLookupTableCuda(lua_State *L); }}} // namespace using namespace facebook::deeplearning::torch; -extern "C" int luaopen_libfbcunnlayers(lua_State* L) { +extern "C" int LUAOPEN(lua_State* L) { initCrossMapNormalizationCuda(L); initLocallyConnectedCuda(L); initLookupTableGPUCuda(L); @@ -35,8 +41,9 @@ extern "C" int luaopen_libfbcunnlayers(lua_State* L) { initSparseNLLCriterionCuda(L); initFeatureLPPoolingCuda(L); initCuBLASWrapper(L); - initFFTWrapper(L); - initSpatialConvolutionCuFFT(L); + // initFFTWrapper(L); + // initSpatialConvolutionCuFFT(L); + initWeightedLookupTableCuda(L); return 0; } diff --git a/src/LocallyConnected.cuh b/src/LocallyConnected.cuh index c913d2d..99baab8 100644 --- a/src/LocallyConnected.cuh +++ b/src/LocallyConnected.cuh @@ -4,7 +4,7 @@ #pragma once #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include namespace facebook { 
namespace deeplearning { namespace torch { diff --git a/src/LocallyConnectedHost.cpp b/src/LocallyConnectedHost.cpp index 40459c6..0b775f8 100644 --- a/src/LocallyConnectedHost.cpp +++ b/src/LocallyConnectedHost.cpp @@ -4,7 +4,7 @@ */ #include "THC.h" -#include "Utils.h" +#include "src/Utils.h" #include "LocallyConnected.cuh" #include #include @@ -77,14 +77,6 @@ void initializeParams(THCState* state, } } -void narrowTensors(THCState* state, - THCudaTensor* in, THCudaTensor* in1, - THCudaTensor* out, THCudaTensor* out1, - int index, int size) { - THCudaTensor_narrow(state, in1, in, 0, index, size); - THCudaTensor_narrow(state, out1, out, 0, index, size); -} - // Updates a cache in cuda layout. // // The input tensor is in standard Torch layout and the resulting diff --git a/src/LookupTableGPUHost.cpp b/src/LookupTableGPUHost.cpp index 3eaa5e1..b9c7115 100644 --- a/src/LookupTableGPUHost.cpp +++ b/src/LookupTableGPUHost.cpp @@ -4,8 +4,8 @@ */ #include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" #include diff --git a/src/MM.cu b/src/MM.cu index 78df619..56de949 100644 --- a/src/MM.cu +++ b/src/MM.cu @@ -1,33 +1,76 @@ // Copyright 2004-present Facebook. All Rights Reserved. +#include "DeviceTensorUtils.h" +#include "THCTensor.h" + #include "cuda/DeviceTensor.cuh" #include "cuda/MM.cuh" + using namespace facebook::cuda; namespace facebook { namespace deeplearning { namespace torch { -template +template + void transposeMM(DeviceTensor& A, DeviceTensor& B, DeviceTensor& C, float invNorm, cudaStream_t s = 0) { - facebook::cuda::transposeMM( - A, B, C, invNorm, s); + facebook::cuda::transposeMM + ( + A, B, C, invNorm, s); } -#define INSTANTIATE_TRANSPOSE_MM(DIM, CONJA, CONJB) \ - template void transposeMM( \ - DeviceTensor& A, \ - DeviceTensor& B, \ - DeviceTensor& C, \ - float invNorm, \ +#define INSTANTIATE_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ + template void transposeMM( \ + DeviceTensor& A, \ + DeviceTensor& B, \ + DeviceTensor& C, \ + float invNorm, \ cudaStream_t s); -INSTANTIATE_TRANSPOSE_MM(5, true, false); -INSTANTIATE_TRANSPOSE_MM(5, false, true); -INSTANTIATE_TRANSPOSE_MM(5, false, false); +INSTANTIATE_TRANSPOSE_MM(5, true, false, true); +INSTANTIATE_TRANSPOSE_MM(5, false, true, true); +INSTANTIATE_TRANSPOSE_MM(5, false, false, true); +INSTANTIATE_TRANSPOSE_MM(5, true, false, false); +INSTANTIATE_TRANSPOSE_MM(5, false, true, false); +INSTANTIATE_TRANSPOSE_MM(5, false, false, false); + +#define CALL_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ + if (THCudaTensor_nDimension(state, tA) == DIM && \ + conjugateTransposeA == CONJA && \ + conjugateTransposeB == CONJB && \ + accumulate == ACC) { \ + DeviceTensor A = torchToDeviceTensor(state, tA); \ + DeviceTensor B = torchToDeviceTensor(state, tB); \ + DeviceTensor C = torchToDeviceTensor(state, tC); \ + facebook::deeplearning::torch::transposeMM( \ + A, B, C, invNorm, THCState_getCurrentStream(state)); \ + return; \ + } + +extern "C" void transposeMMFFI(THCState* state, + THCudaTensor* tA, + THCudaTensor* tB, + THCudaTensor* tC, + float invNorm, + bool conjugateTransposeA, + bool conjugateTransposeB, + bool accumulate) { + CHECK_EQ(THCudaTensor_nDimension(state, tA), + THCudaTensor_nDimension(state, tB)); + CHECK_EQ(THCudaTensor_nDimension(state, tA), + THCudaTensor_nDimension(state, tC)); + + CALL_TRANSPOSE_MM(5, true, false, true); + CALL_TRANSPOSE_MM(5, false, true, true); + CALL_TRANSPOSE_MM(5, false, false, true); + 
CALL_TRANSPOSE_MM(5, true, false, false); + CALL_TRANSPOSE_MM(5, false, true, false); + CALL_TRANSPOSE_MM(5, false, false, false); +} #undef INSTANTIATE_TRANSPOSE_MM diff --git a/src/MM.h b/src/MM.h index 1b43a18..1dd43f5 100644 --- a/src/MM.h +++ b/src/MM.h @@ -8,7 +8,8 @@ namespace facebook { namespace deeplearning { namespace torch { -template +template + void transposeMM(facebook::cuda::DeviceTensor& A, facebook::cuda::DeviceTensor& B, facebook::cuda::DeviceTensor& C, diff --git a/src/OneBitQuantization.cu b/src/OneBitQuantization.cu index 1ac6149..a5e6953 100644 --- a/src/OneBitQuantization.cu +++ b/src/OneBitQuantization.cu @@ -1,6 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "OneBitQuantization.cuh" +#include "src/OneBitQuantization.cuh" #include "cuda/ComputeCapabilities.cuh" #include "cuda/CudaDebugUtils.cuh" diff --git a/src/OneBitQuantizationHost.cpp b/src/OneBitQuantizationHost.cpp index 9b56eb9..37f395f 100644 --- a/src/OneBitQuantizationHost.cpp +++ b/src/OneBitQuantizationHost.cpp @@ -1,11 +1,11 @@ // Copyright 2014 Facebook #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" -#include "Utils.h" +#include "src/DeviceTensorUtils.h" +#include "src/Utils.h" #include "THC.h" #include "THCTensor.h" -#include "OneBitQuantization.cuh" +#include "src/OneBitQuantization.cuh" #include #include diff --git a/src/SparseNLLCriterion.cu b/src/SparseNLLCriterion.cu index c381e20..05870ea 100644 --- a/src/SparseNLLCriterion.cu +++ b/src/SparseNLLCriterion.cu @@ -5,7 +5,7 @@ #include "cuda/CudaUtils.cuh" #include "cuda/WarpReductions.cuh" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" #include "SparseNLLCriterion.cuh" @@ -76,7 +76,7 @@ void runSparseNLLCriterion_updateOutput( DeviceTensor& output) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); const int maxThreads = deviceProperties.maxThreadsPerBlock; const int batchSize = targetP.getSize(0); @@ -97,7 +97,7 @@ void runSparseNLLCriterion_updateGradInput( DeviceTensor& gradInput) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); const int batchSize = targetP.getSize(0); const int K = targetP.getSize(1); diff --git a/src/SparseNLLCriterionHost.cpp b/src/SparseNLLCriterionHost.cpp index 4e888e2..9ae80f4 100644 --- a/src/SparseNLLCriterionHost.cpp +++ b/src/SparseNLLCriterionHost.cpp @@ -1,11 +1,11 @@ // Copyright 2014 Facebook #include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" #include "THCTensor.h" -#include "SparseNLLCriterion.cuh" +#include "src/SparseNLLCriterion.cuh" #include #include diff --git a/src/SpatialBatchNormalization.cu b/src/SpatialBatchNormalization.cu new file mode 100644 index 0000000..743b693 --- /dev/null +++ b/src/SpatialBatchNormalization.cu @@ -0,0 +1,791 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
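transposeMMFFI in the MM.cu hunk above bridges a runtime FFI call to compile-time template instantiations through a macro-generated if-chain, so each (conjA, conjB, accumulate) combination gets its own compiled kernel path. A self-contained sketch of that dispatch pattern; the names here are illustrative, not part of fbcunn:

  #include <cstdio>

  template <bool ConjA, bool ConjB, bool Accumulate>
  void runKernel() {
    std::printf("conjA=%d conjB=%d accumulate=%d\n", ConjA, ConjB, Accumulate);
  }

  // Each expansion compares the runtime flags against one combination and,
  // on a match, calls the matching full specialization.
  #define DISPATCH(CA, CB, ACC)                            \
    if (conjA == CA && conjB == CB && accumulate == ACC)   \
      return runKernel<CA, CB, ACC>();

  void dispatch(bool conjA, bool conjB, bool accumulate) {
    DISPATCH(true,  false, true);
    DISPATCH(false, true,  true);
    DISPATCH(false, false, true);
    DISPATCH(true,  false, false);
    DISPATCH(false, true,  false);
    DISPATCH(false, false, false);
  }

  int main() { dispatch(false, true, false); return 0; }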
+ +#include "src/DeviceTensorUtils.h" +#include "THCTensor.h" + +#include "cuda/CudaUtils.cuh" +#include "cuda/DeviceTensor.cuh" +#include "cuda/MemoryAccess.cuh" +#include "cuda/util/CachedDeviceProperties.h" + +#define ENABLE_CUDA_DEBUG +#include "cuda/CudaDebugUtils.cuh" + +#include +#include + +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +#define LOG_TARGET VLOG(1) // LOG(INFO) + +template +__global__ void SpatialBatchNormalizationUpdateOutputInferenceUnrolled_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias) { + + static_assert(std::is_same::value , "type"); + + auto x = threadIdx.x; + auto y = threadIdx.y; + auto plane = blockIdx.x; + auto batch = blockIdx.y; + + // stddev is actually 1 / stddev + auto stddev = runningStddev[plane].ldg(); + auto mean = runningMean[plane].ldg(); + auto inp = input[batch][plane][y][x].ldg(); + if (affine) { + // multiply with gamma and add beta + // TODO: everyone pulling this, optimize by reusing better + auto beta = bias[plane].ldg(); + auto gamma = weight[plane].ldg(); + output[batch][plane][y][x] = gamma * (inp - mean) * (stddev) + beta; + } else { + output[batch][plane][y][x] = (inp - mean) * (stddev); + } +} + +template +__global__ void SpatialBatchNormalizationUpdateOutputInference_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias) { + + static_assert(std::is_same::value , "type"); + + auto x = threadIdx.x; + auto plane = blockIdx.x; + auto batch = blockIdx.y; + + // stddev is actually 1 / stddev + auto stddev = runningStddev[plane].ldg(); + auto mean = runningMean[plane].ldg(); + T beta, gamma; + if (affine) { + beta = bias[plane].ldg(); + gamma = weight[plane].ldg(); + } + + for (auto y = threadIdx.y; y < output.getSize(2); y += blockDim.y) { + auto inp = input[batch][plane][y][x].ldg(); + if (affine) { + // multiply with gamma and add beta + // TODO: everyone pulling this, optimize by reusing better + output[batch][plane][y][x] = gamma * (inp - mean) * (stddev) + beta; + } else { + output[batch][plane][y][x] = (inp - mean) * (stddev); + } + } + +} + +template +__global__ void SpatialBatchNormalizationUpdateOutput_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum) { + + static_assert(std::is_same::value , "type"); + + // Assert powers of 2 for proper intra-warp shuffle reduction + assert(blockDim.x == NumThreads); + assert(blockDim.y == NumThreads); + static_assert((NumThreads & (NumThreads - 1)) == 0, + "NumThreads must be a power of 2 for proper warp shuffling"); + auto plane = blockIdx.x; + auto numBatches = input.getSize(0); + + auto norm = (T)0; + if (threadIdx.y == 0) { + norm = input.getSize(0) * input.getSize(2) * input.getSize(3); + norm = (T)1 / norm; + } + + // 1. 
Compute the mean across (batch, y, x), save it and update the + // runningMean with momentum + auto batchMeanGlobal = (T)0; + for (int y = threadIdx.y; y < input.getSize(2); y += NumThreads) { + auto batchMeanLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < input.getSize(3); x += NumThreads) { + auto inp = (inBounds(y, x, input)) ? + input[batch][plane][y][x].ldg() : 0.0f; + batchMeanLocal += inp; + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchMeanLocal += __shfl_xor(batchMeanLocal, 1 << i, NumThreads); + } + // thread 0 has it + batchMeanGlobal += batchMeanLocal; + } + + __shared__ T shared[NumThreads]; + // thx == 0 stores into smem + if (threadIdx.x == 0) { + shared[threadIdx.y] = batchMeanGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto batchMeanLocal = shared[threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchMeanLocal += __shfl_xor(batchMeanLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. + batchMeanGlobal = batchMeanLocal * norm; + // Save the non momentum-altered version to share with everyone + shared[threadIdx.x] = batchMeanGlobal; + } + __syncthreads(); + + // Everyone picks it up + batchMeanGlobal = shared[threadIdx.x]; + if (threadIdx.y == 0 && threadIdx.x == 0) { + // Momentum based writeback + runningMean[plane] = + (1 - momentum) * runningMean[plane] + momentum * batchMeanGlobal; + } + + + // 2. Compute the stddev across (batch, y, x), + // save it + // update the runningStddev with momentum + // save a copy + // All threads have the batchMean now, compute the stddev + auto batchStddevGlobal = (T)0; + for (int y = threadIdx.y; y < input.getSize(2); y += NumThreads) { + auto batchStddevLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < input.getSize(3); x += NumThreads) { + auto inp = 0.0f; + if (inBounds(y, x, input)) { + inp = input[batch][plane][y][x].ldg(); + batchStddevLocal += + (inp - batchMeanGlobal) * (inp - batchMeanGlobal); + centered[batch][plane][y][x] = inp - batchMeanGlobal; + } + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchStddevLocal += __shfl_xor(batchStddevLocal, 1 << i, NumThreads); + } + // thread 0 has it + batchStddevGlobal += batchStddevLocal; + } + + // thx == 0 stores into smem, reuse the same smem region, be sure to kill + // WAR / WAW dependences even if they are extremely unlikely. + __syncthreads(); + if (threadIdx.x == 0) { + shared[threadIdx.y] = batchStddevGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto batchStddevLocal = shared[threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchStddevLocal += __shfl_xor(batchStddevLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. 
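// Editor's sketch (not part of the patch): the reduction idiom used throughout
// these kernels is a __shfl_xor butterfly. After log2(width) exchanges every
// lane of the warp holds the full sum, which is why the code can stage one
// value per warp through shared memory and reuse it without a broadcast step.
// Assumes a warp width of 32 and the pre-CUDA-9 __shfl_xor intrinsic that this
// patch targets (sm_35 era); compile with nvcc.
//
//   #include <cstdio>
//
//   __device__ float warpAllReduceSum(float val, int width) {
//     for (int offset = 1; offset < width; offset <<= 1) {
//       val += __shfl_xor(val, offset, width);   // butterfly exchange
//     }
//     return val;                                // every lane holds the sum
//   }
//
//   __global__ void demo(float* out) {
//     float v = static_cast<float>(threadIdx.x); // lane id as the value
//     out[threadIdx.x] = warpAllReduceSum(v, 32);
//   }
//
//   int main() {
//     float* d = nullptr;
//     cudaMalloc(&d, 32 * sizeof(float));
//     demo<<<1, 32>>>(d);
//     float h[32];
//     cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
//     std::printf("lane 0 sum = %f (expected 496)\n", h[0]);
//     cudaFree(d);
//     return 0;
//   }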
+ batchStddevLocal *= norm; + batchStddevGlobal = 1 / sqrt(batchStddevLocal + epsilon); + // Save the non momentum-altered version to share with everyone + shared[threadIdx.x] = batchStddevGlobal; + } + __syncthreads(); + + // Everyone picks it up + batchStddevGlobal = shared[threadIdx.x]; + // Momentum based writeback + if (threadIdx.y == 0 && threadIdx.x == 0) { + std[plane] = batchStddevGlobal; + runningStddev[plane] = + (1 - momentum) * runningStddev[plane] + momentum * batchStddevGlobal; + } + + // Write normalized and update the output + auto beta = bias[plane]; + auto gamma = weight[plane]; + for (int y = threadIdx.y; y < input.getSize(2); y += NumThreads) { + for (int x = threadIdx.x; x < input.getSize(3); x += NumThreads) { + if(inBounds(y, x, output)) { + for (auto batch = 0; batch < numBatches; ++batch) { + auto inp = input[batch][plane][y][x].ldg(); + normalized[batch][plane][y][x] = + (inp - batchMeanGlobal) * (batchStddevGlobal); + if (affine) { + // multiply with gamma and add beta + output[batch][plane][y][x] = + gamma * (inp - batchMeanGlobal) * (batchStddevGlobal) + beta; + } else { + output[batch][plane][y][x] = + (inp - batchMeanGlobal) * (batchStddevGlobal); + } + } + } + } + } + +} + + +template +void SpatialBatchNormalizationUpdateOutput( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + + auto prop = getCurrentDeviceProperties(); + if (!train) { + if (input.getSize(3) * input.getSize(2) < prop.maxThreadsPerBlock) { + dim3 blocks(input.getSize(1), input.getSize(0)); + dim3 threads(input.getSize(3), input.getSize(2)); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutputInferenceUnrolled_kernel + + <<>> + (input, output, runningMean, runningStddev, weight, bias); + } else { + CHECK_GE(prop.maxThreadsPerBlock, input.getSize(3)) << + "Need a rolled version across both threadIdx.x and y"; + dim3 blocks(input.getSize(1), + input.getSize(0)); + dim3 threads(input.getSize(3), + min(input.getSize(2), + floor(prop.maxThreadsPerBlock, input.getSize(3))) + ); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutputInference_kernel + + <<>> + (input, output, runningMean, runningStddev, weight, bias); + } + } else { + dim3 blocks(input.getSize(1)); + if (input.getSize(3) >= 16 && input.getSize(2) >= 16) { + dim3 threads(16, 16); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutput_kernel + + <<>>(input, + output, + centered, + std, + normalized, + runningMean, + runningStddev, + weight, + bias, + epsilon, + momentum); + } else { + dim3 threads(8, 8); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutput_kernel + + <<>>(input, + output, + centered, + std, + normalized, + runningMean, + runningStddev, + weight, + bias, + epsilon, + momentum); + } + } + +} + +extern "C" void SpatialBatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* 
input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine) +{ + // The SpatialBatchNormalization lua module is designed for + // 4-D only: batch, plane, y, x + constexpr int BatchDims = 2; + constexpr int ImageDims = 2; + typedef double ComputeT; + if (!train) { + if (!affine) { + // Collapse + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + // Collapse + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } else { + if (!affine) { + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void SpatialBatchNormalizationUpdateGradInput_kernel( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight) { + + static_assert(std::is_same::value , "type"); + + // Assert powers of 2 for proper intra-warp shuffle reduction + assert(blockDim.x == NumThreads); + assert(blockDim.y == NumThreads); + static_assert((NumThreads & (NumThreads - 1)) == 0, + "NumThreads must be a power of 2 for proper warp shuffling"); + auto plane = blockIdx.x; + auto numBatches = gradInput.getSize(0); + + auto norm = (T)0; + if (threadIdx.y == 0) { + norm = gradInput.getSize(0) * gradInput.getSize(2) * gradInput.getSize(3); + norm = (T)1 / norm; + } + + // 1. Compute means across (batch, y, x) + auto gradMeanGlobal = (T)0; + auto centeredGradMeanGlobal = (T)0; + for (int y = threadIdx.y; y < gradInput.getSize(2); y += NumThreads) { + auto gradMeanLocal = (T)0; + auto centeredGradMeanLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < gradInput.getSize(3); x += NumThreads) { + auto g = (inBounds(y, x, gradOutput)) ? + gradOutput[batch][plane][y][x].ldg() : 0.0f; + auto c = (inBounds(y, x, centered)) ? 
+ centered[batch][plane][y][x].ldg() : 0.0f; + gradMeanLocal += g; + centeredGradMeanLocal += c * g; + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + centeredGradMeanLocal += + __shfl_xor(centeredGradMeanLocal, 1 << i, NumThreads); + } + // thread 0 has it + gradMeanGlobal += gradMeanLocal; + centeredGradMeanGlobal += centeredGradMeanLocal; + } + + __shared__ T shared[2][NumThreads]; + // thx == 0 stores into smem + if (threadIdx.x == 0) { + shared[0][threadIdx.y] = gradMeanGlobal; + shared[1][threadIdx.y] = centeredGradMeanGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto gradMeanLocal = shared[0][threadIdx.x]; + auto centeredGradMeanLocal = shared[1][threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + centeredGradMeanLocal += + __shfl_xor(centeredGradMeanLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. + gradMeanGlobal = gradMeanLocal * norm; + centeredGradMeanGlobal = centeredGradMeanLocal * norm; + // Save the non momentum-altered version to share with everyone + shared[0][threadIdx.x] = gradMeanGlobal; + shared[1][threadIdx.x] = centeredGradMeanGlobal; + } + __syncthreads(); + + // Everyone picks it up, should be broadcast into the whole gradInput + gradMeanGlobal = shared[0][threadIdx.x]; + centeredGradMeanGlobal = shared[1][threadIdx.x]; + + auto stdVal = std[plane]; + for (int y = threadIdx.y; y < gradInput.getSize(2); y += NumThreads) { + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < gradInput.getSize(3); x += NumThreads) { + if (affine) { + gradInput[batch][plane][y][x] = + ( - centeredGradMeanGlobal * + centered[batch][plane][y][x] * + stdVal * + stdVal + + + gradOutput[batch][plane][y][x] + - + gradMeanGlobal + ) + * stdVal * weight[plane]; + } else { + gradInput[batch][plane][y][x] = + ( - centeredGradMeanGlobal * + centered[batch][plane][y][x] * + stdVal * + stdVal + + + gradOutput[batch][plane][y][x] + - + gradMeanGlobal + ) + * stdVal; + } + } + } + } + +} + +template +void SpatialBatchNormalizationUpdateGradInput( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + + dim3 blocks(gradInput.getSize(1)); + if (gradInput.getSize(3) >= 16 && gradInput.getSize(2) >= 16) { + dim3 threads(16, 16); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateGradInput_kernel + + <<>>(gradInput, + gradOutput, + centered, + std, + weight); + } else { + dim3 threads(8, 8); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateGradInput_kernel + + <<>>(gradInput, + gradOutput, + centered, + std, + weight); + } + +} + +extern "C" void SpatialBatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine) { + + // The SpatialBatchNormalization lua module is designed for + // 4-D only: batch, plane, y, x + constexpr 
int BatchDims = 2; + constexpr int ImageDims = 2; + typedef double ComputeT; + if (!affine) { + // Collapse + SpatialBatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + DeviceTensor(), + THCState_getCurrentStream(state) + ); + } else { + // Collapse + SpatialBatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, weight), + THCState_getCurrentStream(state) + ); + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void SpatialBatchNormalizationAccGradParameters_kernel( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale) +{ + + static_assert(std::is_same::value , "type"); + + // Assert powers of 2 for proper intra-warp shuffle reduction + assert(blockDim.x == NumThreads); + assert(blockDim.y == NumThreads); + static_assert((NumThreads & (NumThreads - 1)) == 0, + "NumThreads must be a power of 2 for proper warp shuffling"); + auto plane = blockIdx.x; + auto numBatches = gradOutput.getSize(0); + + // 1. Compute sums across (batch, y, x) + auto gradMeanGlobal = (T)0; + auto normalizedGradMeanGlobal = (T)0; + for (int y = threadIdx.y; y < gradOutput.getSize(2); y += NumThreads) { + auto gradMeanLocal = (T)0; + auto normalizedGradMeanLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < gradOutput.getSize(3); x += NumThreads) { + auto g = (inBounds(y, x, gradOutput)) ? + gradOutput[batch][plane][y][x].ldg() : 0.0f; + auto n = (inBounds(y, x, normalized)) ? + normalized[batch][plane][y][x].ldg() : 0.0f; + gradMeanLocal += g; + normalizedGradMeanLocal += n * g; + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + normalizedGradMeanLocal += + __shfl_xor(normalizedGradMeanLocal, 1 << i, NumThreads); + } + // thread 0 has it + gradMeanGlobal += gradMeanLocal; + normalizedGradMeanGlobal += normalizedGradMeanLocal; + } + + __shared__ T shared[2][NumThreads]; + // thx == 0 stores into smem + if (threadIdx.x == 0) { + shared[0][threadIdx.y] = gradMeanGlobal; + shared[1][threadIdx.y] = normalizedGradMeanGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto gradMeanLocal = shared[0][threadIdx.x]; + auto normalizedGradMeanLocal = shared[1][threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + normalizedGradMeanLocal += + __shfl_xor(normalizedGradMeanLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. 
+ gradMeanGlobal = gradMeanLocal; + normalizedGradMeanGlobal = normalizedGradMeanLocal; + + // thread 0 has it + if (threadIdx.x == 0) { + gradBias[plane] += scale * gradMeanGlobal; + gradWeight[plane] += scale * normalizedGradMeanGlobal; + } + } +} + +template +void SpatialBatchNormalizationAccGradParameters( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + + dim3 blocks(gradOutput.getSize(1)); + if (gradOutput.getSize(3) >= 16 && gradOutput.getSize(2) >= 16) { + dim3 threads(16, 16); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationAccGradParameters_kernel + <<>>(gradOutput, + normalized, + gradWeight, + gradBias, + scale); + } else { + dim3 threads(8, 8); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationAccGradParameters_kernel + <<>>(gradOutput, + normalized, + gradWeight, + gradBias, + scale); + } + +} + +extern "C" void SpatialBatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale) { + // The SpatialBatchNormalization lua module is designed for + // 4-D only: batch, plane, y, x + constexpr int BatchDims = 2; + constexpr int ImageDims = 2; + typedef double ComputeT; + // Collapse + SpatialBatchNormalizationAccGradParameters + + ( + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, gradWeight), + torchToDeviceTensor(state, gradBias), + scale, + THCState_getCurrentStream(state) + ); + + THCudaCheck(cudaGetLastError()); +} + + +}}} diff --git a/src/TemporalConvolutionFBHost.cpp b/src/TemporalConvolutionFBHost.cpp index 0339f8b..80ffd83 100644 --- a/src/TemporalConvolutionFBHost.cpp +++ b/src/TemporalConvolutionFBHost.cpp @@ -1,13 +1,13 @@ // Copyright 2014 Facebook #include "cuda/DeviceTensor.cuh" +#include "cuda/util/CachedDeviceProperties.h" #include "THC.h" #include "THCTensor.h" -#include "Utils.h" -#include "CuBLASWrapper.h" -#include "ConvolutionBias.cuh" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" +#include "src/Utils.h" +#include "src/CuBLASWrapper.h" +#include "src/ConvolutionBias.cuh" +#include "src/DeviceTensorUtils.h" #include #include diff --git a/src/TemporalKMaxPooling.cu b/src/TemporalKMaxPooling.cu index 6e42296..f64b2df 100644 --- a/src/TemporalKMaxPooling.cu +++ b/src/TemporalKMaxPooling.cu @@ -3,7 +3,7 @@ #include "cuda/DeviceTensor.cuh" #include "cuda/TopKElements.cuh" #include "cuda/DeviceTensor.cuh" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" #include "THC.h" using namespace facebook::cuda; @@ -88,7 +88,7 @@ runTemporalKMaxPoolingUpdateOutput(cudaStream_t stream, DeviceTensor& output, int k) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); // We aim to run with 4 warps. 
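
For readers checking the SpatialBatchNormalizationAccGradParameters kernel above, this is the per-plane quantity it accumulates, written as a plain host-side reference (a verification sketch only; layout is batch x plane x y x x, and the double accumulators mirror the ComputeT = double used by the FFI entry point):

    #include <cstddef>

    // gradBias[c]   += scale * sum_{b,y,x} gradOutput[b][c][y][x]
    // gradWeight[c] += scale * sum_{b,y,x} gradOutput[b][c][y][x] * normalized[b][c][y][x]
    void accGradParametersRef(const float* gradOutput, const float* normalized,
                              float* gradWeight, float* gradBias, float scale,
                              size_t B, size_t C, size_t H, size_t W) {
      for (size_t c = 0; c < C; ++c) {
        double gSum = 0.0, ngSum = 0.0;
        for (size_t b = 0; b < B; ++b) {
          for (size_t y = 0; y < H; ++y) {
            for (size_t x = 0; x < W; ++x) {
              const size_t i = ((b * C + c) * H + y) * W + x;
              gSum += gradOutput[i];
              ngSum += normalized[i] * gradOutput[i];
            }
          }
        }
        gradBias[c]   += scale * static_cast<float>(gSum);
        gradWeight[c] += scale * static_cast<float>(ngSum);
      }
    }
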
const int numWarps = std::min(input.getSize(2), 4); @@ -107,7 +107,7 @@ runTemporalKMaxPoolingUpdateGradInput(cudaStream_t stream, DeviceTensor& gradInput, int k) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); // We aim to run with 4 warps. const int numThreads = diff --git a/src/TemporalKMaxPoolingHost.cpp b/src/TemporalKMaxPoolingHost.cpp index 1ea1678..fe30821 100644 --- a/src/TemporalKMaxPoolingHost.cpp +++ b/src/TemporalKMaxPoolingHost.cpp @@ -1,14 +1,15 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" -#include "TemporalKMaxPooling.cuh" +#include "src/TemporalKMaxPooling.cuh" #include #include #include +#include using namespace facebook::cuda; @@ -17,12 +18,12 @@ namespace facebook { namespace deeplearning { namespace torch { namespace { int checkAndAdjustK(lua_State* L, int k, double kDynamic, long sequenceLength) { - if (kDynamic > 0) { - k = std::max(k, (int) (kDynamic * sequenceLength)); + if (kDynamic != -1) { + k = std::max(k, (int) (std::ceil(kDynamic * sequenceLength))); } if (k > sequenceLength) { - luaL_error(L, "k (%d) must be less than sequence length (%d) ", k, sequenceLength); + luaL_error(L, "k: k must be less than the sequence length"); } return k; diff --git a/src/TemporalMaxPooling.cu b/src/TemporalMaxPooling.cu index ac16412..1230278 100644 --- a/src/TemporalMaxPooling.cu +++ b/src/TemporalMaxPooling.cu @@ -1,12 +1,9 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" -#include "Utils.h" -#include "lua.h" -#include "luaT.h" -#include "THC.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" +#include "src/Utils.h" using namespace facebook::cuda; using namespace facebook::deeplearning::torch; @@ -141,7 +138,7 @@ static int fbcunn_TemporalMaxPooling_updateOutput(lua_State *L) { // be limited by smem or register count, so no need to use the // occupancy calculator. const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); dim3 block(min(input.getSize(2), deviceProperties.maxThreadsPerBlock)); dim3 grid(input.getSize(0), // batch size @@ -205,7 +202,7 @@ static int fbcunn_TemporalMaxPooling_updateGradInput(lua_State *L) { // be limited by smem or register count, so no need to use the // occupancy calculator. 
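
The repeated switch to cuda/util/CachedDeviceProperties.h replaces per-call property lookups with a cached query, which the pooling kernels above use to size launches (e.g. clamping a block to maxThreadsPerBlock). A minimal sketch of such a cache using only the standard CUDA runtime API; the function names here are illustrative, not the library's:

    #include <cuda_runtime.h>
    #include <stdexcept>
    #include <vector>

    // Query every device once, then hand out cached references.
    inline const cudaDeviceProp& cachedDeviceProperties(int device) {
      static const std::vector<cudaDeviceProp> props = [] {
        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess) { count = 0; }
        std::vector<cudaDeviceProp> p(count);
        for (int i = 0; i < count; ++i) {
          if (cudaGetDeviceProperties(&p[i], i) != cudaSuccess) {
            throw std::runtime_error("cudaGetDeviceProperties failed");
          }
        }
        return p;
      }();
      return props.at(device);
    }

    inline const cudaDeviceProp& currentDeviceProperties() {
      int device = 0;
      if (cudaGetDevice(&device) != cudaSuccess) {
        throw std::runtime_error("cudaGetDevice failed");
      }
      return cachedDeviceProperties(device);
    }
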
const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); dim3 block(min(gradOutput.getSize(2), deviceProperties.maxThreadsPerBlock)); dim3 grid(gradOutput.getSize(0), // batch size diff --git a/src/WeightedLookupTable.cu b/src/WeightedLookupTable.cu new file mode 100644 index 0000000..730f90c --- /dev/null +++ b/src/WeightedLookupTable.cu @@ -0,0 +1,51 @@ +/** + * Copyright 2015 Facebook + */ + +#include "cuda/CudaUtils.cuh" +#include "cuda/DeviceTensor.cuh" +#include "cuda/WarpReductions.cuh" + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { +namespace detail { + +namespace { + +__global__ void scaleByWeight(DeviceTensor output, + DeviceTensor input, + DeviceTensor weights) { + // Values computed per thread + const int VT = 4; + + // Each block computes a 4x128 section of the output, with each + // warp handling a 1x128 section. + + int rowIdx = blockIdx.x * blockDim.y + threadIdx.y; + if (rowIdx < weights.getSize(0)) { + float weight = weights[rowIdx]; + + #pragma unroll + for (int i = 0; i < VT; i++) { + int colIdx = blockDim.x * (VT * blockIdx.y + i) + threadIdx.x; + if (colIdx < input.getSize(1)) { + output[rowIdx][colIdx] = input[rowIdx][colIdx] * weight; + } + } + } +} + +} + +void launchWeightedLookupTableScaleByWeightKernel(cudaStream_t stream, + DeviceTensor& output, + DeviceTensor& input, + DeviceTensor& weight) { + dim3 grid(cuda::ceil(output.getSize(0), 4), cuda::ceil(output.getSize(1), 128)); + dim3 block(32, 4); + + scaleByWeight<<>>(output, input, weight); +} + +}}}} diff --git a/src/WeightedLookupTableHost.cpp b/src/WeightedLookupTableHost.cpp new file mode 100644 index 0000000..84b8d08 --- /dev/null +++ b/src/WeightedLookupTableHost.cpp @@ -0,0 +1,58 @@ +/** + * Copyright 2015 Facebook + */ + +#include "cuda/DeviceTensor.cuh" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" +#include "THC.h" + +#include +#include +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +namespace detail { +void launchWeightedLookupTableScaleByWeightKernel( + cudaStream_t stream, + DeviceTensor& output, + DeviceTensor& input, + DeviceTensor& weight); +} + +namespace { + +int scaleByWeight(lua_State* L) { + THCState* state = getCutorchState(L); + auto output = (THCudaTensor*)luaT_checkudata(L, 1, "torch.CudaTensor"); + const auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); + const auto weight = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); + + DeviceTensor cudaOutput = torchToDeviceTensor(state, output); + DeviceTensor cudaInput = torchToDeviceTensor(state, input); + DeviceTensor cudaWeight = torchToDeviceTensor(state, weight); + + detail::launchWeightedLookupTableScaleByWeightKernel( + THCState_getCurrentStream(state), + cudaOutput, cudaInput, cudaWeight); + + return 0; +} + +const luaL_Reg functions[] = { + {"WeightedLookupTable_scaleByWeight", scaleByWeight}, + {nullptr, nullptr}, +}; + +} // namespace + +void initWeightedLookupTableCuda(lua_State* L) { + luaT_pushmetatable(L, "torch.CudaTensor"); + luaT_registeratname(L, functions, "nn"); + lua_pop(L, 1); +} + +}}} // namespaces diff --git a/src/fft/CuFFTConvolution.cpp b/src/fft/CuFFTConvolution.cpp index be77a5a..5e670d7 100644 --- a/src/fft/CuFFTConvolution.cpp +++ b/src/fft/CuFFTConvolution.cpp @@ -1,17 +1,17 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
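
The launch geometry in launchWeightedLookupTableScaleByWeightKernel above reads as: a 32x4 block assigns one warp per row, each thread covers VT = 4 columns, so one block covers a 4-row by 128-column tile. A small host-side sketch of the same arithmetic, handy for checking boundary handling when sizes are not multiples of 4 or 128 (ceilDiv stands in for cuda::ceil; the sizes are hypothetical):

    #include <cstdio>

    static int ceilDiv(int a, int b) { return (a + b - 1) / b; }

    int main() {
      const int rows = 10, cols = 300;   // example table slice
      const int VT = 4;                  // columns per thread, as in the kernel
      const int blockX = 32, blockY = 4; // one warp per row, 4 rows per block

      const int gridX = ceilDiv(rows, blockY);       // row tiles
      const int gridY = ceilDiv(cols, blockX * VT);  // 128-column tiles

      std::printf("grid = %d x %d covers %d rows x %d cols (actual %d x %d)\n",
                  gridX, gridY, gridX * blockY, gridY * blockX * VT, rows, cols);
      return 0;
    }
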
-#include "CuFFTConvolution.cuh" +#include "src/fft/CuFFTConvolution.cuh" #include "THCTensor.h" #include "cuda/DeviceTensor.cuh" -#include "CuBLASWrapper.h" -#include "DeviceTensorUtils.h" -#include "MM.h" -#include "CuFFTStrategy.h" -#include "CuFFTWrapper.cuh" -#include "FBFFTHost.h" -#include "Utils.cuh" -#include "Utils.h" +#include "src/CuBLASWrapper.h" +#include "src/DeviceTensorUtils.h" +#include "src/MM.h" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/FBFFTHost.h" +#include "src/fft/Utils.cuh" +#include "src/fft/Utils.h" #include #include @@ -799,13 +799,13 @@ void CuFFTConvolution::CuFFTConvolutionMxM() { // A, B and C nomenclature is relative to cuBLAS' column-major. // In row-major fbmm, this is reversed. if (convPass_.pass == ConvolutionPass::kUpdateOutput) { - transposeMM<5, false, true>( + transposeMM<5, false, true, false>( BComplex_, AComplex_, CComplex_, norm_.x, getStream(0)); } else if (convPass_.pass == ConvolutionPass::kUpdateGradInput) { - transposeMM<5, false, false>( + transposeMM<5, false, false, false>( BComplex_, AComplex_, CComplex_, norm_.x, getStream(0)); } else if (convPass_.pass == ConvolutionPass::kAccGradParameters) { - transposeMM<5, true, false>( + transposeMM<5, true, false, false>( BComplex_, AComplex_, CComplex_, norm_.x, getStream(0)); } else { throw std::runtime_error("Invalid pass for CuFFTConvolution"); @@ -902,7 +902,7 @@ void CuFFTConvolution::run() { if (!strategy_->fbmm()) { // Transpose A_ (? ? y x) -> (y x ? ?) (row-major formulation) - transposeAsComplex(AComplex_, AComplexT_, 2, handle0, s0); + transposeAsComplex(AComplex_, AComplexT_, 2, true, handle0, s0); } auto handle1 = getCircular(cublasHandles_, 1); @@ -929,7 +929,7 @@ void CuFFTConvolution::run() { if (!strategy_->fbmm()) { // Transpose A_ (? ? y x) -> (y x ? ?) (row-major formulation) - transposeAsComplex(BComplex_, BComplexT_, 2, handle1, s1); + transposeAsComplex(BComplex_, BComplexT_, 2, true, handle1, s1); // Here, both CComplex_ and CComplexT_ contain garbage that we will // overwrite and that we preemptively size to (y x ? ?).. @@ -959,7 +959,7 @@ void CuFFTConvolution::run() { auto handle = getCircular(cublasHandles_, 0); // Transpose followed by IFFT in same stream s0 as the MxM // Transpose input (y x ? ?) -> (? ? y x) (row-major formulation) - transposeAsComplex(CComplexT_, CComplex_, 2, handle, s); + transposeAsComplex(CComplexT_, CComplex_, 2, true, handle, s); } if (strategy_->cufft()) { fft2d<2>(C_, CComplex_, FFTParameters().inverse().normalize(false), diff --git a/src/fft/CuFFTConvolution_AccGradParameters.cu b/src/fft/CuFFTConvolution_AccGradParameters.cu index ecb6216..6b5af65 100644 --- a/src/fft/CuFFTConvolution_AccGradParameters.cu +++ b/src/fft/CuFFTConvolution_AccGradParameters.cu @@ -1,16 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "CuFFTConvolution_AccGradParameters.cuh" +#include "src/fft/CuFFTConvolution_AccGradParameters.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "CuBLASWrapper.h" -#include "ConvolutionBias.cuh" -#include "CuFFTWrapper.cuh" -#include "CuFFTConvolution.cuh" -#include "Utils.cuh" +#include "src/CuBLASWrapper.h" +#include "src/ConvolutionBias.cuh" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/Utils.cuh" #include #include diff --git a/src/fft/CuFFTConvolution_UpdateGradInput.cu b/src/fft/CuFFTConvolution_UpdateGradInput.cu index b768bcc..fd0560f 100644 --- a/src/fft/CuFFTConvolution_UpdateGradInput.cu +++ b/src/fft/CuFFTConvolution_UpdateGradInput.cu @@ -1,15 +1,15 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "CuFFTConvolution_UpdateGradInput.cuh" +#include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "CuBLASWrapper.h" -#include "CuFFTWrapper.cuh" -#include "CuFFTConvolution.cuh" -#include "Utils.cuh" +#include "src/CuBLASWrapper.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/Utils.cuh" #include #include diff --git a/src/fft/CuFFTConvolution_UpdateOutput.cu b/src/fft/CuFFTConvolution_UpdateOutput.cu index 15c82fa..ae29e44 100644 --- a/src/fft/CuFFTConvolution_UpdateOutput.cu +++ b/src/fft/CuFFTConvolution_UpdateOutput.cu @@ -1,16 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "ConvolutionBias.cuh" -#include "CuBLASWrapper.h" -#include "CuFFTWrapper.cuh" -#include "CuFFTConvolution.cuh" -#include "Utils.cuh" +#include "src/ConvolutionBias.cuh" +#include "src/CuBLASWrapper.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/Utils.cuh" #include #include diff --git a/src/fft/CuFFTStrategy.h b/src/fft/CuFFTStrategy.h index 075eb25..263ee6e 100644 --- a/src/fft/CuFFTStrategy.h +++ b/src/fft/CuFFTStrategy.h @@ -2,8 +2,8 @@ #pragma once -#include "CuFFTConvolution.cuh" -#include "CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/CuFFTWrapper.cuh" #include #include diff --git a/src/fft/CuFFTWrapper.cu b/src/fft/CuFFTWrapper.cu index a55d247..5709483 100644 --- a/src/fft/CuFFTWrapper.cu +++ b/src/fft/CuFFTWrapper.cu @@ -1,14 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "CuFFTWrapper.cuh" +#include "src/fft/CuFFTWrapper.cuh" -#include "cuda/DeviceTensor.cuh" #include "THCTensor.h" +#include "cuda/DeviceTensor.cuh" +#include "src/DeviceTensorUtils.h" #include #include #include #include +#include #include #include #include @@ -17,7 +19,7 @@ #include #include #include -#include +#include DEFINE_bool(fft_verbose, false, "Dump meta information for the FFT wrapper"); @@ -173,7 +175,7 @@ cufftHandle makeCuFFTPlan(const DeviceTensor& real, batchSize); } if (errFFT != CUFFT_SUCCESS) { - throw std::bad_alloc(); + THError("Could not allocate cufft plan properly!"); } return plan; @@ -202,7 +204,7 @@ void fft(DeviceTensor& real, real.template dataAs(), cplx.template dataAs()); if (errFFT != CUFFT_SUCCESS) { - throw std::bad_alloc(); + THError("Error running forward FFT!"); } DCHECK_EQ(errFFT, CUFFT_SUCCESS); } else { @@ -210,7 +212,7 @@ void fft(DeviceTensor& real, cplx.template dataAs(), real.template dataAs()); if (errFFT != CUFFT_SUCCESS) { - throw std::bad_alloc(); + THError("Error running inverse FFT!"); } DCHECK_EQ(errFFT, CUFFT_SUCCESS); @@ -223,7 +225,8 @@ void fft(DeviceTensor& real, DCHECK_LT(0, size) << "Negative size not supported !"; float val = 1 / (float)size; thrust::device_ptr res(real.data()); - thrust::transform(res, + thrust::transform(thrust::cuda::par.on(stream), + res, res + real.getSize(0) * real.getStride(0), res, CudaScaleFunctor(val)); @@ -317,4 +320,38 @@ template void fft<2, 5>(DeviceTensor& real, cufftHandle* plan, cudaStream_t stream); +#define INSTANTIATE_CUFFT_PLAN(BATCH_DIMS, REAL_TENSOR_DIM) \ + if (BATCH_DIMS == batchDimensions && \ + REAL_TENSOR_DIM == THCudaTensor_nDimension(state, realTH)) { \ + DeviceTensor real = \ + torchToDeviceTensor(state, realTH); \ + DeviceTensor cplx = \ + torchToDeviceTensor(state, cplxTH); \ + return makeCuFFTPlan(real, cplx, params); \ + } + +extern "C" +cufftHandle makeCuFFTPlanFFI(THCState* state, + THCudaTensor* realTH, + THCudaTensor* cplxTH, + bool direction, + bool normalize, + int fftVersion, + int batchDimensions) +{ + FFTParameters params = FFTParameters().normalize(normalize); + if (direction) params = params.forward(); + else params = params.inverse(); + if (fftVersion == 0) params = params.withCufft(); + else params = params.withFbfft(); + + // 1 and 2D plans atm with 1 or 2 batch dimensions + INSTANTIATE_CUFFT_PLAN(1, 2); + INSTANTIATE_CUFFT_PLAN(1, 3); + INSTANTIATE_CUFFT_PLAN(2, 3); + INSTANTIATE_CUFFT_PLAN(2, 4); + + return (cufftHandle)-1; +} + } } } // namespace diff --git a/src/fft/CuFFTWrapper.cuh b/src/fft/CuFFTWrapper.cuh index a6d069a..26a6699 100644 --- a/src/fft/CuFFTWrapper.cuh +++ b/src/fft/CuFFTWrapper.cuh @@ -2,8 +2,8 @@ #pragma once #include "cuda/DeviceTensor.cuh" -#include "cuda/fbfft/FBFFT.h" -#include "Utils.cuh" +#include "cuda/fbfft/FBFFT.cuh" +#include "src/fft/Utils.cuh" #include @@ -16,11 +16,12 @@ class FFTParameters { // Normalization occurs only in inverse FFT (by 1 / (M.N)) since CuFFT does // unnormalized FFTs by default FFTParameters() : - version(cufft), direction_(true), normalize_(true) {} + version(cufft), direction_(true), normalize_(true), padLeft_(0), padUp_(0) + {} operator facebook::cuda::fbfft::FBFFTParameters() const { facebook::cuda::fbfft::FBFFTParameters res; - res = res.normalize(normalize_); + res = res.normalize(normalize_).withPadLeft(padLeft_).withPadUp(padUp_); return (direction_) ? 
res.forward() : res.inverse(); } @@ -49,11 +50,23 @@ class FFTParameters { return *this; } + FFTParameters& withPadLeft(int p) { + padLeft_ = p; + return *this; + } + + FFTParameters& withPadUp(int p) { + padUp_ = p; + return *this; + } + bool forwardFFT() const { return direction_; } bool inverseFFT() const { return !direction_; } bool normalizeFFT() const { return normalize_; } bool cuFFT() const { return version == cufft; } bool fbFFT() const { return version == fbfft; } + int padLeft() const { return padLeft_; } + int padUp() const { return padUp_; } template std::vector makeComplexTensorSizes( @@ -99,6 +112,8 @@ class FFTParameters { private: bool direction_; bool normalize_; + int padLeft_; + int padUp_; }; template diff --git a/src/fft/FBFFTDevice.cu b/src/fft/FBFFTDevice.cu index 6a89eeb..a980d4d 100644 --- a/src/fft/FBFFTDevice.cu +++ b/src/fft/FBFFTDevice.cu @@ -1,8 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#pragma once - -#include "cuda/fbfft/FBFFT.h" +#include "cuda/fbfft/FBFFT.cuh" #include "cuda/fbfft/FBFFTCommon.cuh" namespace facebook { namespace cuda { namespace fbfft { @@ -11,12 +9,15 @@ template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft1D<1>( DeviceTensor& real, DeviceTensor& complex, + const int padL, cudaStream_t s); template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2D<1>( DeviceTensor& real, DeviceTensor& complex, + const int padL, + const int padU, cudaStream_t s); template @@ -29,6 +30,7 @@ template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft1D<1>( DeviceTensor& real, DeviceTensor& complex, + const int padL, cudaStream_t s); template @@ -41,6 +43,8 @@ template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft2D<1>( DeviceTensor& srcComplex, DeviceTensor& realDst, + const int padL, + const int padU, cudaStream_t s); }}} diff --git a/src/fft/FBFFTHost.cpp b/src/fft/FBFFTHost.cpp index b8ac98f..85c6f8d 100644 --- a/src/fft/FBFFTHost.cpp +++ b/src/fft/FBFFTHost.cpp @@ -4,9 +4,9 @@ #include "cuda/Complex.cuh" #include "cuda/ComputeCapabilities.cuh" #include "cuda/DeviceTensor.cuh" -#include "cuda/fbfft/FBFFT.h" -#include "CuFFTWrapper.cuh" -#include "DeviceTensorUtils.h" +#include "cuda/fbfft/FBFFT.cuh" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/DeviceTensorUtils.h" #include #include @@ -23,9 +23,9 @@ FBFFTParameters::ErrorCode fbfft1dHost( FBFFTParameters params, cudaStream_t s) { if (params.forwardFFT()) { - return fbfft1D(real, complexAsFloat, s); + return fbfft1D(real, complexAsFloat, params.padLeft(), s); } else { - return fbifft1D(real, complexAsFloat, s); + return fbifft1D(real, complexAsFloat, params.padLeft(), s); } } @@ -83,7 +83,8 @@ FBFFTParameters::ErrorCode fbfft2dHost( FBFFTParameters::ErrorCode res; if (params.forwardFFT()) { - res = fbfft2D(real, bufferAsFloatTr, s); + res = fbfft2D( + real, bufferAsFloatTr, params.padLeft(), params.padUp(), s); } else { assert(real.getSize(0) == bufferAsFloat->getSize(0)); assert(complex.getSize(1) == @@ -99,13 +100,15 @@ FBFFTParameters::ErrorCode fbfft2dHost( if (params.forwardFFT()) { return fbfft2D(bufferTr, complex, s); } else { - return fbifft2D(buffer, real, s); + return fbifft2D(buffer, real, params.padLeft(), params.padUp(), s); } } else { if (params.forwardFFT()) { - return fbfft2D(real, complexAsFloat, s); + return fbfft2D( + real, complexAsFloat, params.padLeft(), params.padUp(), s); } else { - return fbifft2D(complex, real, s); + return fbifft2D( + complex, real, params.padLeft(), params.padUp(), s); } } @@ -154,12 +157,13 @@ 
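
The padLeft/padUp setters added to FFTParameters above keep the fluent, chainable style the wrapper already uses; a usage sketch (pad and direction values are illustrative only):

    // Build an inverse, unnormalized FBFFT configuration with implicit padding.
    FFTParameters p = FFTParameters()    // forward + normalize by default
                          .inverse()
                          .normalize(false)
                          .withFbfft()
                          .withPadLeft(2)
                          .withPadUp(2);
    // Converting p to FBFFTParameters carries the padding along, as the
    // operator FBFFTParameters() conversion above shows.
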
FBFFTParameters::ErrorCode fbfft(THCState* state, THCudaTensor* r, THCudaTensor* c, THCudaTensor* b, - FBFFTParameters params, - cudaStream_t s) { + FBFFTParameters params) { if (THCudaTensor_nDimension(state, r) - Batch == 1) { - return fbfft1dHost(state, r, c, params, s); + return fbfft1dHost( + state, r, c, params, THCState_getCurrentStream(state)); } else if (THCudaTensor_nDimension(state, r) - Batch == 2) { - return fbfft2dHost(state, r, c, b, params, s); + return fbfft2dHost( + state, r, c, b, params, THCState_getCurrentStream(state)); } return FBFFTParameters::UnsupportedDimension; } @@ -169,15 +173,13 @@ fbfft<1>(THCState* state, THCudaTensor* real, THCudaTensor* complex, THCudaTensor* buffer, - FBFFTParameters params, - cudaStream_t s); + FBFFTParameters params); template FBFFTParameters::ErrorCode fbfft<2>(THCState* state, THCudaTensor* real, THCudaTensor* complex, THCudaTensor* buffer, - FBFFTParameters params, - cudaStream_t s); + FBFFTParameters params); } } } // namespace diff --git a/src/fft/FBFFTHost.h b/src/fft/FBFFTHost.h index f055209..aa72862 100644 --- a/src/fft/FBFFTHost.h +++ b/src/fft/FBFFTHost.h @@ -33,7 +33,6 @@ facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft( THCudaTensor* complex, THCudaTensor* buffer = nullptr, facebook::cuda::fbfft::FBFFTParameters params = - facebook::cuda::fbfft::FBFFTParameters(), - cudaStream_t s = 0); + facebook::cuda::fbfft::FBFFTParameters()); } } } // namespace diff --git a/src/fft/FFTIteratedConvolution.cu b/src/fft/FFTIteratedConvolution.cu new file mode 100644 index 0000000..0523133 --- /dev/null +++ b/src/fft/FFTIteratedConvolution.cu @@ -0,0 +1,98 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "src/DeviceTensorUtils.h" +#include "THCTensor.h" + +#include "cuda/DeviceTensor.cuh" +#include "cuda/fbfft/FFTIteratedConvolution.cuh" + +#include +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +typedef struct { + THCudaTensor* tensor; + int padL; + int padU; +} TiledDeviceTensorFFI; + +#define LOG_TARGET LOG(INFO) + +#define INSTANTIATE_ITERATED_CONVOLUTION(DIM, FFT_SIZE) \ + if (THCudaTensor_nDimension(state, weight) == DIM && \ + fftSize == FFT_SIZE) { \ + thrust::host_vector > \ + tiledInputs; \ + thrust::host_vector > \ + tiledOutputs; \ + for (int i = 0; i < numTiles; ++i) { \ + DeviceTensor ti( \ + torchToDeviceTensor(state, input[i].tensor)); \ + fbfft::detail::TiledDeviceTensor inp( \ + ti, \ + input[i].padL, \ + input[i].padU); \ + /* TODO: emplace_back */ \ + tiledInputs.push_back(inp); \ + \ + DeviceTensor to( \ + torchToDeviceTensor(state, output[i].tensor)); \ + fbfft::detail::TiledDeviceTensor out( \ + to, \ + output[i].padL, \ + output[i].padU); \ + /* TODO: emplace_back */ \ + tiledOutputs.push_back(out); \ + } \ + \ + thrust::device_vector > \ + ins = tiledInputs; \ + thrust::device_vector > \ + outs = tiledOutputs; \ + \ + DeviceTensor wei( \ + torchToDeviceTensor(state, weight)); \ + bool res = \ + fbfft::detail::FFTIteratedConvolution( \ + thrust::raw_pointer_cast(&ins[0]), \ + thrust::raw_pointer_cast(&outs[0]), \ + wei, \ + pass, \ + scale, \ + batchSize, \ + ins.size(), \ + THCState_getCurrentStream(state)); \ + if (!res) { THError("Error in iterated convolution"); } \ + } + +extern "C" void convolveIteratedFFI(THCState* state, + TiledDeviceTensorFFI* input, + THCudaTensor* weight, + TiledDeviceTensorFFI* output, + int numTiles, + int fftSize, + fbfft::detail::FFTConvolutionPassFFI pass, + float scale) { + // TODO: 
accGrad all on same stream, updateOutput / updateGradInput async + int batchSize = THCudaTensor_size(state, input[0].tensor, 0); + + //////////////////////////////////////////////////////// + // FFT of size 32 + //////////////////////////////////////////////////////// + INSTANTIATE_ITERATED_CONVOLUTION(4, 32); + + //////////////////////////////////////////////////////// + // FFT of size 16 + //////////////////////////////////////////////////////// + INSTANTIATE_ITERATED_CONVOLUTION(4, 16); + + //////////////////////////////////////////////////////// + // FFT of size 8 + //////////////////////////////////////////////////////// + INSTANTIATE_ITERATED_CONVOLUTION(4, 8); +} + +}}} diff --git a/src/fft/FFTWrapperLua.cpp b/src/fft/FFTWrapperLua.cpp index f4ec016..6bea2b4 100644 --- a/src/fft/FFTWrapperLua.cpp +++ b/src/fft/FFTWrapperLua.cpp @@ -4,13 +4,12 @@ #include "THC.h" #include "THCTensor.h" -#include "cuda/fbfft/FBFFT.h" -#include "Utils.h" -#include "../Utils.h" -#include "CuFFTWrapper.cuh" -#include "FBFFTHost.h" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" +#include "cuda/fbfft/FBFFT.cuh" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/Utils.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/FBFFTHost.h" +#include "src/DeviceTensorUtils.h" #include #include @@ -23,7 +22,6 @@ using namespace facebook::cuda; using namespace facebook::cuda::fbfft; -using namespace facebook::CUDAUtil; using namespace std; namespace facebook { namespace deeplearning { namespace torch { @@ -61,8 +59,8 @@ float timedRun(THCState* state, state, timeTHTensor, frequencyTHTensor, bufferTHTensor, (FBFFTParameters)p); if (result != FBFFTParameters::Success) { - throw std::invalid_argument(folly::format("FBFFT error: {}", - (int)result).str().c_str()); + THCudaCheck(cudaGetLastError()); + THError(folly::format("FBFFT error: {}", (int)result).str().c_str()); } auto timeMS = timer.stop(); return timeMS; @@ -70,49 +68,38 @@ float timedRun(THCState* state, return 0.0f; } -#define FFT_BATCH(BATCH) \ - case BATCH: \ - { \ - switch(dims) { \ - case 2: \ - time += timedRun(state, \ - timeTHTensor, \ - frequencyTHTensor, \ - bufferTHTensor, \ - p, \ - fftPlan); \ - break; \ - case 3: \ - time += timedRun(state, \ - timeTHTensor, \ - frequencyTHTensor, \ - bufferTHTensor, \ - p, \ - fftPlan); \ - break; \ - default: \ - throw invalid_argument("Unsupported dims + batchDims"); \ - } \ - } \ - break; - - - -int fftFun(lua_State* L, bool forward) { - THCState* state = getCutorchState(L); - bool dumpTimings = false; +#define TIMED_FFT(BATCH, DIM) \ + if (batchDims == BATCH && dims == DIM) { \ + time += timedRun(state, \ + timeTHTensor, \ + frequencyTHTensor, \ + bufferTHTensor, \ + p, \ + fftPlan); \ + done = true; \ + } +int runTimedFFT(lua_State* L, bool forward) { + THCState* state = getCutorchState(L); auto batchDims = luaT_getfieldcheckint(L, 1, "batchDims"); - auto cufft = luaT_getfieldcheckint(L, 1, "cufft"); + auto cufft = luaT_getfieldcheckboolean(L, 1, "cufft"); + auto padLeft = luaT_getfieldcheckint(L, 1, "padLeft"); + auto padUp = luaT_getfieldcheckint(L, 1, "padUp"); auto timeTHTensor = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); auto frequencyTHTensor = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); auto bufferTHTensor = (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); + if (THCudaTensor_nDimension(state, bufferTHTensor) == 0) { + bufferTHTensor = nullptr; + THAssert(THCudaTensor_checkGPU(state, 2, timeTHTensor, frequencyTHTensor)); + } else { + 
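
The INSTANTIATE_ITERATED_CONVOLUTION macro in FFTIteratedConvolution.cu above follows the common thrust pattern of building descriptor structs on the host and shipping them to the device in one copy. A stripped-down sketch of that pattern with a generic POD descriptor (MyDesc and useDescs are illustrative names, not part of the patch):

    #include <cstdio>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>

    struct MyDesc {     // POD descriptor, analogous to TiledDeviceTensor
      float* data;
      int padL;
      int padU;
    };

    __global__ void useDescs(const MyDesc* descs, int n) {
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        printf("first pad: %d %d of %d descriptors\n",
               descs[0].padL, descs[0].padU, n);
      }
    }

    void launch(const thrust::host_vector<MyDesc>& hostDescs, cudaStream_t stream) {
      // Assignment performs the host -> device copy.
      thrust::device_vector<MyDesc> deviceDescs = hostDescs;
      useDescs<<<1, 32, 0, stream>>>(
          thrust::raw_pointer_cast(deviceDescs.data()),
          (int)deviceDescs.size());
      // Keep deviceDescs alive until the kernel has finished reading it.
      cudaStreamSynchronize(stream);
    }
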
THAssert(THCudaTensor_checkGPU(state, 3, timeTHTensor, frequencyTHTensor, + bufferTHTensor)); + } + auto fftPlan = (cufftHandle)lua_tonumber(L, 5); - THAssert(THCudaTensor_checkGPU(state, 3, timeTHTensor, frequencyTHTensor, - bufferTHTensor)); CHECK_EQ(THCudaTensor_nDimension(state, timeTHTensor) + 1, THCudaTensor_nDimension(state, frequencyTHTensor)); @@ -124,27 +111,23 @@ int fftFun(lua_State* L, bool forward) { if (!forward) { p = p.inverse().normalize(false); } - if (cufft == 1) { + if (cufft) { p = p.withCufft(); } else { p = p.withFbfft(); } + p.withPadLeft(padLeft); + p.withPadUp(padUp); try { - cufftHandle fftPlan = -1; - SCOPE_EXIT{ - if (fftPlan >= 0) { - CHECK_EQ(CUFFT_SUCCESS, cufftDestroy(fftPlan)); - } - }; - for (int i = 0; i < kNumTrials; ++i) { - switch (batchDims) { - FFT_BATCH(1); - default: - throw invalid_argument("Unsupported batch dims"); - }; - + auto done = false; + TIMED_FFT(1, 2); + TIMED_FFT(1, 3); + if (!done) { + THCudaCheck(cudaGetLastError()); + THError("Timed FFT: Unsupported batch dims"); + } // Reset time to kNumTrials if (i < kNumSkipTrials && kNumTrials > kNumSkipTrials) { time = 0.0f; @@ -176,7 +159,7 @@ int fftFun(lua_State* L, bool forward) { auto version = (p.cuFFT()) ? "CuFFT" : "FBFFT"; auto direction = (forward) ? "forward" : "inverse"; auto GOut = size / 1e9; - LOG_IF(INFO, dumpTimings) << folly::format( + LOG(INFO) << folly::format( " Running fft-{}d ({}) direction={} ({}x{}x{})," \ " {} batches, GNlogN/s = {:.5f}" \ " time = {:.2f}ms", @@ -193,17 +176,116 @@ int fftFun(lua_State* L, bool forward) { return 0; } -int fftFun(lua_State* L) { - return fftFun(L, true); +#define FBFFT_CASE(BATCH_DIMS, INPUT_DIMS) \ + if (batchDims == BATCH_DIMS && inputDims == INPUT_DIMS) { \ + auto result = fbfft(state, \ + timeTHTensor, \ + frequencyTHTensor, \ + bufferTHTensor, \ + (FBFFTParameters)p); \ + if (result != FBFFTParameters::Success) { \ + THCudaCheck(cudaGetLastError()); \ + THError( \ + folly::format("FBFFT error: {}", \ + (int)result).str().c_str()); \ + } \ + done = true; \ + } + +#define CUFFT_CASE(BATCH_DIMS, INPUT_DIMS) \ + if (batchDims == BATCH_DIMS && inputDims == INPUT_DIMS) { \ + auto timeTensor = \ + torchToDeviceTensor(state, timeTHTensor); \ + auto frequencyTensor = \ + torchToDeviceTensor(state, frequencyTHTensor); \ + if (fftPlan < 0) { \ + localPlan = makeCuFFTPlan( \ + timeTensor, frequencyTensor, p); \ + } \ + fft(timeTensor, frequencyTensor, p, &localPlan);\ + done = true; \ + } + +int runFFT(lua_State* L, bool forward) { + THCState* state = getCutorchState(L); + auto batchDims = luaT_getfieldcheckint(L, 1, "batchDims"); + auto cufft = luaT_getfieldcheckboolean(L, 1, "cufft"); + auto padLeft = luaT_getfieldcheckint(L, 1, "padLeft"); + auto padUp = luaT_getfieldcheckint(L, 1, "padUp"); + auto timeTHTensor = + (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); + auto frequencyTHTensor = + (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); + auto bufferTHTensor = + (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); + if (THCudaTensor_nDimension(state, bufferTHTensor) == 0) { + bufferTHTensor = nullptr; + THAssert(THCudaTensor_checkGPU(state, 2, timeTHTensor, frequencyTHTensor)); + } else { + THAssert(THCudaTensor_checkGPU(state, 3, timeTHTensor, frequencyTHTensor, + bufferTHTensor)); + } + auto fftPlan = (cufftHandle)lua_tonumber(L, 5); + + CHECK_EQ(THCudaTensor_nDimension(state, timeTHTensor) + 1, + THCudaTensor_nDimension(state, frequencyTHTensor)); + + int inputDims = THCudaTensor_nDimension(state, 
timeTHTensor); + FFTParameters p; // forward and normalize are default + if (!forward) { + p = p.inverse().normalize(false); + } + if (!cufft) { + p = p.withFbfft(); + } else { + p = p.withCufft(); + } + p.withPadLeft(padLeft); + p.withPadUp(padUp); + + try { + auto done = false; + if (!cufft) { + FBFFT_CASE(1, 2); + FBFFT_CASE(1, 3); + FBFFT_CASE(2, 3); + FBFFT_CASE(2, 4); + if (!done) { THError("Unsupported fbfft batch dims"); } + } else { + cufftHandle localPlan = fftPlan; + SCOPE_EXIT { + if (fftPlan < 0) { + cufftDestroy(localPlan); + } + }; + CUFFT_CASE(1, 2); + CUFFT_CASE(1, 3); + CUFFT_CASE(2, 3); + CUFFT_CASE(2, 4); + if (!done) { THError("Unsupported cufft batch dims"); } + } + } catch(exception &e){ + return luaL_error(L, e.what()); + } + + return 0; +} + +int fft(lua_State* L) { + auto timed = luaT_getfieldcheckboolean(L, 1, "timed"); + if (timed) { return runTimedFFT(L, true); } + return runFFT(L, true); } -int fftiFun(lua_State* L) { - return fftFun(L, false); +int ffti(lua_State* L) { + auto timed = luaT_getfieldcheckboolean(L, 1, "timed"); + if (timed) { return runTimedFFT(L, false); } + return runFFT(L, false); } const luaL_Reg functions[] = { - {"FFTWrapper_fft", fftFun}, - {"FFTWrapper_ffti", fftiFun}, + {"FFTWrapper_fft", fft}, + {"FFTWrapper_ffti", ffti}, {nullptr, nullptr}, }; diff --git a/src/fft/SpatialConvolutionCuFFT.cpp b/src/fft/SpatialConvolutionCuFFT.cpp index 4cba88e..6cd7a20 100644 --- a/src/fft/SpatialConvolutionCuFFT.cpp +++ b/src/fft/SpatialConvolutionCuFFT.cpp @@ -1,15 +1,15 @@ // Copyright 2014 Facebook #include "THCTensor.h" -#include "DeviceTensorUtils.h" -#include "CuFFTConvolution.cuh" -#include "CuFFTConvolution_UpdateOutput.cuh" -#include "CuFFTConvolution_AccGradParameters.cuh" -#include "CuFFTConvolution_UpdateGradInput.cuh" -#include "CuFFTStrategy.h" -#include "CuFFTWrapper.cuh" -#include "Utils.h" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/CuFFTConvolution_AccGradParameters.cuh" +#include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/Utils.h" #include #include @@ -18,7 +18,7 @@ #include using namespace std; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; using namespace facebook::deeplearning::torch; namespace facebook { namespace deeplearning { namespace torch { @@ -247,7 +247,7 @@ template class CuFFTBuffers { torchToDeviceTensor(state, realTH), torchToDeviceTensor(state, complexTH), params); - auto h = folly::make_unique(p); + auto h = std::unique_ptr(new cufftPlan(p)); cufftPlanMap_.emplace(key, std::move(h)); return cufftPlanMap_[key].get(); } @@ -459,27 +459,27 @@ void updateOutputTH(THCState* state, auto inputCPtr = MAKE_INPUT_BUFFER(p.buffers.input); auto inputC = inputCPtr.get(); - DCHECK_EQ(p.buffers.input->storage, inputC->storage); + CHECK_EQ(p.buffers.input->storage, inputC->storage); auto outputCPtr = MAKE_OUTPUT_BUFFER(p.buffers.output); auto outputC = outputCPtr.get(); - DCHECK_EQ(p.buffers.output->storage, outputC->storage); + CHECK_EQ(p.buffers.output->storage, outputC->storage); auto weightCPtr = MAKE_WEIGHT_BUFFER(p.buffers.weight); auto weightC = weightCPtr.get(); - DCHECK_EQ(p.buffers.weight->storage, weightC->storage); + CHECK_EQ(p.buffers.weight->storage, weightC->storage); auto inputCTrPtr = 
MAKE_INPUT_BUFFER(p.buffers.inputTranspose); auto inputCTr = inputCTrPtr.get(); - DCHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); + CHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); auto outputCTrPtr = MAKE_OUTPUT_BUFFER(p.buffers.outputTranspose); auto outputCTr = outputCTrPtr.get(); - DCHECK_EQ(p.buffers.outputTranspose->storage, outputCTr->storage); + CHECK_EQ(p.buffers.outputTranspose->storage, outputCTr->storage); auto weightCTrPtr = MAKE_WEIGHT_BUFFER(p.buffers.weightTranspose); auto weightCTr = weightCTrPtr.get(); - DCHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); + CHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); // Plans auto planInput = (s.fbfft()) ? @@ -548,7 +548,7 @@ void updateOutputTH(THCState* state, } // Actual run - CuFFTConvolution conv(ConvolutionPass(ConvolutionPass::kUpdateOutput)); + CuFFTConvolution conv( (ConvolutionPass(ConvolutionPass::kUpdateOutput)) ); conv.withInputAndBuffers( state, inputR, inputC, inputCTr, inputCBuffer, planInput) .withFiltersAndBuffers( @@ -645,27 +645,27 @@ void updateGradInputTH(THCState* state, auto gradInputCPtr = MAKE_INPUT_BUFFER(p.buffers.input); auto gradInputC = gradInputCPtr.get(); - DCHECK_EQ(p.buffers.input->storage, gradInputC->storage); + CHECK_EQ(p.buffers.input->storage, gradInputC->storage); auto gradOutputCPtr = MAKE_OUTPUT_BUFFER(p.buffers.output); auto gradOutputC = gradOutputCPtr.get(); - DCHECK_EQ(p.buffers.output->storage, gradOutputC->storage); + CHECK_EQ(p.buffers.output->storage, gradOutputC->storage); auto weightCPtr = MAKE_WEIGHT_BUFFER(p.buffers.weight); auto weightC = weightCPtr.get(); - DCHECK_EQ(p.buffers.weight->storage, weightC->storage); + CHECK_EQ(p.buffers.weight->storage, weightC->storage); auto gradInputCTrPtr = MAKE_INPUT_BUFFER(p.buffers.inputTranspose); auto gradInputCTr = gradInputCTrPtr.get(); - DCHECK_EQ(p.buffers.inputTranspose->storage, gradInputCTr->storage); + CHECK_EQ(p.buffers.inputTranspose->storage, gradInputCTr->storage); auto gradOutputCTrPtr = MAKE_OUTPUT_BUFFER(p.buffers.outputTranspose); auto gradOutputCTr = gradOutputCTrPtr.get(); - DCHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); + CHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); auto weightCTrPtr = MAKE_WEIGHT_BUFFER(p.buffers.weightTranspose); auto weightCTr = weightCTrPtr.get(); - DCHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); + CHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); auto gradInputCBuffer = (s.fbfft()) ? 
buffers.buffer(state, @@ -711,7 +711,7 @@ void updateGradInputTH(THCState* state, auto handles = buffers.handles(); // Actual run - CuFFTConvolution conv(ConvolutionPass(ConvolutionPass::kUpdateGradInput)); + CuFFTConvolution conv( (ConvolutionPass(ConvolutionPass::kUpdateGradInput)) ); conv.withInputAndBuffers( state, giTmp, gradInputC, gradInputCTr, gradInputCBuffer, planInput) @@ -807,27 +807,27 @@ void accGradParametersTH(THCState* state, auto inputCPtr = MAKE_INPUT_BUFFER(p.buffers.input); auto inputC = inputCPtr.get(); - DCHECK_EQ(p.buffers.input->storage, inputC->storage); + CHECK_EQ(p.buffers.input->storage, inputC->storage); auto gradOutputCPtr = MAKE_OUTPUT_BUFFER(p.buffers.output); auto gradOutputC = gradOutputCPtr.get(); - DCHECK_EQ(p.buffers.output->storage, gradOutputC->storage); + CHECK_EQ(p.buffers.output->storage, gradOutputC->storage); auto gradWeightCPtr = MAKE_WEIGHT_BUFFER(p.buffers.weight); auto gradWeightC = gradWeightCPtr.get(); - DCHECK_EQ(p.buffers.weight->storage, gradWeightC->storage); + CHECK_EQ(p.buffers.weight->storage, gradWeightC->storage); auto inputCTrPtr = MAKE_INPUT_BUFFER(p.buffers.inputTranspose); auto inputCTr = inputCTrPtr.get(); - DCHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); + CHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); auto gradOutputCTrPtr = MAKE_OUTPUT_BUFFER(p.buffers.outputTranspose); auto gradOutputCTr = gradOutputCTrPtr.get(); - DCHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); + CHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); auto gradWeightCTrPtr = MAKE_WEIGHT_BUFFER(p.buffers.weightTranspose); auto gradWeightCTr = gradWeightCTrPtr.get(); - DCHECK_EQ(p.buffers.weightTranspose->storage, gradWeightCTr->storage); + CHECK_EQ(p.buffers.weightTranspose->storage, gradWeightCTr->storage); auto inputCBuffer = (s.fbfft()) ? 
buffers.buffer(state, @@ -873,7 +873,7 @@ void accGradParametersTH(THCState* state, buffers.plan(state, gradOutputR, gradOutputC, FFTParameters().forward(), 1); auto handles = buffers.handles(); - CuFFTConvolution conv(ConvolutionPass(ConvolutionPass::kAccGradParameters)); + CuFFTConvolution conv((ConvolutionPass(ConvolutionPass::kAccGradParameters))); conv.withInputAndBuffers( state, inputR, inputC, inputCTr, inputCBuffer, planInput) .withFiltersAndBuffers( diff --git a/src/fft/SpatialConvolutionCuFFT.h b/src/fft/SpatialConvolutionCuFFT.h index a0a6095..2f9c49c 100644 --- a/src/fft/SpatialConvolutionCuFFT.h +++ b/src/fft/SpatialConvolutionCuFFT.h @@ -2,7 +2,7 @@ #pragma once -#include "CuFFTStrategy.h" +#include "src/fft/CuFFTStrategy.h" namespace facebook { namespace deeplearning { namespace torch { namespace detail { diff --git a/src/fft/SpatialConvolutionCuFFTHost.cpp b/src/fft/SpatialConvolutionCuFFTHost.cpp index a0d684e..bd220f6 100644 --- a/src/fft/SpatialConvolutionCuFFTHost.cpp +++ b/src/fft/SpatialConvolutionCuFFTHost.cpp @@ -1,11 +1,10 @@ // Copyright 2014 Facebook -#include "Utils.h" -#include "../Utils.h" -#include "CudaTensorUtils.h" -#include "CuFFTStrategy.h" -#include "SpatialConvolutionCuFFT.h" -#include "SpatialConvolutionCuFFTTuner.h" +#include "src/Utils.h" +#include "src/CudaTensorUtils.h" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/SpatialConvolutionCuFFT.h" +#include "src/fft/SpatialConvolutionCuFFTTuner.h" #include #include @@ -66,7 +65,7 @@ int updateOutputLua(lua_State* L) { bufs.weight, bufs.weightTranspose)); THParams thp(state, input, weight, output, bias, 0.0f, bufs); - ConvolutionPass pass(ConvolutionPass(ConvolutionPass::kUpdateOutput)); + ConvolutionPass pass( (ConvolutionPass(ConvolutionPass::kUpdateOutput)) ); ProblemSizes pbs(thp, pass); auto strategy = SpatialConvolutionCuFFTTuner::getBestPerformance(state, pbs); @@ -119,7 +118,7 @@ int updateGradInputLua(lua_State* L) { bufs.weight, bufs.weightTranspose)); THParams thp(state, gradInput, weight, gradOutput, nullptr, 0.0f, bufs); - ConvolutionPass pass(ConvolutionPass(ConvolutionPass::kUpdateGradInput)); + ConvolutionPass pass( (ConvolutionPass(ConvolutionPass::kUpdateGradInput)) ); ProblemSizes pbs(thp, pass); auto strategy = SpatialConvolutionCuFFTTuner::getBestPerformance(state, pbs); @@ -177,7 +176,7 @@ int accGradParametersLua(lua_State* L) { bufs.weight, bufs.weightTranspose)); THParams thp(state, input, gradWeight, gradOutput, gradBias, scale, bufs); - ConvolutionPass pass(ConvolutionPass(ConvolutionPass::kAccGradParameters)); + ConvolutionPass pass(ConvolutionPass::kAccGradParameters); ProblemSizes pbs(thp, pass); auto strategy = SpatialConvolutionCuFFTTuner::getBestPerformance(state, pbs); diff --git a/src/fft/SpatialConvolutionCuFFTTuner.cpp b/src/fft/SpatialConvolutionCuFFTTuner.cpp index 9a42433..fe42073 100644 --- a/src/fft/SpatialConvolutionCuFFTTuner.cpp +++ b/src/fft/SpatialConvolutionCuFFTTuner.cpp @@ -1,11 +1,11 @@ // Copyright 2014 Facebook -#include "SpatialConvolutionCuFFTTuner.h" +#include "src/fft/SpatialConvolutionCuFFTTuner.h" #include "cuda/KernelTimer.h" #include "THC.h" -#include "CuFFTStrategy.h" -#include "SpatialConvolutionCuFFT.h" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/SpatialConvolutionCuFFT.h" #include #include diff --git a/src/fft/SpatialConvolutionCuFFTTuner.h b/src/fft/SpatialConvolutionCuFFTTuner.h index cb6b494..953fff0 100644 --- a/src/fft/SpatialConvolutionCuFFTTuner.h +++ b/src/fft/SpatialConvolutionCuFFTTuner.h @@ -2,7 +2,7 @@ 
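
The extra parentheses added around the ConvolutionPass temporaries above (and the simplified ConvolutionPass pass(ConvolutionPass::kAccGradParameters) call) appear to guard against C++'s declaration-versus-expression ambiguity, commonly known as the most vexing parse. A minimal illustration of the classic case, with made-up types:

    #include <iostream>

    struct Pass { explicit Pass(int k) : kind(k) {} int kind; };
    struct Conv { explicit Conv(Pass p) : pass(p) {} Pass pass; };

    int main() {
      // Conv c(Pass());        // parses as a *function declaration*, not an object
      Conv c1((Pass(1)));       // extra parentheses force an expression, as in the patch
      Conv c2{Pass(2)};         // C++11 brace initialization also avoids the ambiguity
      std::cout << c1.pass.kind << " " << c2.pass.kind << "\n";
      return 0;
    }
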
#pragma once -#include "CuFFTStrategy.h" +#include "src/fft/CuFFTStrategy.h" #include struct THCState; diff --git a/src/fft/Utils-inl.h b/src/fft/Utils-inl.h index 8d3ef7d..d2c20c8 100644 --- a/src/fft/Utils-inl.h +++ b/src/fft/Utils-inl.h @@ -2,7 +2,7 @@ #pragma once -#include "Utils.cuh" +#include "src/fft/Utils.cuh" #include "THC.h" namespace facebook { namespace deeplearning { namespace torch { @@ -21,8 +21,8 @@ makeCuFFTTensorReal( THCState* state, THCudaTensor* in, const std::vector& commonDims, - THCudaTensor* candidateCudaStorageReal = nullptr, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + THCudaTensor* candidateCudaStorageReal, + FFTOutputSpecification inPlace) { DCHECK_EQ(FFTDim, commonDims.size()); DCHECK_EQ(4, THCudaTensor_nDimension(state, in)); DCHECK_LE(1, FFTDim); @@ -139,8 +139,8 @@ makeCuFFTTensorComplex( THCState* state, THCudaTensor* real, const std::vector& commonDims, - THCudaTensor* candidateCudaStorageComplex = nullptr, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + THCudaTensor* candidateCudaStorageComplex, + FFTOutputSpecification inPlace) { DCHECK_EQ(4, THCudaTensor_nDimension(state, real)); DCHECK_LE(1, FFTDim); DCHECK_GE(3, FFTDim); @@ -199,7 +199,7 @@ std::unique_ptr makeCuFFTTensorComplex( THCState* state, const std::vector& allDims, - THCudaTensor* candidateCudaStorageComplex = nullptr) { + THCudaTensor* candidateCudaStorageComplex) { DCHECK_EQ(4, allDims.size()); DCHECK_LE(1, FFTDim); DCHECK_GE(3, FFTDim); @@ -240,7 +240,7 @@ makeCuFFTTensors( THCState* state, THCudaTensor* in, const std::vector& commonDims, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + FFTOutputSpecification inPlace) { auto p1 = makeCuFFTTensorReal( state, in, commonDims, nullptr, inPlace); @@ -257,7 +257,7 @@ makeCuFFTTensors( THCState* state, thpp::Tensor& in, const std::vector& commonDims, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + FFTOutputSpecification inPlace) { auto th = copyToCuda(state, in); auto res = makeCuFFTTensors(state, th.get(), commonDims, inPlace); return make_pair(std::move(res.first), std::move(res.second)); diff --git a/src/fft/Utils.h b/src/fft/Utils.h index 8ea2c58..9e18ab0 100644 --- a/src/fft/Utils.h +++ b/src/fft/Utils.h @@ -4,7 +4,7 @@ #include "thpp/Tensor.h" #include "THCTensor.h" -#include "CudaTensorUtils.h" +#include "src/CudaTensorUtils.h" #include #include diff --git a/src/util/AsyncCopier.cpp b/src/util/AsyncCopier.cpp index 9a25a31..28f9f89 100644 --- a/src/util/AsyncCopier.cpp +++ b/src/util/AsyncCopier.cpp @@ -3,13 +3,14 @@ * @author Tudor Bosman (tudorb@fb.com) */ -#include "util/AsyncCopier.h" -#include "util/Misc.h" +#include "src/util/AsyncCopier.h" +#include "src/util/Misc.h" + #include #include #include -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { uint8_t* allocPageLocked(size_t size) { void* ptr; diff --git a/src/util/AsyncCopier.h b/src/util/AsyncCopier.h index 513864a..2bc4abf 100644 --- a/src/util/AsyncCopier.h +++ b/src/util/AsyncCopier.h @@ -15,7 +15,7 @@ #include #include -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { class AsyncCopier { public: diff --git a/src/util/GlobalAsyncCopier.cpp b/src/util/GlobalAsyncCopier.cpp index d25c80c..d1c56c3 100644 --- a/src/util/GlobalAsyncCopier.cpp +++ b/src/util/GlobalAsyncCopier.cpp @@ -3,15 +3,15 @@ * @author Tudor Bosman (tudorb@fb.com) */ -#include "util/GlobalAsyncCopier.h" +#include 
"src/util/GlobalAsyncCopier.h" #include #include #include -#include "util/AsyncCopier.h" +#include "src/util/AsyncCopier.h" -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; constexpr size_t kDefaultBufferSizeMB = 16; const char* const kBufferSizeEnvVar = "FB_CUDA_ASYNC_COPIER_BUFFER_SIZE_MB"; diff --git a/src/util/Misc.cpp b/src/util/Misc.cpp index 9503eeb..f1c35ec 100644 --- a/src/util/Misc.cpp +++ b/src/util/Misc.cpp @@ -1,6 +1,7 @@ // Copyright 2004-, Facebook, Inc. All Rights Reserved. -#include "util/Misc.h" +#include "src/util/Misc.h" + #include #include #include @@ -8,21 +9,7 @@ using namespace std; -namespace facebook { namespace CUDAUtil { - -int getDevice() { - int dev; - checkCudaError(cudaGetDevice(&dev)); - return dev; -} - -// Streams. We have an implicit model that async memory copies -// with send semantics happen on a dedicated, conventional stream -// per-device. The stream runs on the destination. -namespace { -mutex mtx; -unordered_map deviceToCopyStream; -} +namespace facebook { namespace cuda { cudaStream_t getComputeStream() { // It would be nice to compute on non-default streams from time to time, @@ -30,20 +17,7 @@ cudaStream_t getComputeStream() { return 0; } -cudaStream_t getCopyStream() { - unique_lock own(mutex); - auto dev = getDevice(); - auto row = deviceToCopyStream.find(dev); - if (row == deviceToCopyStream.end()) { - cudaStream_t& stream = deviceToCopyStream[dev]; - checkCudaError(cudaStreamCreate(&stream)); - return stream; - } - return row->second; -} - -__attribute__((__noreturn__)) -void throwCudaError(cudaError_t error, const char* msg) { +[[noreturn]] void throwCudaError(cudaError_t error, const char* msg) { auto string = msg ? folly::sformat("{}: CUDA error {} ({})", msg, int(error), cudaGetErrorString(error)) : @@ -52,42 +26,4 @@ void throwCudaError(cudaError_t error, const char* msg) { throw std::runtime_error(string); } -namespace { - -struct DeviceProperties { - DeviceProperties(); - int deviceCount = 0; - std::unique_ptr deviceProperties; -}; - -DeviceProperties::DeviceProperties() { - auto err = cudaGetDeviceCount(&deviceCount); - if (err == cudaErrorNoDevice) { - deviceCount = 0; - } else { - checkCudaError(err, "cudaGetDeviceCount"); - } - - deviceProperties = folly::make_unique(deviceCount); - for (int i = 0; i < deviceCount; ++i) { - checkCudaError(cudaGetDeviceProperties(&deviceProperties[i], i), - "cudaGetDeviceProperties"); - } -} - -} // namespace - -const cudaDeviceProp& getCurrentDeviceProperties() { - int device = 0; - checkCudaError(cudaGetDevice(&device), "cudaGetDevice"); - - return getDeviceProperties(device); -} - -const cudaDeviceProp& getDeviceProperties(int device) { - static DeviceProperties dprop; - DCHECK(device >= 0 && device < dprop.deviceCount); - return dprop.deviceProperties[device]; -} - } } diff --git a/src/util/Misc.h b/src/util/Misc.h index b6b9940..a662a21 100644 --- a/src/util/Misc.h +++ b/src/util/Misc.h @@ -2,14 +2,13 @@ #pragma once -#include +#include "cuda/util/CachedDeviceProperties.h" -namespace facebook { namespace CUDAUtil { +#include -int getDevice(); +namespace facebook { namespace cuda { -extern __attribute__((__noreturn__)) -void throwCudaError(cudaError_t, const char* msg); +[[noreturn]] void throwCudaError(cudaError_t, const char* msg); inline void checkCudaError(cudaError_t error, const char* msg = 0) { @@ -18,7 +17,6 @@ checkCudaError(cudaError_t error, const char* msg = 0) { } } - class OnDevice { int m_home; public: @@ -31,10 +29,6 @@ class OnDevice { } }; -const 
cudaDeviceProp& getCurrentDeviceProperties(); -const cudaDeviceProp& getDeviceProperties(int device); - cudaStream_t getComputeStream(); -cudaStream_t getCopyStream(); } } diff --git a/src/util/Transform.cu b/src/util/Transform.cu index 416ae2a..5db91b8 100644 --- a/src/util/Transform.cu +++ b/src/util/Transform.cu @@ -3,9 +3,9 @@ #include #include -#include "util/Transform.cuh" +#include "src/util/Transform.cuh" -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { template __global__ static void diff --git a/src/util/Transform.cuh b/src/util/Transform.cuh index a412e81..baeeaab 100644 --- a/src/util/Transform.cuh +++ b/src/util/Transform.cuh @@ -6,7 +6,7 @@ #include #include -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { /* * A generic interface for dense point-to-point operations. diff --git a/test/BiasTest.cpp b/test/BiasTest.cpp index df29948..89d8327 100644 --- a/test/BiasTest.cpp +++ b/test/BiasTest.cpp @@ -3,8 +3,8 @@ #include "TestUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" -#include "torch/fb/fbcunn/src/ConvolutionBias.cuh" +#include "src/DeviceTensorUtils.h" +#include "src/ConvolutionBias.cuh" #include #include @@ -14,6 +14,20 @@ using namespace std; using namespace facebook::deeplearning::torch; using namespace thpp; +unique_ptr g_state; + +// Override gtest_main to initialize a THCState +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; +} + namespace facebook { namespace deeplearning { namespace torch { namespace bias { constexpr int kRuns = 10; @@ -177,17 +191,17 @@ void testOneAccGradParameters(long batchSize, auto expectedResult = referenceBiasAccGradParameters(output, gradBias, biasScale); - auto outputCuda = copyToCuda(nullptr, output); - auto gradBiasCuda = copyToCuda(nullptr, gradBias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto gradBiasCuda = copyToCuda(g_state.get(), gradBias); for (int i = 0; i < nRep; i++) { - accGradParametersBias(nullptr, + accGradParametersBias(g_state.get(), outputCuda.get(), gradBiasCuda.get(), biasScale); } - auto result = copyFromCuda(nullptr, gradBiasCuda.get()); + auto result = copyFromCuda(g_state.get(), gradBiasCuda.get()); // Due to order of reductions, our implementation is a little off auto comparison = test::compareTensors(expectedResult, result, 5e-4f); @@ -213,17 +227,17 @@ void testOneAccGradParametersTemporal(long batchSize, auto expectedResult = referenceBiasAccGradParametersTemporal(output, gradBias, biasScale); - auto outputCuda = copyToCuda(nullptr, output); - auto gradBiasCuda = copyToCuda(nullptr, gradBias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto gradBiasCuda = copyToCuda(g_state.get(), gradBias); for (int i = 0; i < nRep; i++) { - accGradParametersTemporalBias(nullptr, + accGradParametersTemporalBias(g_state.get(), outputCuda.get(), gradBiasCuda.get(), biasScale); } - auto result = copyFromCuda(nullptr, gradBiasCuda.get()); + auto result = copyFromCuda(g_state.get(), gradBiasCuda.get()); auto comparison = test::compareTensors(expectedResult, result, 5e-4f); EXPECT_TRUE(comparison.first) << comparison.second; @@ -255,12 +269,12 @@ TEST(BiasTest, testUpdateOutput) { auto bias = makeBiasTensor(numPlanes); auto expectedResult = referenceBiasUpdateOutput(output, bias); - 
auto outputCuda = copyToCuda(nullptr, output); - auto biasCuda = copyToCuda(nullptr, bias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto biasCuda = copyToCuda(g_state.get(), bias); - updateOutputBias(nullptr, outputCuda.get(), biasCuda.get()); + updateOutputBias(g_state.get(), outputCuda.get(), biasCuda.get()); - auto result = copyFromCuda(nullptr, outputCuda.get()); + auto result = copyFromCuda(g_state.get(), outputCuda.get()); auto comparison = test::compareTensors(expectedResult, result); EXPECT_TRUE(comparison.first) << comparison.second; @@ -288,12 +302,12 @@ TEST(BiasTest, testUpdateOutputTemporal) { Tensor transposedOutput; auto expectedResult = referenceBiasUpdateOutputTemporal(output, bias); - auto outputCuda = copyToCuda(nullptr, output); - auto biasCuda = copyToCuda(nullptr, bias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto biasCuda = copyToCuda(g_state.get(), bias); - updateOutputTemporalBias(nullptr, outputCuda.get(), biasCuda.get()); + updateOutputTemporalBias(g_state.get(), outputCuda.get(), biasCuda.get()); - auto result = copyFromCuda(nullptr, outputCuda.get()); + auto result = copyFromCuda(g_state.get(), outputCuda.get()); auto comparison = test::compareTensors(expectedResult, result); EXPECT_TRUE(comparison.first) << comparison.second; diff --git a/test/ConvolutionTest.cpp b/test/ConvolutionTest.cpp index eefa449..6fef76f 100644 --- a/test/ConvolutionTest.cpp +++ b/test/ConvolutionTest.cpp @@ -1,14 +1,14 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/fft/Utils.h" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_AccGradParameters.cuh" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_UpdateGradInput.cuh" -#include "torch/fb/fbcunn/test/InputCentricConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/test/ReferenceConvolutions.h" -#include "torch/fb/fbcunn/test/TestUtils.h" +#include "src/fft/Utils.h" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/CuFFTConvolution_AccGradParameters.cuh" +#include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" +#include "test/InputCentricConvolution_UpdateOutput.cuh" +#include "test/ReferenceConvolutions.h" +#include "test/TestUtils.h" #include #include @@ -22,11 +22,18 @@ using namespace facebook::deeplearning::torch; DEFINE_bool(verify, true, "Run the convolution and verify the output"); DEFINE_bool(debug, false, "Print basic information on tensors"); +unique_ptr g_state; + // Override gtest_main so as to parse the --verify flag int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); google::ParseCommandLineFlags(&argc, &argv, true); - return RUN_ALL_TESTS(); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; } namespace facebook { namespace deeplearning { namespace torch { namespace test { @@ -466,17 +473,12 @@ class TorchTest : public ConvolutionModule { // Torch APIs don't take input/filters as const even though they // effectively are - auto inputTH = input.moveAsTH(); - auto filtersTH = filters.moveAsTH(); - auto outputTH = output.moveAsTH(); + auto inputTH = input.asTH(); + auto filtersTH = filters.asTH(); + auto outputTH = output.asTH(); THFloatTensor_conv2Dmm(outputTH, 1.0, 1.0, inputTH, filtersTH, filterRowStride, filterColStride, "V", 
"X"); - - // Rebind for evaluation and cleanup - output = std::move(outputTH); - input = std::move(inputTH); - filters = std::move(filtersTH); } void updateGradInput( @@ -488,20 +490,15 @@ class TorchTest : public ConvolutionModule { Tensor& input) override { ASSERT_FALSE(inputPadding); // padding not supported - auto inputTH = input.moveAsTH(); - auto outputTH = output.moveAsTH(); + auto inputTH = input.asTH(); + auto outputTH = output.asTH(); // Torch requires transposition of filters filters.transpose(0, 1); - auto filtersTH = filters.moveAsTH(); + auto filtersTH = filters.asTH(); THFloatTensor_conv2Dmm(inputTH, 0.0, 1.0, outputTH, filtersTH, filterRowStride, filterColStride, "F", "C"); - - // Rebind for evaluation and cleanup - input = std::move(inputTH); - output = std::move(outputTH); - filters = std::move(filtersTH); } void accGradParameters( @@ -514,18 +511,13 @@ class TorchTest : public ConvolutionModule { Tensor& filters) override { ASSERT_FALSE(inputPadding); // padding not supported - auto inputTH = input.moveAsTH(); - auto outputTH = output.moveAsTH(); - auto filtersTH = filters.moveAsTH(); + auto inputTH = input.asTH(); + auto outputTH = output.asTH(); + auto filtersTH = filters.asTH(); THFloatTensor_conv2DRevgerm(filtersTH, 1.0, scale, inputTH, outputTH, filterRowStride, filterColStride); - - // Rebind for evaluation and cleanup - input = std::move(inputTH); - output = std::move(outputTH); - filters = std::move(filtersTH); } }; @@ -588,8 +580,8 @@ class InputCentricTest : public ConvolutionModule { Tensor& output) override { ASSERT_FALSE(inputPadding); // padding not supported - auto inputCuda = copyToCuda(nullptr, input); - auto filtersCuda = copyToCuda(nullptr, filters); + auto inputCuda = copyToCuda(g_state.get(), input); + auto filtersCuda = copyToCuda(g_state.get(), filters); CHECK(layout == Layout::Relayout) << "Only Relayout mode is supported for this kernel atm"; @@ -609,13 +601,13 @@ class InputCentricTest : public ConvolutionModule { } } } - auto filtersCudaTmp = copyToCuda(nullptr, filtersTmp); + auto filtersCudaTmp = copyToCuda(g_state.get(), filtersTmp); // Relayout output, for instance for 32 x 96 x 71 x 71 we get const int filterRowSize = filters.size(2); const int ceilFilterSizeFilterStride = (filterRowSize + filterRowStride - 1) / filterRowStride; - auto outputCudaTmp = makeTHCudaTensorFull(nullptr, { + auto outputCudaTmp = makeTHCudaTensorFull(g_state.get(), { output.size(0), // 32 // 71 + 2 * ceilFilterSizeFilterStride // This expansion by 2 * ceilFilterSizeFilterStride allows us to @@ -627,7 +619,7 @@ class InputCentricTest : public ConvolutionModule { ); bool result = - InputCentricRelayoutConvolution_UpdateOutput(nullptr, + InputCentricRelayoutConvolution_UpdateOutput(g_state.get(), inputCuda.get(), filtersCudaTmp.get(), filterRowStride, @@ -637,7 +629,7 @@ class InputCentricTest : public ConvolutionModule { EXPECT_TRUE(result); // Recover actual output from layout - auto outputTmp = copyFromCuda(nullptr, outputCudaTmp.get()); + auto outputTmp = copyFromCuda(g_state.get(), outputCudaTmp.get()); for (long i = 0; i < output.size(0); ++i) { for (long j = 0; j < output.size(1); ++j) { for (long k = 0; k < output.size(2); ++k) { @@ -707,33 +699,34 @@ class CuFFT : public ConvolutionModule { std::max(filters.size(3), output.size(3))); std::vector maxSizes({maxRows, maxCols}); - auto realComplexPair = makeCuFFTTensors(nullptr, input, maxSizes); + auto realComplexPair = + makeCuFFTTensors(g_state.get(), input, maxSizes); auto inputTHCudaTensor = 
std::move(realComplexPair.first); auto inputComplexTHCudaTensor = std::move(realComplexPair.second); auto inputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, inputTHCudaTensor.get(), maxSizes); + g_state.get(), inputTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, filters, maxSizes); + makeCuFFTTensors(g_state.get(), filters, maxSizes); auto filtersTHCudaTensor = std::move(realComplexPair.first); auto filtersComplexTHCudaTensor = std::move(realComplexPair.second); auto filtersComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, filtersTHCudaTensor.get(), maxSizes); + g_state.get(), filtersTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, output, maxSizes); + makeCuFFTTensors(g_state.get(), output, maxSizes); auto outputTHCudaTensor = std::move(realComplexPair.first); auto outputComplexTHCudaTensor = std::move(realComplexPair.second); auto outputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, outputTHCudaTensor.get(), maxSizes); + g_state.get(), outputTHCudaTensor.get(), maxSizes); // We don't test the bias here - auto bias = Tensor{{output.size(0)}}; + auto bias = Tensor{output.size(0)}; bias.fill(0); - auto biasCuda = copyToCuda(nullptr, bias); + auto biasCuda = copyToCuda(g_state.get(), bias); if (impl_ == Implementation::Reference) { - CuFFTConvolution_ReferenceUpdateOutput(nullptr, + CuFFTConvolution_ReferenceUpdateOutput(g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -742,7 +735,7 @@ class CuFFT : public ConvolutionModule { filtersComplexTHCudaTensor.get(), outputComplexTHCudaTensor.get()); } else { - CuFFTConvolution_UpdateOutput(nullptr, + CuFFTConvolution_UpdateOutput(g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -757,13 +750,14 @@ class CuFFT : public ConvolutionModule { if (FLAGS_verify) { checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get())); + copyFromCuda(g_state.get(), inputTHCudaTensor.get())); checkExpectedInput(filters, - copyFromCuda(nullptr, filtersTHCudaTensor.get())); + copyFromCuda(g_state.get(), + filtersTHCudaTensor.get())); // Recover actual output from padded layout, output is smaller // than outputTmp when kernelSize > 1 - auto outputTmp = copyFromCuda(nullptr, outputTHCudaTensor.get()); + auto outputTmp = copyFromCuda(g_state.get(), outputTHCudaTensor.get()); for (long i = 0; i < output.size(0); ++i) { for (long j = 0; j < output.size(1); ++j) { for (long k = 0; k < output.size(2); ++k) { @@ -802,34 +796,34 @@ class CuFFT : public ConvolutionModule { std::vector maxSizes({maxRows, maxCols}); auto realComplexPair = - makeCuFFTTensors(nullptr, input, maxSizes); + makeCuFFTTensors(g_state.get(), input, maxSizes); auto inputTHCudaTensor = std::move(realComplexPair.first); auto inputComplexTHCudaTensor = std::move(realComplexPair.second); auto inputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, inputTHCudaTensor.get(), maxSizes); + g_state.get(), inputTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, filters, maxSizes); + makeCuFFTTensors(g_state.get(), filters, maxSizes); auto filtersTHCudaTensor = std::move(realComplexPair.first); auto filtersComplexTHCudaTensor = std::move(realComplexPair.second); auto filtersComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, filtersTHCudaTensor.get(), maxSizes); + g_state.get(), filtersTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, output, maxSizes); + 
makeCuFFTTensors(g_state.get(), output, maxSizes); auto outputTHCudaTensor = std::move(realComplexPair.first); auto outputComplexTHCudaTensor = std::move(realComplexPair.second); auto outputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, outputTHCudaTensor.get(), maxSizes); + g_state.get(), outputTHCudaTensor.get(), maxSizes); // We don't test the bias here - auto bias = Tensor{{filters.size(0)}}; + auto bias = Tensor{filters.size(0)}; bias.fill(0); - auto biasCuda = copyToCuda(nullptr, bias); + auto biasCuda = copyToCuda(g_state.get(), bias); if (impl_ == Implementation::Reference) { CuFFTConvolution_ReferenceAccGradParameters( - nullptr, + g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -839,7 +833,7 @@ class CuFFT : public ConvolutionModule { filtersComplexTHCudaTensor.get(), outputComplexTHCudaTensor.get()); } else { - CuFFTConvolution_AccGradParameters(nullptr, + CuFFTConvolution_AccGradParameters(g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -855,12 +849,12 @@ class CuFFT : public ConvolutionModule { if (FLAGS_verify) { checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get())); + copyFromCuda(g_state.get(), inputTHCudaTensor.get())); checkExpectedInput(output, - copyFromCuda(nullptr, outputTHCudaTensor.get())); + copyFromCuda(g_state.get(), outputTHCudaTensor.get())); // Recover actual filters from padded layout, filters is smaller // than filtersTmp when kernelSize > 1 - auto filtersTmp = copyFromCuda(nullptr, filtersTHCudaTensor.get()); + auto filtersTmp = copyFromCuda(g_state.get(), filtersTHCudaTensor.get()); for (long i = 0; i < filters.size(0); ++i) { for (long j = 0; j < filters.size(1); ++j) { for (long k = 0; k < filters.size(2); ++k) { @@ -896,30 +890,30 @@ class CuFFT : public ConvolutionModule { std::vector maxSizes({maxRows, maxCols}); auto realComplexPair = - makeCuFFTTensors(nullptr, input, maxSizes); + makeCuFFTTensors(g_state.get(), input, maxSizes); auto inputTHCudaTensor = std::move(realComplexPair.first); auto inputComplexTHCudaTensor = std::move(realComplexPair.second); auto inputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, inputTHCudaTensor.get(), maxSizes); + g_state.get(), inputTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, filters, maxSizes); + makeCuFFTTensors(g_state.get(), filters, maxSizes); auto filtersTHCudaTensor = std::move(realComplexPair.first); auto filtersComplexTHCudaTensor = std::move(realComplexPair.second); auto filtersComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, filtersTHCudaTensor.get(), maxSizes); + g_state.get(), filtersTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, output, maxSizes); + makeCuFFTTensors(g_state.get(), output, maxSizes); auto outputTHCudaTensor = std::move(realComplexPair.first); auto outputComplexTHCudaTensor = std::move(realComplexPair.second); auto outputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, outputTHCudaTensor.get(), maxSizes); + g_state.get(), outputTHCudaTensor.get(), maxSizes); if (impl_ == Implementation::Reference) { CuFFTConvolution_ReferenceUpdateGradInput( - nullptr, + g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -928,7 +922,7 @@ class CuFFT : public ConvolutionModule { outputComplexTHCudaTensor.get()); } else { CuFFTConvolution_UpdateGradInput( - nullptr, + g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), 
outputTHCudaTensor.get(), @@ -942,12 +936,13 @@ class CuFFT : public ConvolutionModule { if (FLAGS_verify) { checkExpectedInput(filters, - copyFromCuda(nullptr, filtersTHCudaTensor.get())); + copyFromCuda(g_state.get(), + filtersTHCudaTensor.get())); checkExpectedInput(output, - copyFromCuda(nullptr, outputTHCudaTensor.get())); + copyFromCuda(g_state.get(), outputTHCudaTensor.get())); // Recover actual filters from padded layout, filters is smaller // than filtersTmp when kernelSize > 1 - auto inputTmp = copyFromCuda(nullptr, inputTHCudaTensor.get()); + auto inputTmp = copyFromCuda(g_state.get(), inputTHCudaTensor.get()); for (long i = 0; i < input.size(0); ++i) { for (long j = 0; j < input.size(1); ++j) { for (long k = 0; k < input.size(2); ++k) { @@ -1376,7 +1371,7 @@ TEST(CudaConvolutionTest, CuFFT_updateGradInput_fixed) { CuFFT::checkExpectedInput( expectedInput, - copyFromCuda(nullptr, cufft.saveInputTHCudaTensor.get())); + copyFromCuda(g_state.get(), cufft.saveInputTHCudaTensor.get())); } } } } } // namespace diff --git a/test/CuBLASTest.cpp b/test/CuBLASTest.cpp index 3f5eefa..40a343b 100644 --- a/test/CuBLASTest.cpp +++ b/test/CuBLASTest.cpp @@ -1,9 +1,9 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/CuBLASWrapper.h" -#include "torch/fb/fbcunn/test/TestUtils.h" +#include "src/CuBLASWrapper.h" +#include "test/TestUtils.h" #include #include @@ -13,6 +13,20 @@ using namespace std; using namespace facebook::deeplearning::torch; using namespace thpp; +unique_ptr g_state; + +// Override gtest_main to initialize a THCState +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; +} + namespace facebook { namespace deeplearning { namespace torch { namespace test { template @@ -37,15 +51,15 @@ std::pair, bool asComplex = false) { CHECK_EQ(Dim, t.ndims()); CHECK_EQ(Dim, tt.ndims()); - auto tCuda = copyToCuda(nullptr, t); - auto ttCuda = copyToCuda(nullptr, tt); + auto tCuda = copyToCuda(g_state.get(), t); + auto ttCuda = copyToCuda(g_state.get(), tt); DeviceTensor tCudaTensor = - torchToDeviceTensor(nullptr, tCuda.get()); + torchToDeviceTensor(g_state.get(), tCuda.get()); DeviceTensor ttCudaTensor = - torchToDeviceTensor(nullptr, ttCuda.get()); + torchToDeviceTensor(g_state.get(), ttCuda.get()); transpose(tCudaTensor, ttCudaTensor, sep, asComplex); - tt = copyFromCuda(nullptr, ttCuda.get()); + tt = copyFromCuda(g_state.get(), ttCuda.get()); tt.resize(LongStorage(resizeTransposed)); return make_pair(std::move(tCuda), std::move(ttCuda)); } @@ -58,11 +72,13 @@ void unTransposeAndCheckOutOfPlace( int sep, initializer_list testSize, bool asComplex = false) { - auto ct = torchToDeviceTensor(nullptr, pCudaTensor.first.get()); - auto ctt = torchToDeviceTensor(nullptr, pCudaTensor.second.get()); + auto ct = + torchToDeviceTensor(g_state.get(), pCudaTensor.first.get()); + auto ctt = + torchToDeviceTensor(g_state.get(), pCudaTensor.second.get()); transpose(ct, ctt, Dim - sep, asComplex); - pTensor.second = copyFromCuda(nullptr, pCudaTensor.first.get()); + pTensor.second = copyFromCuda(g_state.get(), pCudaTensor.first.get()); pTensor.first.resize(LongStorage(testSize)); pTensor.second.resize(LongStorage(testSize)); diff --git 
a/test/CudaTensorTest.cpp b/test/CudaTensorTest.cpp index 3def370..10328cb 100644 --- a/test/CudaTensorTest.cpp +++ b/test/CudaTensorTest.cpp @@ -1,9 +1,9 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/src/CudaTensorUtils.h" +#include "src/CudaTensorUtils.h" #include "THC.h" -#include "torch/fb/fbcunn/test/CudaTensorTestKernels.cuh" -#include "folly/Optional.h" -#include "folly/ScopeGuard.h" +#include "test/CudaTensorTestKernels.cuh" +#include +#include #include #include @@ -11,6 +11,20 @@ using namespace std; +unique_ptr g_state; + +// Override gtest_main to initialize a THCState +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; +} + namespace facebook { namespace deeplearning { namespace torch { namespace { @@ -39,47 +53,51 @@ void verify3d(THCudaTensor* tensor) { } // unnamed namespace TEST(CudaTensor, testDimensionMismatch) { - EXPECT_THROW(testAssignment3d(nullptr, - makeTHCudaTensorFull(nullptr, {1, 2, 3, 4}).get()), + EXPECT_THROW(testAssignment3d( + g_state.get(), + makeTHCudaTensorFull(g_state.get(), {1, 2, 3, 4}).get()), invalid_argument); - EXPECT_THROW(testAssignment3d(nullptr, - makeTHCudaTensorFull(nullptr, {1}).get()), + EXPECT_THROW(testAssignment3d( + g_state.get(), + makeTHCudaTensorFull(g_state.get(), {1}).get()), invalid_argument); } TEST(CudaTensor, testWrite3d) { - auto tensor = makeTHCudaTensorFull(nullptr, {11, 7, 5}); + auto tensor = makeTHCudaTensorFull(g_state.get(), {11, 7, 5}); // Run our kernel - EXPECT_TRUE(testAssignment3d(nullptr, tensor.get())); + EXPECT_TRUE(testAssignment3d(g_state.get(), tensor.get())); verify3d(tensor.get()); } TEST(CudaTensor, testWrite3dNonTrivialStride) { - auto tensor = makeTHCudaTensorFull(nullptr, {11, 7, 5}, {200, 6, 1}); + auto tensor = makeTHCudaTensorFull(g_state.get(), {11, 7, 5}, {200, 6, 1}); // Run our kernel - EXPECT_TRUE(testAssignment3d(nullptr, tensor.get())); + EXPECT_TRUE(testAssignment3d(g_state.get(), tensor.get())); verify3d(tensor.get()); } TEST(CudaTensor, testWrite1d) { constexpr long kSize = 3; - auto storage = THCudaStorage_newWithSize(nullptr, kSize); - auto tensor = THCudaTensor_newWithStorage1d(nullptr, storage, 0, kSize, 1); - SCOPE_EXIT{ THCudaTensor_free(nullptr, tensor); }; + auto storage = + THCudaStorage_newWithSize(g_state.get(), kSize); + auto tensor = + THCudaTensor_newWithStorage1d(g_state.get(), storage, 0, kSize, 1); + SCOPE_EXIT{ THCudaTensor_free(g_state.get(), tensor); }; // Clear out tensor - THCudaTensor_fill(nullptr, tensor, 0.0f); + THCudaTensor_fill(g_state.get(), tensor, 0.0f); // Run our kernel - EXPECT_TRUE(testAssignment1d(nullptr, tensor)); + EXPECT_TRUE(testAssignment1d(g_state.get(), tensor)); // Verify output auto hostStorage = THFloatStorage_newWithSize(tensor->storage->size); SCOPE_EXIT{ THFloatStorage_free(hostStorage); }; - THFloatStorage_copyCuda(nullptr, hostStorage, storage); + THFloatStorage_copyCuda(g_state.get(), hostStorage, storage); for (int i = 0; i < tensor->size[0]; ++i) { EXPECT_EQ(i, hostStorage->data[i]); @@ -88,51 +106,58 @@ TEST(CudaTensor, testWrite1d) { TEST(CudaTensor, testUpcast) { // test with no padding - EXPECT_TRUE(testUpcast(nullptr, - makeTHCudaTensorFull(nullptr, {3, 2, 1}).get())); + EXPECT_TRUE(testUpcast(g_state.get(), + makeTHCudaTensorFull(g_state.get(), {3, 2, 1}).get())); // test with 
padding - EXPECT_TRUE(testUpcast(nullptr, - makeTHCudaTensorFull(nullptr, {4, 3, 2}, {150, 40, 15}).get())); + EXPECT_TRUE( + testUpcast(g_state.get(), + makeTHCudaTensorFull( + g_state.get(), {4, 3, 2}, {150, 40, 15}).get())); } TEST(CudaTensor, testDowncastIllegalPaddingThrows) { // 16 should be 12 for no padding - EXPECT_THROW(testDowncastTo2d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}, {16, 4, 1}).get()), + EXPECT_THROW(testDowncastTo2d( + g_state.get(), + makeTHCudaTensorFull( + g_state.get(), {2, 3, 4}, {16, 4, 1}).get()), invalid_argument); // 15/5 should be 12/3 for no padding - EXPECT_THROW(testDowncastTo1d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}, {15, 5, 1}).get()), + EXPECT_THROW(testDowncastTo1d( + g_state.get(), + makeTHCudaTensorFull( + g_state.get(), {2, 3, 4}, {15, 5, 1}).get()), invalid_argument); // But, the same should not cause a problem for 2d since the padding // is in the non-collapsed dimensions - EXPECT_NO_THROW(testDowncastTo2d(nullptr, + EXPECT_NO_THROW(testDowncastTo2d(g_state.get(), makeTHCudaTensorFull( - nullptr, {2, 3, 4}, {15, 5, 1}).get())); + g_state.get(), {2, 3, 4}, {15, 5, 1}).get())); } TEST(CudaTensor, testDowncast) { - EXPECT_TRUE(testDowncastTo2d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}).get())); + EXPECT_TRUE(testDowncastTo2d( + g_state.get(), + makeTHCudaTensorFull(g_state.get(), {2, 3, 4}).get())); // We can have padding in the innermost dimension - EXPECT_TRUE(testDowncastTo2d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}, + EXPECT_TRUE(testDowncastTo2d(g_state.get(), + makeTHCudaTensorFull(g_state.get(), {2, 3, 4}, {36, 12, 3}).get())); } TEST(CudaTensor, testDowncastWrites) { - auto tensor = makeTHCudaTensorFull(nullptr, {2, 3, 4}); - EXPECT_TRUE(testDowncastWrites(nullptr, tensor.get())); + auto tensor = makeTHCudaTensorFull(g_state.get(), {2, 3, 4}); + EXPECT_TRUE(testDowncastWrites(g_state.get(), tensor.get())); // Verify output auto hostStorage = THFloatStorage_newWithSize(tensor->storage->size); SCOPE_EXIT{ THFloatStorage_free(hostStorage); }; - THFloatStorage_copyCuda(nullptr, hostStorage, tensor->storage); + THFloatStorage_copyCuda(g_state.get(), hostStorage, tensor->storage); // In the downcast view, we should have overwritten all the values for (int k = 0; k < tensor->size[0]; ++k) { diff --git a/test/CudaTensorTestKernels.cu b/test/CudaTensorTestKernels.cu index 2b7e251..3ea9803 100644 --- a/test/CudaTensorTestKernels.cu +++ b/test/CudaTensorTestKernels.cu @@ -1,13 +1,12 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include "cuda/DeviceTensor.cuh" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" - -#include "torch/fb/fbcunn/src/util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; namespace facebook { namespace deeplearning { namespace torch { diff --git a/test/FFTTest.cpp b/test/FFTTest.cpp index 8934379..2bfa6be 100644 --- a/test/FFTTest.cpp +++ b/test/FFTTest.cpp @@ -1,11 +1,11 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/fft/CuFFTWrapper.cuh" -#include "torch/fb/fbcunn/test/InputCentricConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/test/ReferenceConvolutions.h" -#include "torch/fb/fbcunn/test/TestUtils.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "test/InputCentricConvolution_UpdateOutput.cuh" +#include "test/ReferenceConvolutions.h" +#include "test/TestUtils.h" #include @@ -18,11 +18,18 @@ using namespace facebook::deeplearning::torch; DEFINE_bool(verify, true, "Run the convolution and verify the output"); +unique_ptr g_state; + // Override gtest_main so as to parse the --verify flag int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); google::ParseCommandLineFlags(&argc, &argv, true); - return RUN_ALL_TESTS(); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; } namespace facebook { namespace deeplearning { namespace torch { namespace test { @@ -81,16 +88,16 @@ class FFTTestBase : public ::testing::Test { } auto realComplexPair = - makeCuFFTTensors(nullptr, input, FFTSize, cfg.inPlace); + makeCuFFTTensors(g_state.get(), input, FFTSize, cfg.inPlace); inputTHCudaTensor = std::move(realComplexPair.first); fftTHCudaTensor = std::move(realComplexPair.second); inputCudaTensor = torchToDeviceTensor( - nullptr, inputTHCudaTensor.get()); + g_state.get(), inputTHCudaTensor.get()); outputCudaTensor = torchToDeviceTensor( - nullptr, fftTHCudaTensor.get()); + g_state.get(), fftTHCudaTensor.get()); if (cfg.inPlace == FFTOutputSpecification::InPlace) { CHECK_EQ(inputCudaTensor.data(), outputCudaTensor.data()); @@ -220,7 +227,7 @@ TEST_F(FFT2DTest, test2x2ConstantInPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2ConstantOutOfPlace) { @@ -253,7 +260,7 @@ TEST_F(FFT2DTest, test2x2ConstantOutOfPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2VariableInPlace) { @@ -290,7 +297,7 @@ TEST_F(FFT2DTest, test2x2VariableInPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2VariableOutOfPlace) { @@ -328,7 +335,7 @@ TEST_F(FFT2DTest, test2x2VariableOutOfPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test1x2ConstantInPlacePadded) { @@ -365,7 +372,7 @@ TEST_F(FFT2DTest, test1x2ConstantInPlacePadded) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test1x2ConstantOutOfPlacePadded) { @@ -402,7 +409,7 @@ TEST_F(FFT2DTest, test1x2ConstantOutOfPlacePadded) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2ConstantInPlacePadded) { @@ 
-456,7 +463,7 @@ TEST_F(FFT2DTest, test2x2ConstantInPlacePadded) { // One element does not check at 1e-6f error checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get()), + copyFromCuda(g_state.get(), fftTHCudaTensor.get()), 5e-5f); } @@ -515,7 +522,7 @@ TEST_F(FFT2DTest, test2x2ConstantOutOfPlacePadded) { // One element does not check at 1e-6f error checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get()), + copyFromCuda(g_state.get(), fftTHCudaTensor.get()), 5e-5f); } @@ -552,7 +559,7 @@ TEST_F(FFT2DTest, testInverseOutOfPlace) { // First element does not check at 5e-5f error checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-4f); } @@ -588,7 +595,7 @@ TEST_F(FFT2DTest, testInverseInPlace) { // First element does not check at 1e-6f error checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-5f); } @@ -625,7 +632,7 @@ TEST_F(FFT2DTest, testInverseOutOfPlacePadded) { // First element does not check at 5e-5f error checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-4f); } @@ -677,7 +684,7 @@ TEST_F(FFT1DTest, test1x4VariableOutOfPlacePadded) { fft1d<3>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT1DTest, test1x4VariableInPlacePadded) { @@ -728,7 +735,7 @@ TEST_F(FFT1DTest, test1x4VariableInPlacePadded) { fft1d<3>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT1DTest, testInverseInPlace) { @@ -762,7 +769,7 @@ TEST_F(FFT1DTest, testInverseInPlace) { fft1d<3>(inputCudaTensor, outputCudaTensor, FFTParameters().inverse()); checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get())); + copyFromCuda(g_state.get(), inputTHCudaTensor.get())); } TEST_F(FFT1DTest, testInverseOutOfPlacePadded) { @@ -797,7 +804,7 @@ TEST_F(FFT1DTest, testInverseOutOfPlacePadded) { fft1d<3>(inputCudaTensor, outputCudaTensor, FFTParameters().inverse()); checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-5f); } diff --git a/test/InputCentricConvolution_UpdateOutput.cu b/test/InputCentricConvolution_UpdateOutput.cu index 6e3d173..5ea8439 100644 --- a/test/InputCentricConvolution_UpdateOutput.cu +++ b/test/InputCentricConvolution_UpdateOutput.cu @@ -1,16 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "torch/fb/fbcunn/test/InputCentricConvolution_UpdateOutput.cuh" +#include "test/InputCentricConvolution_UpdateOutput.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" -#include "torch/fb/fbcunn/src/util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" #include #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; namespace facebook { namespace deeplearning { namespace torch { namespace test { diff --git a/test/ReferenceConvolutions.cpp b/test/ReferenceConvolutions.cpp index 034c4e1..3e8bcb7 100644 --- a/test/ReferenceConvolutions.cpp +++ b/test/ReferenceConvolutions.cpp @@ -1,6 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/test/ReferenceConvolutions.h" +#include "test/ReferenceConvolutions.h" #include @@ -21,10 +21,10 @@ namespace facebook { namespace deeplearning { namespace torch { namespace test { // output * filter operates with a mask when going to the input. // // ------------------------------- -// | implied zeros |\ -// | _________________________ | \ -// | | | | \ convoled with -// | | real input | | \____ +// | implied zeros | +// | _________________________ | +// | | | | convoled with +// | | real input | | ____ // | | | | | | // | | | | * | | equals ==> // | | area | | ---- @@ -55,10 +55,10 @@ namespace facebook { namespace deeplearning { namespace torch { namespace test { // // total output area // ------------------------------- -// | affected by implied zeros |\ -// | _________________________ | \ convolved with -// | | | | \ -// | | output area not | | \____ +// | affected by implied zeros | +// | _________________________ | convolved with +// | | | | +// | | output area not | | ____ // | | affected by | | | | // | | implied zero | | * | | equals ==> // | | area; this data | | ---- diff --git a/test/ReferenceConvolutions.h b/test/ReferenceConvolutions.h index f38073c..eb046ec 100644 --- a/test/ReferenceConvolutions.h +++ b/test/ReferenceConvolutions.h @@ -1,7 +1,7 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
#pragma once -#include "torch/fb/fbcunn/src/Tensor.h" +#include "src/Tensor.h" #include #include diff --git a/test/TestUtils.cpp b/test/TestUtils.cpp index e2b3fc6..2657039 100644 --- a/test/TestUtils.cpp +++ b/test/TestUtils.cpp @@ -131,7 +131,7 @@ bool isWithin(float a, float b, float relativeError) { // Special case for a or b very close to zero, only absolute check can work - if (std::abs(a) < relativeError || std::abs(a) < relativeError || + if (std::abs(a) < relativeError || std::abs(b) < relativeError || !std::isnormal(a)|| !std::isnormal(b)) { if (std::abs(a - b) > adjRelativeError) { return false; diff --git a/test/TestUtils.h b/test/TestUtils.h index 1b51195..9224d5b 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -2,11 +2,11 @@ #pragma once #include "cuda/DeviceTensor.cuh" -#include "torch/fb/fbcunn/src/CudaTensorUtils.h" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/CudaTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/src/fft/Utils.h" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/Utils.h" #include #include diff --git a/test/test.lua b/test/test.lua index 50b6f99..656bdbd 100644 --- a/test/test.lua +++ b/test/test.lua @@ -117,6 +117,50 @@ function fbcunntest.TemporalMaxPoolingBatch() 1e-4, 'error on backward batch') end +function fbcunntest.testDoG() + + -- load image: + require 'image' + local input = image.scale(image.lena(), 16, 16, 'bilinear'):cuda() + local numChannels = input:size(1) + + -- construct module: + local nOctaves = 3 + local nScalesPerOctave = 4 + local module = nn.DifferenceOfGaussian( + numChannels, + nOctaves, + nScalesPerOctave + ):cuda() + + -- test forward pass: + local output = module:forward(input) + assert(type(output) == 'table') + assert(#output == nOctaves) + for n = 1,nOctaves do + assert(output[n]:size(1) == nScalesPerOctave * numChannels) + end + + -- repeat the forward tests in batch mode: + local batchSize = 8 + local batchInput = input.new( + batchSize, + input:size(1), + input:size(2), + input:size(3) + ) + for n = 1,batchSize do + batchInput[n]:copy(input):add(torch.randn(input:size()):cuda(), 0.05) + end + output = module:forward(batchInput) + assert(type(output) == 'table') + assert(#output == nOctaves) + for n = 1,nOctaves do + assert(output[n]:size(1) == batchSize) + assert(output[n]:size(2) == nScalesPerOctave * numChannels) + end +end + function fbcunntest.Optim() require 'cunn' local fboptim = require 'fboptim' diff --git a/test/test_BatchNormalization.lua b/test/test_BatchNormalization.lua new file mode 100644 index 0000000..adae5ec --- /dev/null +++ b/test/test_BatchNormalization.lua @@ -0,0 +1,227 @@ +require 'fb.luaunit' +require 'cunn' +require 'fbcunn' +require 'nn' +require 'fbnn' + +local precision = 1e-4 +local threshold = 5e-5 +local relaxedPrecision = 5 * 0.01668 +local numRuns = 10 +local benchmark = false +local debug = false +local silence = true +local seed = os.time() +print('Seed: ', seed) +math.randomseed(seed) +torch.manualSeed(seed) + +local function BNTest( + refmod, gpumod, input, gradOutput, debug, benchmark, indim) + + if debug then + input:fill(1) + gradOutput:fill(1) + input:copy(torch.linspace(1, input:nElement(), input:nElement())) + gradOutput:copy(torch.linspace(1, input:nElement(), input:nElement())) + end + + -- batch norm without affine transform + local function assertDiff(ref, actual, name) + local rel, abs = 
nn.utils.relErr(ref, actual) + if abs > threshold then + assert(rel <= precision, + name .. ' max diff ' .. ' absolute ' .. abs) + else + assert(rel <= relaxedPrecision, + name .. ' max diff ' .. ' absolute ' .. abs) + end + end + + local function uniformInit(t1, t2) + t1:uniform() + t2:copy(t1) + end + + for _, affine in ipairs({false, true}) do + for _, train in ipairs({false, true}) do + if not silence then + print('affine, train', affine, train) + end + local modRef = refmod(indim, 1e-5, 0.1, affine):cuda() + local modGPU = gpumod(indim, 1e-5, 0.1, affine) + modGPU.train, modRef.train = train, train + + -- Preconditions + if affine then + -- Uniform both for testing purposes + uniformInit(modRef.bias, modGPU.bias) + uniformInit(modRef.weight, modGPU.weight) + assertDiff(modRef.bias, modGPU.bias, 'bias') + assertDiff(modRef.weight, modGPU.weight, 'weight') + end + uniformInit(modRef.running_std, modGPU.running_std) + uniformInit(modRef.running_mean, modGPU.running_mean) + assertDiff(modRef.running_std, modGPU.running_std, 'running_std') + assertDiff(modRef.running_mean, modGPU.running_mean, 'running_mean') + + -- UpdateOutput + modGPU:updateOutput(input) + modRef:updateOutput(input) + + if debug then + print('Input', input:float()) + print('GradOutput', gradOutput:float()) + print('weight', modGPU.weight:float()) + print('bias', modGPU.bias:float()) + print('Expected running_mean', modRef.running_mean:float()) + print('Actual running_mean', modGPU.running_mean:float()) + print('Expected running_std', modRef.running_std:float()) + print('Actual running_std', modGPU.running_std:float()) + print('Expected output', modRef.output:float()) + print('Actual output', modGPU.output:float()) + if train then + print('Expected centered', modRef.centered:float()) + print('Actual centered', modGPU.centered:float()) + print('Expected std', modRef.std:float()) + print('Actual std', modGPU.std:float()) + print('Expected normalized', modRef.normalized:float()) + print('Actual normalized', modGPU.normalized:float()) + end + end + + -- Postconditions + assertDiff(modRef.running_mean, modGPU.running_mean, 'running_mean') + assertDiff(modRef.running_std, modGPU.running_std, 'running_std') + if train then + assertDiff(modRef.centered, modGPU.centered, 'centered') + assertDiff(modRef.std, modGPU.std, 'std') + assertDiff(modRef.normalized, modGPU.normalized, 'normalized') + end + assertDiff(modRef.output, modGPU.output, 'output') + + + + if train then + -- Preconditions + assertDiff(modRef.centered, modGPU.centered, 'centered') + assertDiff(modRef.std, modGPU.std, 'std') + if affine then + assertDiff(modRef.weight, modGPU.weight, 'std') + end + + -- UpdateGradInput + modGPU:updateGradInput(input, gradOutput) + modRef:updateGradInput(input, gradOutput) + + if debug then + print('Expected gradInput', modRef.gradInput:float()) + print('Actual gradInput', modGPU.gradInput:float()) + end + + -- Postconditions + assertDiff(modRef.gradInput, modGPU.gradInput, 'gradInput') + + if affine then + -- Preconditions + -- gradBias and gradWeight are unintialized, users usually + -- call zeroGradParameters first, emulate this + uniformInit(modRef.gradBias, modGPU.gradBias) + uniformInit(modRef.gradWeight, modGPU.gradWeight) + assertDiff(modRef.gradBias, modGPU.gradBias, 'gradBias') + assertDiff(modRef.gradWeight, modGPU.gradWeight, 'gradWeight') + assertDiff(modRef.normalized, modGPU.normalized, 'normalized') + + local scale = torch.random(1000) / 1000.0 + if debug then + local val = 0 + gradOutput:apply( + function() + val = 
val + 1 + return val + end + ) + scale = 1.0 + modRef.normalized:copy(modGPU.normalized) + print('Normalized', modRef.normalized:float()) + print('GradOutput', gradOutput:float()) + end + + -- AccGradParameters + modGPU:accGradParameters(input, gradOutput, scale) + modRef:accGradParameters(input, gradOutput, scale) + + if debug then + print('Expected gradWeight', modRef.gradWeight:float()) + print('Actual gradWeight', modGPU.gradWeight:float()) + print('Expected gradBias', modRef.gradBias:float()) + print('Actual gradBias', modGPU.gradBias:float()) + end + + -- Postconditions + assertDiff(modRef.gradBias, modGPU.gradBias, 'gradBias') + assertDiff(modRef.gradWeight, modGPU.gradWeight, 'gradWeight') + end + end + end + end +end + +function testSpatialBatchNormalization() + for i = 1, numRuns do + local nframes, indim, ini, inj = torch.random(1, 17), + torch.random(1, 19), + torch.random(1, 35), + torch.random(1, 35) + if benchmark then + nframes, indim, ini, inj = 128, 64, 112, 112 + end + if debug then + nframes, indim, ini, inj = 1, 1, 5, 7 + end + + local input = torch.zeros(nframes, indim, ini, inj):uniform():cuda() + local gradOutput = torch.zeros(nframes, indim, ini, inj):uniform():cuda() + + BNTest(nn.SpatialBatchNormalization, + fbnn.SpatialBatchNormalization, + input, + gradOutput, + debug, + benchmark, + indim) + end +end + +function testBatchNormalization() + for i = 1, numRuns do + local nframes, indim = torch.random(1, 17), torch.random(1, 19) + if benchmark then + nframes, indim = 128, 4096 + end + if debug then + nframes, indim = 5, 7 + end + + local input = torch.zeros(nframes, indim):uniform():cuda() + local gradOutput = torch.zeros(nframes, indim):uniform():cuda() + + BNTest(nn.BatchNormalization, + fbnn.BatchNormalization, + input, + gradOutput, + debug, + benchmark, + indim) + end +end + +--[[ + precision = 1e-6 + numRuns = 10 + benchmark = false + debug = false + silence = true +--]] + +LuaUnit:main() diff --git a/test/test_ClassHierarchicalNLLCriterion.lua b/test/test_ClassHierarchicalNLLCriterion.lua index f89c0a6..58fbb54 100644 --- a/test/test_ClassHierarchicalNLLCriterion.lua +++ b/test/test_ClassHierarchicalNLLCriterion.lua @@ -219,10 +219,10 @@ for _, x in pairs{{criterion.clusterMatrix, criterion.clusterMatrixDx}, ) end if basic then - assert(math.abs( + local err = math.abs( criterion.classMatrixDx[i][j] - - modelDefault.modules[2].gradWeight[i][j]) < - 1e-16) + modelDefault.modules[2].gradWeight[i][j]) + assert(err < 1e-14, "failed error check : " .. err .. ' < ' .. 1e-14) end end end diff --git a/test/benchmark_cublas.lua b/test/test_CuBLAS.lua similarity index 56% rename from test/benchmark_cublas.lua rename to test/test_CuBLAS.lua index ce4acd7..138156a 100644 --- a/test/benchmark_cublas.lua +++ b/test/test_CuBLAS.lua @@ -1,13 +1,12 @@ -- Copyright 2004-present Facebook. All Rights Reserved. -require('fb.luaunit') - +require 'fb.luaunit' +require 'fbtorch' require 'cunn' - require 'fbcunn' torch.setdefaulttensortype('torch.FloatTensor') -local test = {} +local fb_test = {} -- Let C = m-by-n and A = m-by-k -- Format is m, n, k, seqIter, batch, numHandles, numStreams @@ -43,6 +42,16 @@ local problemSize = { {1, 1024, 512, {1}, {16 * 32}, 1, 1}, } +-- This test exercises the performance of multi-handle + multi-stream on many +-- small gemms. 
+local _testMultiHandlePerf = { + {513, 513, 513, {53}, {}, 0, 0}, + {513, 513, 513, {53}, {}, 1, 1}, + {513, 513, 513, {53}, {}, 1, 4}, + {513, 513, 513, {53}, {}, 4, 1}, + {513, 513, 513, {53}, {}, 4, 4}, +} + local function concat(t1,t2) local res = {} for i=1,#t1 do @@ -54,61 +63,10 @@ local function concat(t1,t2) return res end --- Soumith's inline print -local ndepth = 4 -local function print_inline(...) - local function rawprint(o) - io.write(tostring(o or '') .. ' ') - io.flush() - end - - local function printrecursive(obj, depth) - local depth = depth or 0 - local tab = 0 - local line = function(s) for i=1,tab do io.write(' ') end rawprint(s) end - if next(obj) then - line('{') - for k,v in pairs(obj) do - if type(v) == 'table' then - if depth >= (ndepth-1) or next(v) == nil then - line(tostring(k) .. ' : {}') - else - line(tostring(k) .. ' : ') printrecursive(v, depth + 1) - end - else - line(tostring(k) .. ' : ' .. v) - end - rawprint(',') - end - tab = tab-2 - line('}') - else - line('{}') - end - end - for i = 1,select('#',...) do - local obj = select(i,...) - if type(obj) ~= 'table' then - if type(obj) == 'userdata' or type(obj) == 'cdata' then - rawprint(obj) - else - io.write(obj .. '\t') - if i == select('#',...) then - rawprint() - end - end - elseif getmetatable(obj) and getmetatable(obj).__tostring then - rawprint(obj) - else - printrecursive(obj) - end - end -end - local function testLoop(problemSize) -- Just allocate some dummy placeholder to get to the proper -- function in the lua module - local net = nn.CuBLASWrapper() + local net = nn.CuBLASWrapper(true) local m = problemSize[1] local n = problemSize[2] @@ -125,13 +83,25 @@ local function testLoop(problemSize) local B = torch.Tensor(sB):cuda() local C = torch.Tensor(sC):cuda() - print_inline(problemSize) - print('') - net:matmult(A, B, C, seqIter, batch, handles, streams) + cutorch.reserveBlasHandles(handles) + cutorch.reserveStreams(streams) + cutorch.synchronize() + net:matmult(A, B, C, seqIter, batch) + mytester:assert(true) + cutorch.synchronize() collectgarbage() end -for i = 1, table.getn(problemSize) do - testLoop(problemSize[i]) +function fb_test.testGEMMs() + for i = 1, table.getn(_testMultiHandlePerf) do + testLoop(_testMultiHandlePerf[i]) + end + for i = 1, table.getn(problemSize) do + testLoop(problemSize[i]) + end end + +mytester = torch.Tester() +mytester:add(fb_test) +mytester:run() diff --git a/test/test_CuFFT.lua b/test/test_CuFFT.lua new file mode 100644 index 0000000..ccb4895 --- /dev/null +++ b/test/test_CuFFT.lua @@ -0,0 +1,310 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
+require('fb.luaunit') +local torch = require('fbtorch') + +require 'cunn' +require 'fbcunn' +require 'cutorch' +require 'math' + +torch.setnumthreads(6) +torch.setdefaulttensortype('torch.FloatTensor') + +local mytester = torch.Tester() + +local precision = 1e-4 + +local test = {} +local printResults = false +local printMemory = false +local timeResults = false + +local kNumGPUs = 1 +local maxSize = 128000000 +local maxBatch = 4 +local maxInputPlanes = 13 +local maxOutputPlanes = 13 +local maxKernelSize = 7 +local maxInputSize = 60 + +local function timeFunction(printString, fun, module, arg1, arg2, arg3) + if not timeResults then + return fun(module, arg1, arg2, arg3) + end + + local numTrials = 5 + local time = 0 + for i = 1, numTrials do + local timer = torch.Timer() + cutorch.synchronize() + fun(module, arg1, arg2, arg3) + cutorch.synchronize() + if i > 1 then + time = time + timer:time().real + end + end + time = time / (numTrials - 1) + print(printString .. time * 1000 .. " ms") + + -- Avoid messing up the accGradParameters case, this is benchmarking + -- only so we're ok + module.gradBias:zero() + module.gradWeight:zero() + return fun(module, arg1, arg2, arg3) +end + +local function testLoop(problemSize) + local batchSize = problemSize[1] or 4 * torch.random(maxBatch) + local nInputPlanes = problemSize[2] or torch.random(maxInputSize) + local nOutputPlanes = problemSize[3] or torch.random(maxOutputPlanes) + local kH = problemSize[4] or torch.random(maxKernelSize) + -- If not specified, make it square to avoid blatant rectangular + -- inefficiences with FBFFT atm + local kW = problemSize[5] or torch.random(maxKernelSize) + local iH = problemSize[6] or + math.max(kH, torch.random(maxInputSize) + 4 - kH + 1) + -- If not specified, make it square to avoid blatant rectangular + -- inefficiences with FBFFT atm + local iW = problemSize[7] or + math.max(kW, torch.random(maxInputSize) + 4 - kW + 1) + + local padH = 0 + local padW = 0 + + -- Only small tests, having many small random tests that also + -- exercise synchronizations is far more valuable than bigger ones + if iW * iH * batchSize * nInputPlanes > maxSize then + return + end + if iW * iH * nOutputPlanes * nInputPlanes > maxSize then + return + end + if iW * iH * batchSize * nOutputPlanes > maxSize then + return + end + + local scale = torch.random(100) / 100.0 + print('Running ', batchSize, nInputPlanes, nOutputPlanes, + kH, kW, iH, iW, scale, " pad by ", padH, "x", padW) + + local net = + cudnn.SpatialConvolution(nInputPlanes, nOutputPlanes, + kW, kH, 1, 1, padW, padH):cuda() + local input = torch.CudaTensor(batchSize, nInputPlanes, iH, iW):normal() + local gradOutput = torch.CudaTensor(batchSize, + nOutputPlanes, + iH + 2 * padH - kH + 1, + iW + 2 * padW - kW + 1):normal() + net.gradWeight:zero() + net.gradBias:zero() + + local output = timeFunction("CUDNN updateOutput: ", + net.updateOutput, net, input, scale):float() + local gradInput = + timeFunction("CUDNN updateGradInput: ", + net.updateGradInput, net, input, gradOutput):float() + timeFunction("CUDNN accGradParameters: ", + net.accGradParameters, net, input, gradOutput, scale) + local gradWeight = net.gradWeight:float() + local gradBias = net.gradBias:float() + + local netCuFFT = {} + local outputCuFFT = {} + local gradInputCuFFT = {} + local gradWeightCuFFT = {} + local gradBiasCuFFT = {} + + for k = 1, kNumGPUs do -- Across kNumGPUs GPUs + if k > 1 then + cutorch.setDevice(k) + end + + netCuFFT[k] = + nn.SpatialConvolutionCuFFT(nInputPlanes, nOutputPlanes, + kW, kH, 1, 1, 
padW, padH) + netCuFFT[k].cudnnDebug = true + netCuFFT[k].gradWeight:zero() + netCuFFT[k].gradBias:zero() + netCuFFT[k].weight:copy(net.weight) + netCuFFT[k].bias:copy(net.bias) + netCuFFT[k]:cuda() + + outputCuFFT[k] = timeFunction("CuFFT updateOutput: ", + netCuFFT[k].updateOutput, + netCuFFT[k], + input, + scale):float() + gradInputCuFFT[k] = timeFunction("CuFFT updateGradInput: ", + netCuFFT[k].updateGradInput, + netCuFFT[k], + input, + gradOutput):float() + timeFunction("CuFFT accGradParameters: ", + netCuFFT[k].accGradParameters, + netCuFFT[k], + input, + gradOutput, + scale) + +--[[ + gradInputCuFFT[k] = timeFunction("CuFFT backward: ", + netCuFFT[k].backward, + netCuFFT[k], + input, + gradOutput, + scale):float() +--]] + + gradWeightCuFFT[k] = netCuFFT[k].gradWeight:float() + gradBiasCuFFT[k] = netCuFFT[k].gradBias:float() + + if printResults then + print("Padding WxH = ", padW, "x", padH) + local norm = math.sqrt(output:dot(output) + 1e-8) + print("updateOutputCuFFT", output:dist(outputCuFFT[k]) / norm) + local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) + print("updateGradInputCuFFT", + gradInput:dist(gradInputCuFFT[k]) / norm) + local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) + print("accGradParametersCuFFT (weight)", + gradWeight:dist(gradWeightCuFFT[k]) / norm) + local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) + print("accGradParametersCuFFT (bias)", + gradBias:dist(gradBiasCuFFT[k]) / norm) + end + + local norm = math.sqrt(output:dot(output) + 1e-8) + mytester:assertle(output:dist(outputCuFFT[k]) / norm, + precision, 'error on output') + local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) + mytester:assertle(gradInput:dist(gradInputCuFFT[k]) / norm, + precision, 'error on gradInput') + local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) + mytester:assertle(gradWeight:dist(gradWeightCuFFT[k]) / norm, + precision, 'error on gradWeight') + local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) + mytester:assertle(gradBias:dist(gradBiasCuFFT[k]) / norm, + precision, 'error on gradBias') + end + + return netCuFFT +end + +-- batch, inputPlanes, outputPlanes, kH, kW, iH, iW +local problemSizes = { + {1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 2}, + {1, 1, 1, 1, 1, 1, 3}, + {2, 1, 3, 1, 1, 1, 1}, + {2, 3, 1, 1, 1, 1, 1}, + {2, 3, 4, 5, 5, 5, 5}, + {1, 1, 1, 3, 3, 3, 3}, + {1, 1, 1, 2, 2, 2, 2}, + {1, 1, 1, 1, 2, 1, 2}, + {1, 1, 1, 1, 1, 2, 3}, + {2, 3, 4, 5, 5, 5, 5}, + {128, 64, 64, 1, 1, 1, 1}, + {128, 64, 100, 1, 1, 1, 1}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {1, 1, 1, 7, 5, 13, 14}, + -- Cannot put in unit tests due to 5GB memory limit + -- {128, 128, 128, 3, 3, 128, 128}, -- falls back to cudnn +} + +local _problemSizesICLR2015 = { + {16, 16, 16, 3, 3, 13, 13}, + {16, 16, 16, 3, 3, 16, 16}, + {16, 16, 16, 3, 3, 27, 27}, + {16, 16, 16, 3, 3, 32, 32}, + {16, 16, 16, 3, 3, 57, 57}, + {16, 16, 16, 3, 3, 64, 64}, + {32, 32, 32, 3, 3, 13, 13}, + {32, 32, 32, 3, 3, 16, 16}, + {32, 32, 32, 3, 3, 27, 27}, + {32, 32, 32, 3, 3, 32, 32}, + {32, 32, 32, 3, 3, 57, 57}, + {32, 32, 32, 3, 3, 64, 64}, + {64, 64, 64, 3, 3, 13, 13}, + {64, 64, 64, 3, 3, 16, 16}, + {64, 64, 64, 3, 3, 27, 27}, + {64, 64, 64, 3, 3, 32, 32}, + {64, 64, 64, 3, 3, 57, 57}, + {64, 64, 64, 3, 3, 64, 64}, + {128, 128, 128, 3, 3, 13, 13}, + {128, 128, 128, 3, 3, 16, 16}, + {128, 128, 128, 3, 3, 27, 27}, + {128, 128, 128, 3, 3, 32, 32}, + {128, 128, 128, 3, 3, 57, 57}, + {128, 128, 128, 
3, 3, 64, 64}, +} + +local _problemSizesAlexNet = { + -- 1 GPU + {128, 96, 256, 5, 5, 31, 31}, + {128, 256, 384, 3, 3, 15, 15}, + {128, 384, 384, 3, 3, 15, 15}, + {128, 384, 256, 3, 3, 15, 15}, + -- 2 GPU model parallel + {128, 48, 128, 5, 5, 31, 31}, + {128, 256, 192, 3, 3, 15, 15}, + {128, 192, 192, 3, 3, 15, 15}, + {128, 192, 128, 3, 3, 15, 15}, + -- 4 GPU model parallel + {128, 24, 64, 5, 5, 31, 31}, + {128, 256, 96, 3, 3, 15, 15}, + {128, 96, 96, 3, 3, 15, 15}, + {128, 96, 64, 3, 3, 15, 15}, +} + +local function reportAndFree(net) + if printResults or printMemory then + local free, total = cutorch.getMemoryUsage() + print("Pre Collect Memory: " , free , " free " , total , " total") + end + assert(torch.type(net) == 'table', torch.type(net)) + -- Kill the local references to, as well as the global buffers + for i, v in ipairs(net) do + v:cleanupBuffers() + end + collectgarbage() + collectgarbage() + if printResults or printMemory then + local free, total = cutorch.getMemoryUsage() + print("Post Collect Memory: " , free , " free " , total , " total") + end +end + +local num_random_configurations = 100 +local problemsToRun = problemSizes + +--[[ +-- Convenient override of the default that are used for unit tests +local problemsToRun = _problemSizesAlexNet +local problemsToRun = _problemSizesICLR2015 +printMemory = true +timeResults = true +printResults = true + +num_random_configurations = 0 +printMemory = true +timeResults = true +--]] + +function test.test() + for i = 1, #problemsToRun do + local net = testLoop(problemsToRun[i]) + reportAndFree(net) + end + -- random configuration + for i = 1, num_random_configurations do + local net = testLoop({}) + reportAndFree(net) + end +end + +mytester:add(test) +mytester:run() diff --git a/test/test_DataParallel.lua b/test/test_DataParallel.lua index 72030e4..9662b5f 100644 --- a/test/test_DataParallel.lua +++ b/test/test_DataParallel.lua @@ -1,13 +1,9 @@ local fboptim = require('fboptim') -- Copyright 2004-present Facebook. All Rights Reserved. -local dprintL = (require 'fb.util.dbg').new('parallel') -local dprint = function(...) - return dprintL(1, ...) -end +require 'fb.luaunit' require 'optim' require 'fbcunn' -print 'Requiring cunn. This will take a while. Talk amongst yourselves.' require 'cunn' -- Hyper-params. We're targeting a toy problem that computes @@ -74,98 +70,92 @@ local function tensorsAreProbablySimilar(l, r, epsilon) return math.abs(l:norm() - r:norm()) < epsilon end --- Set up models on each GPU. 
-local dp = nn.DataParallel(1) -local simpleModels = {} -for i = 1,numGPUs do - if i == 1 then - simpleModels[i] = simpleModel() - else - simpleModels[i] = simpleModels[1]:clone() - end - dp:add(simpleModels[i]) -end - --- CPU models to cross-validate -local cpuModels = {} -local function syncCPUModels() - for i = 1,numGPUs do - cpuModels[i] = simpleModels[i]:clone() - cpuModels[i] = cpuModels[i]:double() - end -end -syncCPUModels() - --- Check an input/output pair against the CPU models -local function checkWideResult(inputs, outputs) - local function checkOneResult(input, modIdx, expectedOutput) - input = input:double() -- de-cudify - assert(tensorsAreProbablySimilar(cpuModels[modIdx]:forward(input), - expectedOutput)) - end - for j = 1, numGPUs do - checkOneResult(getNarrowedInput(inputs, j), j, outputs[{ {j} }]) - end -end - -local function checkCPUModelsAreEquivalent() - syncCPUModels() - local input = genInput() - local out = cpuModels[1]:forward(input) - for j = 2, numGPUs do - assert(tensorsAreProbablySimilar(out, cpuModels[j]:forward(input))) - end -end -checkCPUModelsAreEquivalent() - -dp:cuda() - --- Make sure forward produces same results as an individual copy -print('forward test {') -for i=1, 10 do - local inputs, targets = genWideExample() - dprint{ inputs, targets } - local outputs = dp:forward(inputs) - syncCPUModels() - checkWideResult(inputs, outputs) -end -print('} forward test done') - -print('optim test {') -local optimState = { - learningRate = 1e-1, - weightDecay = 1e-4, - momentum = 0.9, - learningRateDecay = 1e-7 -} - -local timer = torch.Timer() -local opt = nn.Optim(dp, optimState) -local criterion = nn.MSECriterion():cuda() - -local num_iteration = 10 -timer:reset() -for i=1, num_iteration do - local inputs, targets = genWideExample() - local outputs = dp:forward(inputs) - syncCPUModels() - checkWideResult(inputs, outputs) - opt:optimize(fboptim.sgd, inputs, targets, criterion) - local out = dp:forward(inputs) - local err = criterion:forward(out, targets) - print(i, err) -end -print(string.format("Total time spent = %f", timer:time().real / num_iteration)) -checkCPUModelsAreEquivalent() -print('} optim test done ') - --- Check only the speed for forward/backward. -timer:reset(); -for i=1, num_iteration do - local inputs, targets = genWideExample() - dp:forward(inputs) - opt:optimize(fboptim.sgd, inputs, targets, criterion) -end -print(string.format( - "Speedtest: Total time spent = %f", - timer:time().real / num_iteration)); +function testDataParallel() + -- Set up models on each GPU. 
+ local dp = nn.DataParallel(1) + local simpleModels = {} + for i = 1,numGPUs do + if i == 1 then + simpleModels[i] = simpleModel() + else + simpleModels[i] = simpleModels[1]:clone() + end + dp:add(simpleModels[i]) + end + + -- CPU models to cross-validate + local cpuModels = {} + local function syncCPUModels() + for i = 1,numGPUs do + cpuModels[i] = simpleModels[i]:clone() + cpuModels[i] = cpuModels[i]:double() + end + end + syncCPUModels() + + -- Check an input/output pair against the CPU models + local function checkWideResult(inputs, outputs) + local function checkOneResult(input, modIdx, expectedOutput) + input = input:double() -- de-cudify + assert(tensorsAreProbablySimilar(cpuModels[modIdx]:forward(input), + expectedOutput)) + end + for j = 1, numGPUs do + checkOneResult(getNarrowedInput(inputs, j), j, outputs[{ {j} }]) + end + end + + local function checkCPUModelsAreEquivalent() + syncCPUModels() + local input = genInput() + local out = cpuModels[1]:forward(input) + for j = 2, numGPUs do + assert(tensorsAreProbablySimilar(out, cpuModels[j]:forward(input))) + end + end + checkCPUModelsAreEquivalent() + + dp:cuda() + + -- Make sure forward produces same results as an individual copy + for i=1, 10 do + local inputs, targets = genWideExample() + local outputs = dp:forward(inputs) + syncCPUModels() + checkWideResult(inputs, outputs) + end + + local optimState = { + learningRate = 1e-1, + weightDecay = 1e-4, + momentum = 0.9, + learningRateDecay = 1e-7 + } + + local timer = torch.Timer() + local opt = nn.Optim(dp, optimState) + local criterion = nn.MSECriterion():cuda() + + local num_iteration = 10 + timer:reset() + for i=1, num_iteration do + local inputs, targets = genWideExample() + local outputs = dp:forward(inputs) + syncCPUModels() + checkWideResult(inputs, outputs) + opt:optimize(fboptim.sgd, inputs, targets, criterion) + local out = dp:forward(inputs) + local err = criterion:forward(out, targets) + end + checkCPUModelsAreEquivalent() + + -- Check only the speed for forward/backward. + timer:reset(); + for i=1, num_iteration do + local inputs, targets = genWideExample() + dp:forward(inputs) + opt:optimize(fboptim.sgd, inputs, targets, criterion) + end +end + +LuaUnit:main() diff --git a/test/test_DataParallelComprehensive.lua b/test/test_DataParallelComprehensive.lua deleted file mode 100755 index 6df38ce..0000000 --- a/test/test_DataParallelComprehensive.lua +++ /dev/null @@ -1,132 +0,0 @@ --- Copyright 2004-present Facebook. All Rights Reserved. - -require 'optim' -require 'cunn' -require 'fbcunn' -- For nn.DataParallel -require 'fbnn' -- For nn.Optim - -local base_gpu = 1 -- Primary GPU to use -local num_gpus = 2 -- We will use {base_gpu, base_gpu+1, etc} with modulus -torch.setdefaulttensortype('torch.DoubleTensor') -torch.setnumthreads(8) -cutorch.setDevice(base_gpu) - --- Create an instance of the test framework -local precision = 5e-4 -local mytester = torch.Tester() -local test = {} - -function copyTable(x) -- Shallow copy - local ret = {} - for k,v in pairs(x) do ret[k] = v end - return ret -end - --- Build a dummy binary classifier. We will split the BATCHES across GPUs. 
-function buildNet(width, height, pool, feat, filt, num_convs) - local net = nn.Sequential() - assert(math.fmod(filt,2) == 1) - for i = 1, num_convs do - local fin = 3 - if (i > 1) then fin = feat end - net:add(nn.SpatialConvolutionMM(fin, feat, filt, filt, 1, 1, (filt-1)/2)) - net:add(nn.Threshold()) - end - net:add(nn.SpatialMaxPooling(pool, pool)) - net:add(nn.Reshape(width * height * feat / (pool * pool))) - net:add(nn.Linear(width * height * feat / (pool * pool), 2)) - -- net:add(nn.SoftMax()) -- This is fake anyway, so just do regression :-) - return net -end - -function test.DataParallel() - collectgarbage() - local width = 16 - local height = 16 - local pool = 4 - local feat = 8 - local filt = 5 - local num_convs = 2 - local num_sgd_steps = 2 - local sync_gpu_cpu_params_every = 1 - local batch_size = 2 * num_gpus - - -- Build a CPU model - local cpu_net = buildNet(width, height, pool, feat, filt, num_convs) - - -- Build a multi-GPU model - local gpu_net = nn.DataParallel(1):cuda() - for i = 1, num_gpus do - local cur_gpu = math.fmod(base_gpu + (i-1)-1, cutorch.getDeviceCount())+1 - cutorch.setDevice(cur_gpu) - gpu_net:add(cpu_net:clone():cuda(), cur_gpu) - end - cutorch.setDevice(base_gpu) - - local cpu_input = torch.rand(batch_size, 3, height, width) - local gpu_input = cpu_input:cuda() - local cpu_target = torch.rand(batch_size, 2) - local gpu_target = cpu_target:cuda() - - -- Set up an MSE optimizer on the GPU and CPU - local optim_state_cpu = { - learningRate = 1, -- Artificially big learning rate - weightDecay = 0, - } - local optim_state_gpu = copyTable(optim_state_cpu) - local opt_cpu = nn.Optim(cpu_net, optim_state_cpu) - local opt_gpu = nn.Optim(gpu_net, optim_state_gpu) - - local criterion_cpu = nn.MSECriterion() - local criterion_gpu = criterion_cpu:clone():cuda() - - for i = 1, num_sgd_steps do - collectgarbage() - - -- Perform an SGD step on the GPU and CPU - opt_cpu:optimize(optim.sgd, cpu_input, cpu_target, criterion_cpu) - opt_gpu:optimize(optim.sgd, gpu_input, gpu_target, criterion_gpu) - assert(cutorch.getDevice() == base_gpu, - 'DataParallel didnt restore GPU state to base_gpu') - - -- Now make sure that everything is the same - local cpu_output = cpu_net.output - local gpu_output = gpu_net.output - local cpu_gradInput = cpu_net.gradInput - local gpu_gradInput = gpu_net.gradInput - local cpu_params, cpu_gradParams = cpu_net:parameters() - local gpu_params, gpu_gradParams = gpu_net:get(1):parameters() - - mytester:assertlt((cpu_output - gpu_output:double()):abs():max(), - precision, 'fprop error ') - mytester:assertlt((criterion_cpu.gradInput - - criterion_gpu.gradInput:double()):abs():max(), precision, - 'CRITERION BPROP error ') - mytester:asserteq(#cpu_params, #gpu_params) - for j = 1, #cpu_params do - mytester:assertlt((cpu_params[j] - gpu_params[j]:double()):abs():max(), - precision, 'parameters error ') - end - mytester:asserteq(#cpu_gradParams, #gpu_gradParams) - for j = 1, #cpu_gradParams do - mytester:assertlt((cpu_gradParams[j] - - gpu_gradParams[j]:double()):abs():max(), precision, - 'BPROP error (gradParams)') - end - mytester:assertlt((cpu_gradInput - gpu_gradInput:double()):abs():max(), - precision, 'BPROP error (gradInput)') - - -- Sync the CPU and GPU weights every few "epochs" to prevent floating point - -- drift between SGD iterations (ie, they will eventually be divergent after - -- enough iterations) - if math.fmod(i, sync_gpu_cpu_params_every) == 0 then - for j = 1, #cpu_gradParams do - cpu_params[j]:copy(gpu_params[j]) - end - end - end -end 
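-- A minimal sketch of the sync-and-check pattern exercised by the test above:
-- after each SGD step the CPU reference model and the first replica of the
-- nn.DataParallel GPU model are compared parameter by parameter, and the GPU
-- weights are then copied back so floating-point drift cannot accumulate
-- across iterations. Names follow the test above; the default tolerance
-- mirrors its `precision` value and is otherwise an assumption.
local function syncAndCheckParams(cpu_net, gpu_net, precision)
   precision = precision or 5e-4
   local cpu_params = cpu_net:parameters()
   local gpu_params = gpu_net:get(1):parameters() -- parameters live in the first replica
   assert(#cpu_params == #gpu_params)
   for j = 1, #cpu_params do
      -- bound the drift accumulated since the last sync
      local drift = (cpu_params[j] - gpu_params[j]:double()):abs():max()
      assert(drift < precision, 'CPU/GPU parameter drift: ' .. drift)
      -- re-sync so both models start the next step from identical weights
      cpu_params[j]:copy(gpu_params[j])
   end
end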
- --- Now run the test above -mytester:add(test) -mytester:run() diff --git a/test/test_FBFFTTiling.lua b/test/test_FBFFTTiling.lua new file mode 100644 index 0000000..bb9dfbc --- /dev/null +++ b/test/test_FBFFTTiling.lua @@ -0,0 +1,208 @@ +require 'cunn' +require 'fbcunn' +require 'math' + +require 'fb.luaunit' +require('fbtorch') +g_mytester = torch.Tester() +local fb_test = {} + +local silence = true +local timeResults = false +local printDebug = false +local printMemory = false +local testCuDNN = true +local runUpdateOutput = true +local runUpdateGradInput = true +local runAccGradParameters = true + +local function reportAndFree(net) + if printMemory then + local free, total = cutorch.getMemoryUsage() + if not silence then + print('Pre Collect Memory: ' , free , ' free ' , total , ' total') + end + end + -- release entries from the global buffer table + if net then + net:cleanupBuffers() + net = nil + end + collectgarbage() + collectgarbage() + if printMemory then + local free, total = cutorch.getMemoryUsage() + if not silence then + print('Post Collect Memory: ' , free , ' free ' , total , ' total') + end + end +end + +local function testTiledFFT(problem, FFTConvolutionClass) + local batches = problem[1] or torch.random(16) + local inputPlanes = problem[2] or torch.random(16) + local outputPlanes = problem[3] or torch.random(16) + -- Values that make sense, start from kernel size + local kH = problem[6] or 4 + math.random(11) + local kW = problem[7] or 4 + math.random(11) + local iH = problem[4] or 1 + 2 * kH + math.random(13) + local iW = problem[5] or 1 + 2 * kW + math.random(13) + local tileH = kH + math.random(5) + tileH = problem[8] or math.min(tileH, iH - 1) + local tileW = kW + math.random(5) + tileW = problem[9] or math.min(tileW, iW - 1) + local padH = problem[10] or math.min(kH - 1, tileH - kH, math.random(7)) + local padW = problem[11] or math.min(kW - 1, tileW - kW, math.random(7)) + local reuseRandom = math.min(torch.random(5) % 5 + 1) + local reuses = { + nn.SpatialConvolutionFFT.memoryReuseNone, + nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseWeight, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseAll, + } + local reuse = problem[12] or reuses[reuseRandom] + + if not silence then + print('Running ', batches, inputPlanes, outputPlanes, + ' kH = ', kH, ' x ', 'kW = ', kW, + ' x ', 'iH = ', iH, ' x ', 'iW = ', iW, + ' x ', 'padH = ', padH, ' x ', padW, ' tile by ', tileH, 'x', tileW, + ' reuse = ', reuse) + end + + -- Testing tiling, 1 batch, input plane, output plane are enough + local ps = {batches, inputPlanes, iH, iW} + local input = torch.Tensor(torch.LongStorage(ps)):cuda():normal() + local ps = {batches, + outputPlanes, + iH - kH + 2 * padH + 1, + iW - kW + 2 * padW + 1} + local gradOutput = torch.Tensor(torch.LongStorage(ps)):cuda():normal() + local scale = torch.uniform() + local net = FFTConvolutionClass(inputPlanes, + outputPlanes, + kW, + kH, + 1, + 1, + padW, + padH, + tileW, + tileH, + reuse):cuda() + net.cudnnDebug = testCuDNN -- this line activates internal testing vs CuDNN + + if silence then + net.reportErrors = false + end + + if runUpdateOutput then + net.printDebugLevel = -1 + if net.printDebugLevel >= 3 then + -- Nasty debugging to be expected + local val = 1 + input:apply(function() val = val + 1 return val end) + local val = 1 + net.weight:apply(function() val = val + 1 return val end) + end + + net:updateOutput(input) + end + + + if runUpdateGradInput then + net.printDebugLevel 
= -1 + if net.printDebugLevel >= 3 then + -- Nasty debugging to be expected + local val = 1 + gradOutput:apply(function() val = val + 1 return val end) + local val = 1 + net.weight:apply(function() val = val + 1 return val end) + end + + net:updateGradInput(input, gradOutput) + end + + + if runAccGradParameters then + net.printDebugLevel = -1 + if net.printDebugLevel >= 3 then + -- Nasty debugging to be expected + scale = 1.0 + local val = 1 + input:apply(function() val = val + 1 return val end) + local val = 1 + gradOutput:apply(function() val = val + 1 return val end) + end + net:accGradParameters(input, gradOutput, scale) + end + + g_mytester:assert(net.cudnnChecks) + + return net +end + + +local problemsToRun = { + -- iH, iW, kH, kW, tileH, tileW, padH, padW, reuse + {2, 2, 2, 12, 12, 3, 3, 8, 8, 0, 0, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {2, 2, 2, 128, 128, 3, 3, 16, 16, 0, 0, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 112, 112, 3, 3, 32, 32, 0, 0, + nn.SpatialConvolutionFFT.memoryReuseAll}, +} + +local numTests = 25 + +-- Convenient override of the default that are used for unit tests +-- numTests = 1 +-- silence = false +-- timeResults = true +-- printDebug = false +-- printMemory = false +-- runUpdateOutput = true +-- runUpdateGradInput = true +-- runAccGradParameters = true + +local testSync = true +local testAsync = true +local testIterated = true +function fb_test.testTiledFFT() + for i = 1, #problemsToRun do + if testSync then + local net = + testTiledFFT(problemsToRun[i], nn.SpatialConvolutionFFTTiledSync) + reportAndFree(net) + end + if testAsync then + local net = + testTiledFFT(problemsToRun[i], nn.SpatialConvolutionFFTTiledAsync) + reportAndFree(net) + end + if testIterated then + local net = testTiledFFT( + problemsToRun[i], nn.SpatialConvolutionFFTTiledIterated) + reportAndFree(net) + end + end + for step = 1, numTests do + if testSync then + local net = testTiledFFT({}, nn.SpatialConvolutionFFTTiledSync) + reportAndFree(net) + end + if testAsync then + local net = testTiledFFT({}, nn.SpatialConvolutionFFTTiledAsync) + reportAndFree(net) + end + if testIterated then + local net = testTiledFFT({}, nn.SpatialConvolutionFFTTiledIterated) + reportAndFree(net) + end + end +end + +g_mytester = torch.Tester() +g_mytester:add(fb_test) +g_mytester:run() diff --git a/test/test_FFT.lua b/test/test_FFT.lua index 04259d4..a2e64f9 100644 --- a/test/test_FFT.lua +++ b/test/test_FFT.lua @@ -1,6 +1,6 @@ -- Copyright 2004-present Facebook. All Rights Reserved. 
--- require('fb.luaunit') -local torch = require('fbtorch') +require('fb.luaunit') +require('fbtorch') require 'cunn' require 'fbcunn' @@ -15,19 +15,72 @@ local mytester = torch.Tester() local precision = 1e-4 local test = {} +local silence = true local printResults = false +local printMemory = false +local timeResults = false +local skipTest = false -local kNumGPUs = 1 -local maxSize = 128000000 +local maxSize = 1e30 local maxBatch = 4 local maxInputPlanes = 13 local maxOutputPlanes = 13 local maxKernelSize = 7 -local maxInputSize = 60 +local maxInputSize = 32 - maxKernelSize -local function testLoop(problemSize) + +local function reportAndFree(net) + if (printResults or printMemory) and not silence then + local free, total = cutorch.getMemoryUsage() + print('Pre Collect Memory: ' , free , ' free ' , total , ' total', + total - free, 'consumption') + end + -- release entries from the global buffer table + if net then + net:cleanupBuffers() + net = nil + end + collectgarbage() + collectgarbage() + if (printResults or printMemory) and not silence then + local free, total = cutorch.getMemoryUsage() + print('Post Collect Memory: ' , free , ' free ' , total , ' total', + total - free, 'consumption') + end +end + +local function timeFunction( + printString, fun, module, arg1, arg2, arg3, arg4, arg5) + if not timeResults then + return fun(module, arg1, arg2, arg3, arg4, arg5) + end + + local numTrials = 5 + local time = 0 + for i = 1, numTrials do + local timer = torch.Timer() + cutorch.synchronize() + fun(module, arg1, arg2, arg3, arg4, arg5) + cutorch.synchronize() + if i > 1 then + time = time + timer:time().real + end + end + time = time / (numTrials - 1) + if not silence then + print(printString .. time * 1000 .. ' ms') + end + + -- Avoid messing up the accGradParameters case, this is benchmarking + -- only so we're ok + module.gradBias:zero() + module.gradWeight:zero() + return fun(module, arg1, arg2, arg3, arg4, arg5) +end + +local function testLoop(problemSize, fftImplementation) local batchSize = problemSize[1] or 4 * torch.random(maxBatch) - local nInputPlanes = problemSize[2] or torch.random(maxInputSize) + local nInputPlanes = problemSize[2] or torch.random(maxInputPlanes) local nOutputPlanes = problemSize[3] or torch.random(maxOutputPlanes) local kH = problemSize[4] or torch.random(maxKernelSize) -- If not specified, make it square to avoid blatant rectangular @@ -39,6 +92,28 @@ local function testLoop(problemSize) -- inefficiences with FBFFT atm local iW = problemSize[7] or math.max(kW, torch.random(maxInputSize) + 4 - kW + 1) + local padH = problemSize[8] or math.min(torch.random(5) % 5, kH - 1) + local padW = problemSize[9] or math.min(torch.random(5) % 5, kW - 1) + local tileH = problemSize[10] + local tileW = problemSize[11] + local reuseRandom = math.min(torch.random(5) % 5 + 1) + local reuses = { + nn.SpatialConvolutionFFT.memoryReuseNone, + nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseWeight, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseAll, + } + local reuse = problemSize[12] or reuses[reuseRandom] + + if fftImplementation == 'cufft' then + iW = iW + 2 * padW + iH = iH + 2 * padH + padW = 0 + padH = 0 + tileW = nil + tileH = nil + end -- Only small tests, having many small random tests that also -- exercise synchronizations is far more valuable than bigger ones @@ -53,106 +128,145 @@ local function testLoop(problemSize) end local scale = torch.random(100) / 100.0 - print('Running ', - batchSize, 
nInputPlanes, nOutputPlanes, kH, kW, iH, iW, scale) + if not silence then + print('Running ', batchSize, nInputPlanes, nOutputPlanes, + kH, kW, iH, iW, scale, ' pad by ', padH, 'x', padW, + ' tile by ', tileH, 'x', tileW, ' reuse ', reuse) + end + + local input = torch.CudaTensor(batchSize, nInputPlanes, iH, iW):normal() + local gradOutput = torch.CudaTensor(batchSize, + nOutputPlanes, + iH + 2 * padH - kH + 1, + iW + 2 * padW - kW + 1):normal() + + local netCuDNN, output, gradInput, gradWeight, gradBias + -- Convenient way to skip tests to debug performance + if not skipTest then + netCuDNN = + cudnn.SpatialConvolution(nInputPlanes, nOutputPlanes, + kW, kH, 1, 1, padW, padH):cuda() + netCuDNN.gradWeight:zero() + netCuDNN.gradBias:zero() + + output = + timeFunction('CUDNN updateOutput: ', netCuDNN.updateOutput, + netCuDNN, input, scale):float() + gradInput = + timeFunction('CUDNN updateGradInput: ', netCuDNN.updateGradInput, + netCuDNN, input, gradOutput):float() + timeFunction('CUDNN accGradParameters: ', netCuDNN.accGradParameters, + netCuDNN, input, gradOutput, scale) + gradWeight = netCuDNN.gradWeight:float() + gradBias = netCuDNN.gradBias:float() + end + + local net + if tileH and tileW then + net = + nn.SpatialConvolutionFFTTiled(nInputPlanes, + nOutputPlanes, + kW, + kH, + 1, + 1, + padW, + padH, + tileW, + tileH, + reuse) + else + if fftImplementation == 'fbfft' then + net = nn.SpatialConvolutionFBFFT( + nInputPlanes, nOutputPlanes, kW, kH, 1, 1, padW, padH, reuse) + elseif fftImplementation == 'cufft' then + net = nn.SpatialConvolutionCuFFT( + nInputPlanes, nOutputPlanes, kW, kH, 1, 1, padW, padH, reuse) + elseif fftImplementation == 'fbfftgemm' then + net = nn.SpatialConvolutionFBFFTGemm( + nInputPlanes, nOutputPlanes, kW, kH, 1, 1, padW, padH, reuse) + else + assert(false, 'Unknown fftImplementation ' .. 
fftImplementation) + end + end - local net = nn.SpatialConvolution(nInputPlanes, nOutputPlanes, kW, kH) - local input = torch.Tensor(batchSize, nInputPlanes, iH, iW):normal() - local gradOutput = - torch.Tensor(batchSize, nOutputPlanes, iH-kH+1, iW-kW+1):normal() + local name = fftImplementation + net:cuda() net.gradWeight:zero() net.gradBias:zero() - local output = net:updateOutput(input, scale):clone() - - local gradInput = net:updateGradInput(input, gradOutput):clone() - net:accGradParameters(input, gradOutput, scale) - local gradWeight = net.gradWeight:clone() - local gradBias = net.gradBias:clone() - - for j = 1,kNumGPUs do -- test cuda resources reuse with kNumGPUs iterations - local netCuFFT = {} - local outputCuFFT = {} - local gradInputCuFFT = {} - local gradWeightCuFFT = {} - local gradBiasCuFFT = {} - - for k = 1, kNumGPUs do -- Across kNumGPUs GPUs - if k > 1 then - cutorch.setDevice(k) - end - - netCuFFT[k] = - nn.SpatialConvolutionCuFFT(nInputPlanes, nOutputPlanes, kW, kH) - netCuFFT[k].debug = true - netCuFFT[k].gradWeight:zero() - netCuFFT[k].gradBias:zero() - netCuFFT[k].weight:copy(net.weight) - netCuFFT[k].bias:copy(net.bias) - netCuFFT[k]:cuda() - - outputCuFFT[k] = - netCuFFT[k]:updateOutput(input:clone():cuda(), scale):float() - gradInputCuFFT[k] = - netCuFFT[k]:updateGradInput(input:clone():cuda(), - gradOutput:clone():cuda()):float() - netCuFFT[k]:accGradParameters(input:clone():cuda(), - gradOutput:clone():cuda(), scale) - gradWeightCuFFT[k] = netCuFFT[k].gradWeight:clone():float() - gradBiasCuFFT[k] = netCuFFT[k].gradBias:clone():float() - - if printResults then + if netCuDNN then + net.weight:copy(netCuDNN.weight) + net.bias:copy(netCuDNN.bias) + end + -- net.cudnnDebug = false + -- net.printDebugLevel = -1 + + local outputFFT = timeFunction(name .. 'updateOutput: ', + net.updateOutput, + net, + input):float() + + local gradInputFFT = timeFunction(name .. 'updateGradInput: ', + net.updateGradInput, + net, + input, + gradOutput):float() + timeFunction(name .. 'accGradParameters: ', + net.accGradParameters, + net, + input, + gradOutput, + scale) + + if not skipTest then + local gradWeightFFT = net.gradWeight:float() + local gradBiasFFT = net.gradBias:float() + + if printResults and not silence then local norm = math.sqrt(output:dot(output) + 1e-8) - print("updateOutputCuFFT", output:dist(outputCuFFT[k]) / norm) + print('updateOutput' .. name, output:dist(outputFFT) / norm) local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) - print("updateGradInputCuFFT", - gradInput:dist(gradInputCuFFT[k]) / norm) + print('updateGradInput' .. name, + gradInput:dist(gradInputFFT) / norm) local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) - print("accGradParametersCuFFT (weight)", - gradWeight:dist(gradWeightCuFFT[k]) / norm) + print('accGradParameters' .. name .. 
' (weight)', + gradWeight:dist(gradWeightFFT) / norm) local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) - print("accGradParametersCuFFT (bias)", - gradBias:dist(gradBiasCuFFT[k]) / norm) - end - - - local norm = math.sqrt(output:dot(output) + 1e-8) - mytester:assertle(output:dist(outputCuFFT[k]) / norm, - precision, 'error on output') - local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) - mytester:assertle(gradInput:dist(gradInputCuFFT[k]) / norm, - precision, 'error on gradInput') - local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) - mytester:assertle(gradWeight:dist(gradWeightCuFFT[k]) / norm, - precision, 'error on gradWeight') - local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) - mytester:assertle(gradBias:dist(gradBiasCuFFT[k]) / norm, - precision, 'error on gradBias') - end - end - - if printResults then - local free_bytes, total_bytes = cutorch.getMemoryUsage() - print ("free after collection, total", free_bytes, " ", total_bytes) - end - - collectgarbage() - - if printResults then - local free_bytes, total_bytes = cutorch.getMemoryUsage() - print ("free after collection, total", free_bytes, " ", total_bytes) - end + print('accGradParameters' .. name .. ' (bias)', + gradBias:dist(gradBiasFFT) / norm) + end + + local norm = math.sqrt(output:dot(output) + 1e-8) + mytester:assertle(output:dist(outputFFT) / norm, + precision, 'error on output') + local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) + mytester:assertle(gradInput:dist(gradInputFFT) / norm, + precision, 'error on gradInput') + local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) + mytester:assertle(gradWeight:dist(gradWeightFFT) / norm, + precision, 'error on gradWeight') + local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) + mytester:assertle(gradBias:dist(gradBiasFFT) / norm, + precision, 'error on gradBias') + end + + return net end -- batch, inputPlanes, outputPlanes, kH, kW, iH, iW local problemSizes = { {1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 2}, + {1, 1, 1, 1, 1, 1, 3}, + {1, 1, 1, 3, 3, 4, 4}, + {1, 1, 1, 3, 3, 8, 8}, + {2, 1, 3, 1, 1, 1, 1}, + {2, 3, 1, 1, 1, 1, 1}, {2, 3, 4, 5, 5, 5, 5}, {1, 1, 1, 3, 3, 3, 3}, {1, 1, 1, 2, 2, 2, 2}, {1, 1, 1, 1, 2, 1, 2}, {1, 1, 1, 1, 1, 2, 3}, - {1, 1, 1, 1, 1, 1, 2}, - {1, 1, 1, 1, 1, 1, 1}, {2, 3, 4, 5, 5, 5, 5}, {128, 64, 64, 1, 1, 1, 1}, {128, 64, 100, 1, 1, 1, 1}, @@ -161,6 +275,21 @@ local problemSizes = { {128, 64, 64, 3, 3, 3, 3}, {128, 64, 64, 3, 3, 3, 3}, {128, 64, 64, 3, 3, 3, 3}, + {1, 1, 1, 7, 5, 13, 14}, + -- Cannot put in unit tests due to 5GB memory limit + -- {128, 128, 128, 3, 3, 128, 128}, -- falls back to cudnn + {1, 1, 1, 5, 5, 27, 27, 0, 0}, + {1, 1, 1, 5, 5, 27, 27, 1, 0}, + {1, 1, 1, 5, 5, 27, 27, 0, 1}, + {1, 1, 1, 5, 5, 27, 27, 1, 2}, + {1, 1, 1, 5, 5, 27, 27, 2, 1}, + {1, 1, 1, 5, 5, 27, 27, 2, 2}, + {1, 1, 1, 3, 4, 19, 23, 0, 0}, + {1, 1, 1, 3, 4, 19, 23, 1, 0}, + {1, 1, 1, 3, 4, 19, 23, 0, 1}, + {1, 1, 1, 3, 4, 19, 23, 1, 2}, + {1, 1, 1, 3, 4, 19, 23, 2, 1}, + {1, 1, 1, 3, 4, 19, 23, 2, 2}, } local _problemSizesICLR2015 = { @@ -191,33 +320,293 @@ local _problemSizesICLR2015 = { } local _problemSizesAlexNet = { - -- 1 GPU - {128, 96, 256, 5, 5, 31, 31}, - {128, 256, 384, 3, 3, 15, 15}, - {128, 384, 384, 3, 3, 15, 15}, - {128, 384, 256, 3, 3, 15, 15}, - -- 2 GPU model parallel - {128, 48, 128, 5, 5, 31, 31}, - {128, 256, 192, 3, 3, 15, 15}, - {128, 192, 192, 3, 3, 15, 15}, - {128, 192, 128, 3, 3, 15, 15}, - -- 4 GPU model parallel - {128, 24, 64, 5, 5, 31, 31}, - {128, 256, 96, 3, 3, 15, 15}, - {128, 96, 96, 3, 3, 
15, 15}, - {128, 96, 64, 3, 3, 15, 15}, + -- 1 GPU + {128, 96, 256, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 96, 256, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 384, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 384, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + -- 2 GPU model parallel + {128, 48, 128, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 48, 128, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 192, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 192, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 192, 128, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 192, 128, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + -- 4 GPU model parallel + {128, 24, 64, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 24, 64, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 96, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 96, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 96, 64, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 96, 64, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, } -local num_random_configurations = 5 +local _problemSizesVGG = { + {64, 64, 64, 3, 3, 32, 32, 0, 0, 8, 8, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 8, 8, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + -- Test fallback to FBFFT convolutions + {64, 64, 64, 3, 3, 32, 32, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 128, 128, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 128, 128, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 128, 128, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + 
{64, 64, 64, 3, 3, 128, 128, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, +} + + +-- These should correspond with Soumith's benchmarks +-- https://raw.githubusercontent.com/soumith/convnet-benchmarks/master/torch7/imagenet_winners/output_raw.log +local _benchmarkAlexNet = { + -- 1 GPU + {128, 64, 192, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 192, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + -- 1 GPU + {128, 64, 192, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 192, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local _benchmarkOverFeat = { + -- 1 GPU + {128, 96, 256, 5, 5, 24, 24, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 512, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 512, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 1024, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + -- 1 GPU + {128, 96, 256, 5, 5, 24, 24, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 512, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 512, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 1024, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local _benchmarkVGG = { + -- 1 GPU + {64, 3, 64, 3, 3, 224, 224, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 128, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + + {64, 256, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + + {64, 256, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 512, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + {64, 512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + -- 1 GPU + {64, 3, 64, 3, 3, 224, 224, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + + {64, 256, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + + {64, 256, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 512, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + + {64, 
512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local _stressTest = { + {1, 128, 128, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 128, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 512, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 256, 512, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 128, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 256, 512, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 16, 16, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 128, 128, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 256, 512, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 128, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 512, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 128, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 256, 512, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 16, 16, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +-- Investigation says the cost of FFT weights is too high since +-- they are only used once in this case. Good thing is that batch +-- size of 1 should be for inference only and precomputing the FFT +-- of the weights is a viable approach + {1, 128, 128, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 256, 512, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +---------------------------------------------------------------- + {64, 3, 128, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 512, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 128, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 256, 512, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local problemsToRun = _stressTest +local num_random_configurations = 25 + +--[[ +-- Convenient override of the default that are used for unit tests +problemsToRun = _problemSizesAlexNet +problemsToRun = _problemSizesICLR2015 +problemsToRun = _problemSizesVGG +printMemory = true +timeResults = true +num_random_configurations = 0 +--]] + +local testCuFFT = false +local testFBFFT = true +local testFBFFTGemm = true function test.test() - for i = 1, #problemSizes do - testLoop(problemSizes[i]) - end - -- random configuration - for i = 1, num_random_configurations do - testLoop({}) - end + for i = 1, #problemsToRun do + if testFBFFT then + local net = testLoop(problemsToRun[i], 'fbfft') + reportAndFree(net) + end + if testFBFFTGemm then + local net = testLoop(problemsToRun[i], 'fbfftgemm') + reportAndFree(net) + end + if testCuFFT then + local net = testLoop(problemsToRun[i], 'cufft') + reportAndFree(net) + end + end + + for size in pairs({'big', 'small'}) do + if size == 'big' then + maxInputSize = 32 - maxKernelSize + else + maxInputSize = 128 - maxKernelSize + end + -- random configuration + for i = 1, num_random_configurations do + if testFBFFT then + local net = testLoop({}, 'fbfft') + reportAndFree(net) + end + if 
testFBFFTGemm then + local net = testLoop({}, 'fbfftgemm') + reportAndFree(net) + end + if testCuFFT then + local net = testLoop({}, 'cufft') + reportAndFree(net) + end + end + end end mytester:add(test) diff --git a/test/test_FFTModule.lua b/test/test_FFTModule.lua index d052473..e5de3cb 100644 --- a/test/test_FFTModule.lua +++ b/test/test_FFTModule.lua @@ -10,11 +10,11 @@ local mytester = torch.Tester() local FFTTester = {} local printResults = false -local precision = 2e-7 +local precision = 2 * 2e-7 -- 2 ULPs relative to the largest input -- We exploit hermitian symmetry to write out only 1/2 the data. -- CuFFT exploits hermitian symmetry along the innermost dimension --- FBFFT is parameteriazble only determined by the output tensor dimesions. +-- FBFFT is parametriazble only determined by the output tensor dimensions. -- Ideally we would use outermost dimension hermitian symmetry for better -- coalescing but if we check correctness vs CuFFT then we match it. local runTests = true @@ -31,55 +31,47 @@ local _iclr2015TestCases = { {1, {4, 32}}, {1, {4, 64}}, {1, {4, 128}}, - {1, {4, 256}}, {1, {32, 8}}, {1, {32, 16}}, {1, {32, 32}}, {1, {32, 64}}, {1, {32, 128}}, - {1, {32, 256}}, {1, {128, 8}}, {1, {128, 16}}, {1, {128, 32}}, {1, {128, 64}}, {1, {128, 128}}, - {1, {128, 256}}, {1, {1024, 8}}, {1, {1024, 16}}, {1, {1024, 32}}, {1, {1024, 64}}, {1, {1024, 128}}, - {1, {1024, 256}}, {1, {4096, 8}}, {1, {4096, 16}}, {1, {4096, 32}}, {1, {4096, 64}}, {1, {4096, 128}}, - {1, {4096, 256}}, {1, {128 * 128, 8}}, {1, {128 * 128, 16}}, {1, {128 * 128, 32}}, {1, {128 * 128, 64}}, {1, {128 * 128, 128}}, - {1, {128 * 128, 256}}, {1, {256 * 256, 8}}, {1, {256 * 256, 16}}, {1, {256 * 256, 32}}, {1, {256 * 256, 64}}, {1, {256 * 256, 128}}, - {1, {256 * 256, 256}}, {2, {4, 8, 8}}, {2, {4, 16, 16}}, {2, {4, 32, 32}}, {2, {4, 64, 64}}, - {2, {4, 128, 128}}, {2, {32, 8, 8}}, {2, {32, 16, 16}}, @@ -105,19 +97,23 @@ local _iclr2015TestCases = { {2, {4096, 64, 64}}, {2, {4096, 128, 128}}, + {2, {1, 8, 8}}, + {2, {1, 16, 16}}, + {2, {1, 32, 32}}, + {2, {1, 64, 64}}, + {2, {128 * 128, 8, 8}}, {2, {128 * 128, 16, 16}}, {2, {128 * 128, 32, 32}}, {2, {128 * 128, 64, 64}}, {2, {128 * 128, 128, 128}}, ---[[ {2, {256 * 256, 8, 8}}, {2, {256 * 256, 16, 16}}, {2, {256 * 256, 32, 32}}, {2, {256 * 256, 64, 64}}, - {2, {256 * 256, 128, 128}}, ---]] +-- Too much memory +-- {2, {256 * 256, 128, 128}}, } local _stressTestCases = { @@ -128,7 +124,6 @@ local _stressTestCases = { {1, {32 * 32, 32}}, {1, {32 * 32, 64}}, {1, {32 * 32, 128}}, - {1, {32 * 32, 256}}, {2, {32 * 32, 2, 2}}, {2, {32 * 32, 4, 4}}, @@ -145,7 +140,6 @@ local _stressTestCases = { {1, {64 * 64, 32}}, {1, {64 * 64, 64}}, {1, {64 * 64, 128}}, - {1, {64 * 64, 256}}, {2, {64 * 64, 2, 2}}, {2, {64 * 64, 4 ,4}}, @@ -162,7 +156,6 @@ local _stressTestCases = { {1, {128 * 128, 32}}, {1, {128 * 128, 64}}, {1, {128 * 128, 128}}, - {1, {128 * 128, 256}}, {2, {128 * 128, 2, 2}}, {2, {128 * 128, 4 ,4}}, @@ -175,6 +168,14 @@ local _stressTestCases = { } local testCases = { + {1, {1, 2}}, + {1, {1, 4}}, + {1, {1, 8}}, + {1, {1, 16}}, + {1, {1, 32}}, + {1, {1, 64}}, + {1, {1, 128}}, + {1, {127, 2}}, {1, {127, 4}}, {1, {127, 8}}, @@ -182,7 +183,6 @@ local testCases = { {1, {127, 32}}, {1, {127, 64}}, {1, {127, 128}}, - {1, {127, 256}}, {1, {437, 2}}, {1, {437, 4}}, @@ -191,7 +191,14 @@ local testCases = { {1, {437, 32}}, {1, {437, 64}}, {1, {437, 128}}, - {1, {437, 256}}, + + {2, {1, 2, 2}}, + {2, {1, 4 ,4}}, + {2, {1, 8, 8}}, + {2, {1, 16, 16}}, + {2, {1, 32, 32}}, + {2, {1, 
64, 64}}, + {2, {1, 128, 128}}, {2, {9, 2, 2}}, {2, {9, 4 ,4}}, @@ -269,30 +276,28 @@ local function benchmarkCuFFT(problemSize, timeCuda) local freqSize = {} for i = 1, #timeSize do if i == #timeSize then - freqSize = concat(freqSize, {math.floor(timeSize[i] / 2) + 1}) + table.insert(freqSize, math.floor(timeSize[i] / 2) + 1) else - freqSize = concat(freqSize, {timeSize[i]}) + table.insert(freqSize, timeSize[i]) end end - freqSize = concat(freqSize, {2}) - + table.insert(freqSize, 2) local timeInvCuda = timeCuda:clone() local frequencyCuda = - torch.CudaTensor(torch.LongStorage(freqSize)):fill(-47.0) + torch.CudaTensor(torch.LongStorage(freqSize)):fill(0 / 0) local batchDims = #timeSize - fftDim - local net = nn.FFTWrapper(1) + local net = nn.FFTWrapper("cufft", 0, 0, "timed") -- no padding net:fft(timeCuda, frequencyCuda, batchDims) net:ffti(timeInvCuda, frequencyCuda, batchDims) local timeInv = timeInvCuda:double() local frequency = frequencyCuda:double() if printResults then - print('forward re:', frequency:select(fftDim + 2, 1)) - print('forward im:', frequency:select(fftDim + 2, 2)) - - print('inverse re:', timeInv) + print('cufft forward re:', frequency:select(fftDim + 2, 1)) + print('cufft forward im:', frequency:select(fftDim + 2, 2)) + print('cufft inverse re:', timeInv) end timeInvCuda = {} @@ -302,6 +307,11 @@ local function benchmarkCuFFT(problemSize, timeCuda) return frequency, timeInv end +local function transposedLayout(fftDim, fftSize) + if fftDim == 2 and (fftSize < 8 or fftSize > 32) then return true end + return false +end + local function benchmarkFBFFT(problemSize, timeCuda, frequency2) local fftDim = problemSize[1] local timeSize = problemSize[2] @@ -310,32 +320,31 @@ local function benchmarkFBFFT(problemSize, timeCuda, frequency2) for i = 1, #timeSize do if i == hermitianDim then - freqSize = concat(freqSize, {math.floor(timeSize[i] / 2) + 1}) + table.insert(freqSize, math.floor(timeSize[i] / 2) + 1) else - freqSize = concat(freqSize, {timeSize[i]}) + table.insert(freqSize, timeSize[i]) end end - freqSize = concat(freqSize, {2}) + table.insert(freqSize, 2) local timeInvCuda = timeCuda:clone() local frequencyCuda = - torch.CudaTensor(torch.LongStorage(freqSize)):fill(-47.0) + torch.CudaTensor(torch.LongStorage(freqSize)):fill(0 / 0) local batchDims = #timeSize - fftDim - local net = nn.FFTWrapper(0) + local net = nn.FFTWrapper("fbfft", 0, 0, "timed") -- no padding net:fft(timeCuda, frequencyCuda, batchDims) net:ffti(timeInvCuda, frequencyCuda, batchDims) local timeInv = timeInvCuda:double() local frequency = frequencyCuda:double() - if fftDim == 2 then + if transposedLayout(fftDim, timeSize[hermitianDim]) then frequency = frequency:transpose(2, 3) end if printResults then - print('forward re:', frequency:select(fftDim + 2, 1)) - print('forward im:', frequency:select(fftDim + 2, 2)) - - print('inverse re:', timeInv) + print('fbfft forward re:', frequency:select(fftDim + 2, 1)) + print('fbfft forward im:', frequency:select(fftDim + 2, 2)) + print('fb inverse re:', timeInv) end timeInvCuda = {} @@ -376,8 +385,25 @@ local function initCuda(ps, localInit) res = res + 1 end return res - end):cuda() + end):cuda() elseif localInit == 5 then + local val = 0 + local res = 0 + timeCudaTensor = torch.Tensor( + torch.LongStorage(ps)):apply(function() + val = val + 1 + if val == ps[#ps] + 1 then + val = 1 + res = res + 1 + end + return res + end) + if #timeCudaTensor:size() == 3 then + timeCudaTensor = timeCudaTensor:transpose(2,3):contiguous():cuda() + else + timeCudaTensor = 
timeCudaTensor:cuda() + end + elseif localInit == 6 then local val = 0 timeCudaTensor = torch.Tensor( torch.LongStorage(ps)):apply(function() @@ -404,33 +430,190 @@ local function run(localInit, problemSizes) print(timeCudaTensor:float()) end - local function assertdiff(reffft, fbfft, fftDim, fftSize) - if ps[1] > 512 then - print('Skip horrendously long test, need to transpose', - ' the data efficiently to test') - return - end - local m = (reffft:double() - fbfft:double()):abs():max() - local n = reffft:double():norm() + 1e-10 - local nfbfft = fbfft:double():norm() + 1e-10 - if m / n > precision then - if printResults then - print('Check max diff, norm, norm fbfft, max normalized = ', - m, n, nfbfft, m / n) - print('FAILS CHECK !!') - print(m, n, m / n) - end - end - assert(m / n < precision) - return + local function checkEqual(a, b, complexCheck) + if printResults then + print('Top left block equality\n', a, b) + end + local a = a:double():abs() + 1e-10 + local b = b:double():abs() + 1e-10 + local delta = (a:double() - b:double()):abs() + local max = a:max() + 1e-20 + local deltaNorm = delta:div(max) + + if printResults and deltaNorm:max() > precision then + print('Check max delta, norm ref fft, max normalized, prec = ', + delta:max(), b:norm(), deltaNorm:max(), precision) + print('RE:\n', + a:select(#a:size(), 1, 1), + b:select(#b:size(), 1, 1)) + print('IM:\n', + a:select(#a:size(), 2, 1), + b:select(#b:size(), 2, 1)) + end + if deltaNorm:max() > precision then + print('Error Delta RE', delta:select(#delta:size(), 1, 1)) + print('Error Delta IM', delta:select(#delta:size(), 2, 1)) + end + assert(deltaNorm:max() <= precision, + deltaNorm:max() .. ' > ' .. precision) + end + + local function checkOrthogonalSymmetry(r, fftSize) + if printResults then + print('Row orthogonal symmetry\n', r) + end + + local max = r:clone():abs():max() + 1e-20 + for k = 1, fftSize / 2 - 1 do + local d1 = r[fftSize / 2 + 1 - k][1] - r[fftSize / 2 + 1 + k][1] + assert( + math.abs(d1) / max < precision, + d1 .. ' ' .. math.abs(d1) / max .. ' ' .. precision + ) + local d2 = r[fftSize / 2 + 1 - k][2] + r[fftSize / 2 + 1 + k][2] + assert( + math.abs(d2) / max < precision, + d2 .. ' ' .. math.abs(d2) / max .. ' ' .. precision + ) + end end + local + function checkCentralSymmetry(cuFFT, fbFFT, fftSize, imaginaryPart) + if printResults then + print('Remaining block central symmetry', cuFFT, fbFFT) + end + assert(cuFFT:size(1) == fbFFT:size(1)) + assert(cuFFT:size(2) == fbFFT:size(2)) + assert(cuFFT:size(3) == fbFFT:size(3)) + assert(cuFFT:size(2) == cuFFT:size(3)) + + local max = cuFFT:clone():abs():max() + 1e-20 + for i = 1, cuFFT:size(1) do + for j = 1, cuFFT:size(2) do + for k = 1, cuFFT:size(2) do + local fbFFTVal = fbFFT + [i][1 + cuFFT:size(2) - j][1 + cuFFT:size(2) - k] + + local d1 = cuFFT[i][j][k] - fbFFTVal + if imaginaryPart then + d1 = cuFFT[i][j][k] + fbFFTVal + end + + if math.abs(d1) / max > precision then + print('Error Delta\n', d1, ' @ ', i, j, k) + print(cuFFT[i][j][k], + ' vs ', + fbFFTVal) + end + assert( + math.abs(d1) / max < precision, + d1 .. ' ' .. math.abs(d1) / max .. ' ' .. 
precision + ) + end + end + end + end + + local function assertdiffHermitian( + reffft, fbfft, fftDim, fftSize, complexCheck) + if ps[1] > 512 then + print('Skip long test based on lua side loops') + return + end + + if fftDim == 1 or (fftDim == 2 and not complexCheck) then + -- Just check tensor relative equality modulo precision + checkEqual(reffft:clone(), fbfft:clone()) + else + assert(complexCheck) + assert(fftDim == 2) + -- Hermitian check is comprised of 4 checks, one is fbfft vs + -- cufft, the others are symmetry checks + checkEqual( + fbfft:narrow( + 2, 1, fftSize / 2 + 1 + ):narrow(3, 1, fftSize / 2 + 1):clone(), + reffft:narrow( + 2, 1, fftSize / 2 + 1 + ):narrow(3, 1, fftSize / 2 + 1):clone() + ) + + -- Orthogonal symmetry for first and middle rows along vertical + -- plane FFTSize / 2 = 1 + for i = 1, reffft:size(1) do + for k = 1, fftSize / 2 + 1 do + checkOrthogonalSymmetry(fbfft[i][1]:clone(), fftSize) + checkOrthogonalSymmetry( + fbfft[i][fftSize / 2 + 1]:clone(), fftSize) + end + end + + if fftSize > 2 then + -- Central symmetry for: + -- [1, FFTSize / 2) x [FFTSize / 2 + 1, FFTSize) and + -- [FFTSize / 2 + 1, FFTSize) x [FFTSize / 2 + 1, FFTSize) + local f = fbfft:narrow( + 2, 2, (fftSize / 2 - 1) + ):narrow( + 3, fftSize / 2 + 1 + 1, (fftSize / 2 - 1) + ):clone() + local c = reffft:narrow( + 3, 2, (fftSize / 2 - 1) + ):narrow( + 2, fftSize / 2 + 1 + 1, (fftSize / 2 - 1) + ):clone() + checkCentralSymmetry( + c:select(4, 1), f:select(4, 1), fftSize) + checkCentralSymmetry( + c:select(4, 2), f:select(4, 2), fftSize, true) + end + end + return + end + + local function assertdiffTransposed(reffft, fbfft, fftDim, fftSize) + if ps[1] > 512 then + print('Skip horrendously long test, need to transpose', + ' the data efficiently to test') + return + end + local m = (reffft:double() - fbfft:double()):abs():max() + local n = reffft:double():norm() + 1e-10 + local nfbfft = fbfft:double():norm() + 1e-10 + if m / n > precision then + print('Check max diff, norm, norm fbfft, max normalized = ', + m, n, nfbfft, m / n) + print('FAILS CHECK !!') + print(m, n, m / n) + if fftDim == 2 and #reffft:size() == 4 then + print('DIFFTENSOR REAL!\n') + print(reffft:add(-fbfft):float():select(fftDim + 2, 1)) + print('DIFFTENSOR IM!\n') + print(reffft:add(-fbfft):float():select(fftDim + 2, 2)) + else + print(reffft, fbfft) + print('DIFFTENSOR REAL!\n') + print(reffft:add(-fbfft):float()) + end + end + assert(m / n < precision) + return + end + + local cufft, cuifft = benchmarkCuFFT(problemSizes[i], timeCudaTensor) local fbfft, fbifft = benchmarkFBFFT(problemSizes[i], timeCudaTensor, matchCuFFTAlloc) - local cufft, cuifft = benchmarkCuFFT(problemSizes[i], timeCudaTensor) + + local fftSize = ps[2] if runTests then - assertdiff(cufft, fbfft, fftDim, ps[2]) - assertdiff(cuifft, fbifft, fftDim, ps[2]) + if not transposedLayout(fftDim, fftSize) then + assertdiffHermitian(cufft, fbfft, fftDim, fftSize, true) + assertdiffHermitian(cuifft, fbifft, fftDim, fftSize, false) + else + assertdiffTransposed(cufft, fbfft, fftDim, fftSize) + assertdiffTransposed(cuifft, fbifft, fftDim, fftSize) + end end timeCudaTensor = {} @@ -439,20 +622,30 @@ local function run(localInit, problemSizes) end end +printResults = false +local localInits = {7} -- only run on random inputs to cut down testing time +local runCases = testCases + +--[[ +-- Convenient override of the default that are used for unit tests +localInits = {1} +runCases = _iclr2015TestCases +--]] + function FFTTester.test() -- Type of initialization: -- 1: 
fill(1.0f) -- 2: 1.0f if 0 mod 2 else 2.0f -- 3: val % 4 + 1 -- 4: val == row --- 5: starts at 1.0f and += 1.0f at each entry +-- 5: val == col +-- 6: starts at 1.0f and += 1.0f at each entry -- else: random - local localInits = {1, 2, 3, 4, 5, 6} - for i = 1, #localInits do - run(localInits[i], testCases) - collectgarbage() - cutorch.synchronize() - end + for i = 1, #localInits do + run(localInits[i], runCases) + collectgarbage() + cutorch.synchronize() + end end mytester:add(FFTTester) diff --git a/test/test_SequentialCriterion.lua b/test/test_SequentialCriterion.lua index fae3a01..e831dc0 100644 --- a/test/test_SequentialCriterion.lua +++ b/test/test_SequentialCriterion.lua @@ -77,7 +77,8 @@ function testSequentialCriterion() local n_classes = torch.random(200) local module = nn.Linear(input_size, n_classes) local crit = nn.ClassNLLCriterion() - testSequentialCriterion_run(input_size, n_classes, module, crit) + testSequentialCriterion_run(input_size, n_classes, module, + crit, 'torch.LongTensor') -- try with HSM local input1_size = torch.random(200) diff --git a/test/test_SparseNLLCriterion.lua b/test/test_SparseNLLCriterion.lua index b9cfffa..576c201 100644 --- a/test/test_SparseNLLCriterion.lua +++ b/test/test_SparseNLLCriterion.lua @@ -79,9 +79,9 @@ end function testSparseNLLCriterion() for k = 1, test_repeats do - local n_classes = torch.random(1000) + local n_classes = torch.random(100) local K = torch.random(n_classes) - local batch_size = torch.random(100) + local batch_size = torch.random(32) local err1, err2 = test_sparseNLL(K, n_classes, batch_size, false) assertTrue(err1 < 1e-3) assertTrue(err2 < 1e-3) diff --git a/test/test_SpatialConvolutionTuned.lua b/test/test_SpatialConvolutionTuned.lua new file mode 100644 index 0000000..959501e --- /dev/null +++ b/test/test_SpatialConvolutionTuned.lua @@ -0,0 +1,209 @@ +require 'cunn' +require 'fbcunn' +require 'math' + +require 'fb.luaunit' +require('fbtorch') +g_mytester = torch.Tester() +local fb_test = {} + +local silence = true +local printMemory = false +local inferenceOnly = false + +local function reportAndFree(net) + if printMemory and not silence then + local free, total = cutorch.getMemoryUsage() + print("Pre Collect Memory: " , free , " free " , total , " total") + end + -- release entries from the global buffer table + if net and net.cleanupBuffers then + net:cleanupBuffers() + net = nil + end + collectgarbage() + collectgarbage() + if printMemory and not silence then + local free, total = cutorch.getMemoryUsage() + print("Post Collect Memory: " , free , " free " , total , " total") + end +end + +local function testSpatialConvolutionTuned(problem, FFTConvolutionClass) + local batches = problem[1] + local inputPlanes = problem[2] + local outputPlanes = problem[3] + local iH = problem[4] + local iW = problem[5] + local kH = problem[6] + local kW = problem[7] + local padH = problem[8] + local padW = problem[9] + + if not silence then + print('Running ', batches, inputPlanes, outputPlanes, + " kH = ", kH, " x ", "kW = ", kW, + " x ", "iH = ", iH, " x ", "iW = ", iW, + " x ", "padH = ", padH, " x ", padW) + end + + -- All the necessary checks are already performed while searching + -- for the best convolution + local netForward = fbnn.SpatialConvolution( + inputPlanes, + outputPlanes, + kW, + kH, + 1, + 1, + padW, + padH, + nil, -- no memory limit + inferenceOnly -- not just inference + ) + if not silence then + netForward.reportLevel = 2 + end + + local ps = {batches, inputPlanes, iH, iW} + local input = 
torch.Tensor(torch.LongStorage(ps)):cuda() + local ps = {batches, + outputPlanes, + iH - kH + 2 * padH + 1, + iW - kW + 2 * padW + 1} + local gradOutput = torch.Tensor(torch.LongStorage(ps)):cuda() + local scale = torch.random(100) / 100.0 + netForward:updateOutput(input) + if not inferenceOnly then + netForward:updateGradInput(input, gradOutput) + netForward:accGradParameters(input, gradOutput, scale) + end + + return netForward +end + + +local problemsToRun = { + -- batch, input, output, iH, iW, kH, kW, padH, padW + {1, 1, 1, 4, 4, 3, 3, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 0, 0}, + {1, 1, 1, 1, 2, 1, 2, 0, 0}, + {1, 1, 1, 1, 3, 1, 3, 0, 0}, + {1, 1, 1, 6, 6, 4, 4, 0, 0}, + {1, 1, 1, 11, 11, 8, 8, 0, 0}, + {2, 1, 3, 1, 1, 1, 1, 0, 0}, + {2, 3, 1, 1, 1, 1, 1, 0, 0}, + {2, 3, 4, 5, 5, 5, 5, 0, 0}, + {1, 1, 1, 3, 3, 3, 3, 0, 0}, + {1, 1, 1, 2, 2, 2, 2, 0, 0}, + {1, 1, 1, 1, 2, 1, 2, 0, 0}, + {1, 1, 1, 2, 3, 2, 3, 0, 0}, + {2, 3, 4, 5, 5, 5, 5, 0, 0}, + {128, 64, 64, 1, 1, 1, 1, 0, 0}, + {128, 64, 100, 1, 1, 1, 1, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {1, 1, 1, 20, 17, 13, 14, 0, 0}, + -- Cannot put in unit tests due to 5GB memory limit + -- {128, 128, 128, 128, 128, 3, 3, 0, 0}, -- falls back to cudnn + {1, 1, 1, 27, 27, 5, 5, 0, 0}, + {1, 1, 1, 27, 27, 5, 5, 1, 0}, + {1, 1, 1, 27, 27, 5, 5, 0, 1}, + {1, 1, 1, 27, 27, 5, 5, 1, 2}, + {1, 1, 1, 27, 27, 5, 5, 2, 1}, + {1, 1, 1, 27, 27, 5, 5, 2, 2}, + {1, 1, 1, 19, 23, 3, 4, 0, 0}, + {1, 1, 1, 19, 23, 3, 4, 1, 0}, + {1, 1, 1, 19, 23, 3, 4, 0, 1}, + {1, 1, 1, 19, 23, 3, 4, 1, 2}, + {1, 1, 1, 19, 23, 3, 4, 2, 1}, + {1, 1, 1, 19, 23, 3, 4, 2, 2}, + + {1, 1, 1, 128, 128, 3, 3, 0, 0}, +} + +local _expensiveProblemsToRun = { + {1, 512, 768, 16, 16, 14, 14, 0, 0}, + {2, 512, 768, 16, 16, 14, 14, 0, 0}, + {8, 512, 768, 16, 16, 14, 14, 0, 0}, + {1, 512, 768, 24, 24, 14, 14, 0, 0}, + {2, 512, 768, 24, 24, 14, 14, 0, 0}, + {8, 512, 768, 24, 24, 14, 14, 0, 0}, + {1, 512, 768, 72, 72, 14, 14, 0, 0}, + {2, 512, 768, 72, 72, 14, 14, 0, 0}, + {8, 512, 768, 72, 72, 14, 14, 0, 0}, +} + +local _benchmark3x3 = { + {64, 3, 64, 224, 224, 3, 3, 1, 1}, + {32, 32, 32, 30, 30, 3, 3, 0, 0}, + {64, 64, 64, 30, 30, 3, 3, 0, 0}, + {128, 128, 128, 30, 30, 3, 3, 0, 0}, + {32, 32, 32, 27, 27, 3, 3, 1, 1}, + {64, 64, 64, 27, 27, 3, 3, 1, 1}, + {128, 128, 128, 27, 27, 3, 3, 1, 1}, + {32, 32, 32, 14, 14, 3, 3, 0, 0}, + {64, 64, 64, 14, 14, 3, 3, 0, 0}, + {128, 128, 128, 14, 14, 3, 3, 0, 0}, + {32, 32, 32, 12, 12, 3, 3, 1, 1}, + {64, 64, 64, 12, 12, 3, 3, 1, 1}, + {128, 128, 128, 12, 12, 3, 3, 1, 1}, + {64, 128, 128, 14, 14, 3, 3, 1, 1}, + {64, 256, 256, 14, 14, 3, 3, 1, 1}, + {64, 512, 512, 14, 14, 3, 3, 1, 1}, +} + +-- These should correspond with Soumith's benchmarks +-- https://raw.githubusercontent.com/soumith/convnet-benchmarks/master/torch7/imagenet_winners/output_raw.log +local _benchmarkAlexNet = { + -- 1 GPU + {128, 64, 192, 27, 27, 5, 5, 2, 2}, + {128, 192, 384, 13, 13, 3, 3, 1, 1}, + {128, 384, 256, 13, 13, 3, 3, 1, 1}, + {128, 256, 256, 13, 13, 3, 3, 1, 1}, +} + +local _benchmarkOverFeat = { + -- 1 GPU + {128, 96, 256, 24, 24, 5, 5, 2, 2}, + {128, 256, 512, 12, 12, 3, 3, 1, 1}, + {128, 512, 1024, 12, 12, 3, 3, 1, 1}, + {128, 1024, 1024, 12, 12, 3, 3, 1, 1}, +} + +local _benchmarkVGG = { + -- 1 GPU + {64, 3, 64, 224, 224, 3, 3, 1, 1}, + {64, 64, 128, 112, 112, 3, 3, 1, 1}, + {64, 128, 256, 56, 56, 3, 3, 1, 1}, + {64, 256, 256, 56, 
56, 3, 3, 1, 1}, + {64, 256, 512, 28, 28, 3, 3, 1, 1}, + {64, 512, 512, 28, 28, 3, 3, 1, 1}, + {64, 512, 512, 14, 14, 3, 3, 1, 1}, + {64, 512, 512, 14, 14, 3, 3, 1, 1}, +} + +--[[ + Uncomment this for expensive problems + problemsToRun = _expensiveProblemsToRun + problemsToRun = _benchmarkAlexNet + problemsToRun = _benchmarkOverFeat + problemsToRun = _benchmarkVGG + problemsToRun = _benchmark3x3 + inferenceOnly = true +--]] + +function fb_test.testSpatialConvolutionTuned() + for i = 1, #problemsToRun do + local net = + testSpatialConvolutionTuned(problemsToRun[i]) + reportAndFree(net) + end +end + +g_mytester = torch.Tester() +g_mytester:add(fb_test) +g_mytester:run() diff --git a/test/test_TemporalKMaxPooling.lua b/test/test_TemporalKMaxPooling.lua index 33c9921..395ba3d 100644 --- a/test/test_TemporalKMaxPooling.lua +++ b/test/test_TemporalKMaxPooling.lua @@ -166,19 +166,5 @@ function TemporalKMaxPoolingTest.sequential() assert (gradInput_matches:sum() == gradInput_matches:numel()) end -function TemporalKMaxPoolingTest.dynamic() - local kmax = nn.TemporalKMaxPooling(2, 0.5) - local seq = nn.Sequential() - seq:add(nn.TemporalKMaxPooling(2, 0.5)) - - for n=12,13 do - local input = torch.randn(n, 1):cuda() - local kmax_output = kmax:updateOutput(input) - local seq_output = seq:updateOutput(input) - assert (kmax_output:size(1) == 6) - assert (torch.all(kmax_output:eq(seq_output))) - end -end - tester:add(TemporalKMaxPoolingTest) tester:run() diff --git a/test/test_WeightedLookupTable.lua b/test/test_WeightedLookupTable.lua index 06ee917..0bd1cb3 100644 --- a/test/test_WeightedLookupTable.lua +++ b/test/test_WeightedLookupTable.lua @@ -26,8 +26,8 @@ function test_WeightedLookupTable_forward() local input_length = 9 local tol = 1e-8 - local wlut = nn.WeightedLookupTable(table_size, embedding_dim) - local ulut = nn.LookupTable(table_size, embedding_dim) + local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() + local ulut = nn.LookupTable(table_size, embedding_dim):cuda() ulut.weight:copy(wlut.weight) assert(all(torch.eq(wlut.weight, ulut.weight))) @@ -35,22 +35,22 @@ function test_WeightedLookupTable_forward() local weights = torch.rand(input_length, 1) local winput = torch.cat(uinput, weights, 2) - local woutput = wlut:forward(winput) - local uoutput = ulut:forward(uinput) - + local woutput = wlut:forward(winput:cuda()) + local uoutput = ulut:forward(uinput:cuda()) + weights = weights:cuda() local expected_woutput = torch.cmul(uoutput, weights:expandAs(uoutput)) - assert(all(almost_equal(woutput, expected_woutput, tol))) + assert(all(almost_equal(woutput:float(), expected_woutput:float(), tol))) end function test_WeightedLookupTable_accGradParameters() local embedding_dim = 4 local table_size = 30 local input_length = 9 - local tol = 1e-8 + local tol = 1e-5 - local wlut = nn.WeightedLookupTable(table_size, embedding_dim) - local ulut = nn.LookupTable(table_size, embedding_dim) + local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() + local ulut = nn.LookupTable(table_size, embedding_dim):cuda() ulut.weight:copy(wlut.weight) assert(all(torch.eq(wlut.weight, ulut.weight))) @@ -58,16 +58,18 @@ function test_WeightedLookupTable_accGradParameters() local weights = torch.range(1, input_length):reshape(input_length, 1) local winput = torch.cat(uinput, weights, 2) + winput = winput:cuda() + uinput = uinput:cuda() local woutput = wlut:forward(winput) local uoutput = ulut:forward(uinput) local wgradOutput = torch.randn(woutput:size()) local ugradOutput = 
torch.cmul(wgradOutput, weights:expandAs(wgradOutput)) - wlut:accGradParameters(winput, wgradOutput, 1) - ulut:accGradParameters(uinput, ugradOutput, 1) + wlut:accGradParameters(winput, wgradOutput:cuda(), 1) + ulut:accGradParameters(uinput, ugradOutput:cuda(), 1) - assert(all(almost_equal(wlut.gradWeight, ulut.gradWeight, tol))) + assert(all(almost_equal(wlut.gradWeight:float(), ulut.gradWeight:float(), tol))) end
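
The WeightedLookupTable changes above move the equivalence test onto the GPU: a weighted lookup over rows of the form [index, weight] should match an ordinary nn.LookupTable forward scaled row-wise by the weights, and its gradWeight should match the plain table driven by a weight-scaled gradOutput. Below is a minimal sketch of the forward-path invariant following that pattern; the index construction and the 1e-5 tolerance are assumptions rather than values taken from the test.

require 'cunn'
require 'fbcunn' -- provides nn.WeightedLookupTable

local table_size, embedding_dim, n = 30, 4, 9
local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda()
local lut = nn.LookupTable(table_size, embedding_dim):cuda()
lut.weight:copy(wlut.weight)

-- column 1 holds 1-based indices, column 2 holds per-row scaling weights
local idx = (torch.rand(n) * table_size):floor():add(1)
local weights = torch.rand(n, 1)
local winput = torch.cat(idx:view(n, 1), weights, 2):cuda()

local wout = wlut:forward(winput)                    -- (n, embedding_dim)
local uout = lut:forward(idx:cuda())                 -- (n, embedding_dim)
local expected = torch.cmul(uout, weights:cuda():expandAs(uout))
assert((wout:float() - expected:float()):abs():max() < 1e-5)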