From 661cd1bb306805ebc4f2aaf30d008c00f95f30e2 Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Thu, 8 Sep 2016 05:00:46 -0400 Subject: [PATCH] Pushing internal FBCUNN updates --- .gitignore | 1 + CMakeLists.txt | 402 +++++- PATENTS | 2 +- cuda | 2 +- examples/imagenet/README.md | 2 - fbcunn/AbstractParallel.lua | 49 +- fbcunn/BatchNormalization.lua | 185 +++ fbcunn/CuBLASWrapper.lua | 39 +- fbcunn/DataParallel.lua | 10 +- fbcunn/FFTCDefs.lua | 34 + fbcunn/FFTWrapper.lua | 234 +++- fbcunn/FeatureLPPooling.lua | 4 +- fbcunn/LookupTableGPU.lua | 2 +- fbcunn/ModelParallel.lua | 47 + fbcunn/OneBitSGD.lua | 3 - fbcunn/SpatialBatchNormalization.lua | 188 +++ fbcunn/SpatialConvolution.lua | 501 ++++++++ fbcunn/SpatialConvolutionCuFFT.lua | 1081 +++++++++++++---- fbcunn/SpatialConvolutionFBFFT.lua | 433 +++++++ fbcunn/SpatialConvolutionFBFFTGemm.lua | 599 +++++++++ fbcunn/SpatialConvolutionFFT.lua | 1012 +++++++++++++++ fbcunn/SpatialConvolutionFFTTiled.lua | 924 ++++++++++++++ fbcunn/SpatialConvolutionFFTTiledAsync.lua | 369 ++++++ fbcunn/SpatialConvolutionFFTTiledIterated.lua | 231 ++++ fbcunn/SpatialConvolutionFFTTiledSync.lua | 247 ++++ fbcunn/TemporalKMaxPooling.lua | 19 + fbcunn/init.lua | 23 +- src/BLASParameters.cpp | 7 +- src/BLASParameters.h | 27 + src/BatchNormalization.cu | 460 +++++++ src/ConvolutionBias.cu | 35 +- src/CrossMapNormalization.cu | 2 +- src/CrossMapNormalizationHost.cpp | 4 +- src/CuBLASWrapper.cpp | 129 +- src/CuBLASWrapper.h | 15 +- src/CuBLASWrapperLua.cpp | 187 ++- src/CudaTensorUtils.cpp | 10 +- src/CudaTensorUtils.h | 2 +- src/DeviceTensorUtils.h | 2 +- src/FeatureLPPooling.cu | 10 +- src/FeatureLPPoolingHost.cpp | 8 +- src/HSMHost.cpp | 2 +- src/HalfPrec.cpp | 8 +- src/HalfPrecKernels.cu | 6 +- src/HalfPrecTest.cpp | 2 +- src/InitCuda.cpp | 17 +- src/LocallyConnected.cuh | 2 +- src/LocallyConnectedHost.cpp | 10 +- src/LookupTableGPUHost.cpp | 4 +- src/MM.cu | 67 +- src/MM.h | 3 +- src/OneBitQuantization.cu | 2 +- src/OneBitQuantizationHost.cpp | 6 +- src/SparseNLLCriterion.cu | 6 +- src/SparseNLLCriterionHost.cpp | 6 +- src/SpatialBatchNormalization.cu | 791 ++++++++++++ src/TemporalConvolutionFBHost.cpp | 10 +- src/TemporalKMaxPooling.cu | 6 +- src/TemporalKMaxPoolingHost.cpp | 13 +- src/TemporalMaxPooling.cu | 13 +- src/WeightedLookupTable.cu | 51 + src/WeightedLookupTableHost.cpp | 58 + src/fft/CuFFTConvolution.cpp | 30 +- src/fft/CuFFTConvolution_AccGradParameters.cu | 14 +- src/fft/CuFFTConvolution_UpdateGradInput.cu | 12 +- src/fft/CuFFTConvolution_UpdateOutput.cu | 14 +- src/fft/CuFFTStrategy.h | 4 +- src/fft/CuFFTWrapper.cu | 51 +- src/fft/CuFFTWrapper.cuh | 23 +- src/fft/FBFFTDevice.cu | 10 +- src/fft/FBFFTHost.cpp | 36 +- src/fft/FBFFTHost.h | 3 +- src/fft/FFTIteratedConvolution.cu | 98 ++ src/fft/FFTWrapperLua.cpp | 212 +++- src/fft/SpatialConvolutionCuFFT.cpp | 64 +- src/fft/SpatialConvolutionCuFFT.h | 2 +- src/fft/SpatialConvolutionCuFFTHost.cpp | 17 +- src/fft/SpatialConvolutionCuFFTTuner.cpp | 6 +- src/fft/SpatialConvolutionCuFFTTuner.h | 2 +- src/fft/Utils-inl.h | 16 +- src/fft/Utils.h | 2 +- src/util/AsyncCopier.cpp | 7 +- src/util/AsyncCopier.h | 2 +- src/util/GlobalAsyncCopier.cpp | 6 +- src/util/Misc.cpp | 72 +- src/util/Misc.h | 14 +- src/util/Transform.cu | 4 +- src/util/Transform.cuh | 2 +- test/BiasTest.cpp | 50 +- test/ConvolutionTest.cpp | 149 ++- test/CuBLASTest.cpp | 38 +- test/CudaTensorTest.cpp | 95 +- test/CudaTensorTestKernels.cu | 7 +- test/FFTTest.cpp | 55 +- test/InputCentricConvolution_UpdateOutput.cu | 8 +- 
test/ReferenceConvolutions.cpp | 18 +- test/ReferenceConvolutions.h | 2 +- test/TestUtils.cpp | 2 +- test/TestUtils.h | 8 +- test/test.lua | 44 + test/test_BatchNormalization.lua | 227 ++++ test/test_ClassHierarchicalNLLCriterion.lua | 6 +- .../{benchmark_cublas.lua => test_CuBLAS.lua} | 92 +- test/test_CuFFT.lua | 310 +++++ test/test_DataParallel.lua | 190 ++- test/test_DataParallelComprehensive.lua | 132 -- test/test_FBFFTTiling.lua | 208 ++++ test/test_FFT.lua | 615 ++++++++-- test/test_FFTModule.lua | 329 +++-- test/test_SequentialCriterion.lua | 3 +- test/test_SparseNLLCriterion.lua | 4 +- test/test_SpatialConvolutionTuned.lua | 209 ++++ test/test_TemporalKMaxPooling.lua | 14 - test/test_WeightedLookupTable.lua | 26 +- 114 files changed, 10624 insertions(+), 1564 deletions(-) delete mode 100644 examples/imagenet/README.md create mode 100644 fbcunn/BatchNormalization.lua create mode 100644 fbcunn/FFTCDefs.lua create mode 100644 fbcunn/SpatialBatchNormalization.lua create mode 100644 fbcunn/SpatialConvolution.lua create mode 100644 fbcunn/SpatialConvolutionFBFFT.lua create mode 100644 fbcunn/SpatialConvolutionFBFFTGemm.lua create mode 100644 fbcunn/SpatialConvolutionFFT.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiled.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiledAsync.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiledIterated.lua create mode 100644 fbcunn/SpatialConvolutionFFTTiledSync.lua create mode 100644 src/BatchNormalization.cu create mode 100644 src/SpatialBatchNormalization.cu create mode 100644 src/WeightedLookupTable.cu create mode 100644 src/WeightedLookupTableHost.cpp create mode 100644 src/fft/FFTIteratedConvolution.cu create mode 100644 test/test_BatchNormalization.lua rename test/{benchmark_cublas.lua => test_CuBLAS.lua} (56%) create mode 100644 test/test_CuFFT.lua delete mode 100755 test/test_DataParallelComprehensive.lua create mode 100644 test/test_FBFFTTiling.lua create mode 100644 test/test_SpatialConvolutionTuned.lua diff --git a/.gitignore b/.gitignore index 943482a..21db6f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ TARGETS facebook +build diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ebe083..c39a06e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,88 +10,89 @@ FIND_PACKAGE(Torch REQUIRED) INCLUDE(MultiLevelIncludes) MLI_SET_DEPTH(2) FIND_PACKAGE(Folly REQUIRED) -FIND_PACKAGE(CUDA 6.5 REQUIRED) +FIND_PACKAGE(CUDA 7.5 REQUIRED) LIST(APPEND CUDA_NVCC_FLAGS "-arch=sm_35") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -#SET(CMAKE_SKIP_BUILD_RPATH FALSE) -#SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) -#SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) # bad to hardcode like this, but i dont see any other way yet. 
baby steps -SET(CMAKE_INSTALL_RPATH "${Torch_INSTALL_LIB}/lua/5.1;/usr/local/lib:${CMAKE_INSTALL_RPATH}") +SET(CMAKE_INSTALL_RPATH "${Torch_INSTALL_LIB}/lua/5.1;/usr/local/lib:${CMAKE_INSTALL_RPATH}") INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC") LINK_DIRECTORIES("${Torch_INSTALL_LIB}") INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}") INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src") +INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/cuda") + +##################### Lua stuff ######################## FILE(GLOB luasrc fbcunn/*.lua) +INSTALL( + FILES + ${luasrc} + DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/fbcunn") -######################################################## +################### C++ Stuff ######################### +################# libfbcunn SET(src-cuda src/init.cu - src/Utils.cpp ) -CUDA_ADD_LIBRARY(fbcunn MODULE ${src-cuda}) -TARGET_LINK_LIBRARIES(fbcunn luaT THC TH fbcunnlayers_cuda) +CUDA_ADD_LIBRARY(libfbcunn MODULE ${src-cuda}) +TARGET_LINK_LIBRARIES(libfbcunn luaT THC TH thpp folly fbcunn_custate fbcuda_util) ### Torch packages supposes libraries prefix is "lib" -SET_TARGET_PROPERTIES(fbcunn PROPERTIES - PREFIX "lib" - IMPORT_PREFIX "lib") +SET_TARGET_PROPERTIES(libfbcunn PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -INSTALL(TARGETS fbcunn +INSTALL(TARGETS libfbcunn RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") -####################### -SET(src-layers-cuda - src/CrossMapNormalization.cu - src/FeatureLPPooling.cu - src/HalfPrecKernels.cu - src/HSM.cu - src/LookupTableGPU.cu - src/OneBitQuantization.cu - src/SparseNLLCriterion.cu - src/TemporalKMaxPooling.cu - # src/TemporalMaxPooling.cu this is included directly in init.cu - - src/ConvolutionBias.cu - - src/fft/CuFFTWrapper.cu - src/fft/FBFFTDevice.cu - src/fft/CuFFTConvolution_UpdateOutput.cu - src/fft/CuFFTConvolution_UpdateGradInput.cu - src/fft/CuFFTConvolution_AccGradParameters.cu - src/fft/CuFFTConvolution.cpp - src/fft/SpatialConvolutionCuFFT.cpp - src/fft/SpatialConvolutionCuFFTTuner.cpp +################# fbcunn_custate +SET(src-cuda + src/Utils.cpp + ) +CUDA_ADD_LIBRARY(fbcunn_custate SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbcunn_custate luaT THC TH thpp folly) - src/LocallyConnectedAccGradParameters.cu - src/LocallyConnectedUpdateGradInput.cu - src/LocallyConnectedUpdateOutput.cu +SET_TARGET_PROPERTIES(fbcunn_custate PROPERTIES + PREFIX "" + IMPORT_PREFIX "") - src/MM.cu +INSTALL(TARGETS fbcunn_custate + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") - src/util/Transform.cu - src/util/Misc.cpp - src/CudaTensorUtils.cpp - src/CuBLASWrapper.cpp - src/BLASParameters.cpp - cuda/KernelTimer.cpp - src/fft/FBFFTHost.cpp +################# layers_cuda +SET(src-cuda + src/CrossMapNormalization.cu + src/LocallyConnectedUpdateOutput.cu + src/LocallyConnectedUpdateGradInput.cu + src/LocallyConnectedAccGradParameters.cu + src/LookupTableGPU.cu + src/HSM.cu + src/TemporalKMaxPooling.cu + src/SparseNLLCriterion.cu + src/WeightedLookupTable.cu ) +CUDA_ADD_LIBRARY(layers_cuda SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(layers_cuda luaT THC TH thpp folly feature_lp_pooling one_bit_quantization) + +SET_TARGET_PROPERTIES(layers_cuda PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -CUDA_ADD_LIBRARY(fbcunnlayers_cuda SHARED ${src-layers-cuda}) -TARGET_LINK_LIBRARIES(fbcunnlayers_cuda luaT THC TH folly ${CUDA_cufft_LIBRARY}) -INSTALL(TARGETS fbcunnlayers_cuda +INSTALL(TARGETS layers_cuda 
RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") -SET(src-layers-cpp +################# cuda_ext +SET(src-cuda src/InitCuda.cpp src/CrossMapNormalizationHost.cpp + src/LocallyConnectedHost.cpp src/LookupTableGPUHost.cpp src/HSMHost.cpp src/TemporalConvolutionFBHost.cpp @@ -99,23 +100,298 @@ SET(src-layers-cpp src/OneBitQuantizationHost.cpp src/SparseNLLCriterionHost.cpp src/FeatureLPPoolingHost.cpp - src/fft/SpatialConvolutionCuFFTHost.cpp - src/fft/FFTWrapperLua.cpp src/CuBLASWrapperLua.cpp + src/fft/FFTWrapperLua.cpp + src/fft/SpatialConvolutionCuFFT.cpp + src/fft/SpatialConvolutionCuFFTHost.cpp + src/fft/SpatialConvolutionCuFFTTuner.cpp + src/WeightedLookupTableHost.cpp + ) +CUDA_ADD_LIBRARY(cuda_ext MODULE ${src-cuda}) +TARGET_LINK_LIBRARIES(cuda_ext luaT THC TH thpp folly libtorch_fb_fbcunn_convolution_bias cublas_wrapper cufft_convolution_cuda cufft_convolution_host fbcunn_custate layers_cuda torch_fb_fbcunn_mm fbcuda_util) - src/LocallyConnectedHost.cpp - src/Utils.cpp -) +SET_TARGET_PROPERTIES(cuda_ext PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -ADD_LIBRARY(fbcunnlayers MODULE ${src-layers-cpp}) -TARGET_LINK_LIBRARIES(fbcunnlayers fbcunnlayers_cuda luaT THC TH folly) +INSTALL(TARGETS cuda_ext + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}/fbcunn" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}/fbcunn") + +################# libhalfprec +SET(src-cuda + src/HalfPrec.cpp + ) +CUDA_ADD_LIBRARY(libhalfprec MODULE ${src-cuda}) +TARGET_LINK_LIBRARIES(libhalfprec luaT THC TH thpp folly fbcunn_custate libcudahalf fbcuda_util) + +SET_TARGET_PROPERTIES(libhalfprec PROPERTIES + PREFIX "" + IMPORT_PREFIX "") -INSTALL(TARGETS fbcunnlayers +INSTALL(TARGETS libhalfprec RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") -INSTALL( - FILES - ${luasrc} - DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/fbcunn") \ No newline at end of file +################# libcudahalf +SET(src-cuda + src/HalfPrecKernels.cu + ) +CUDA_ADD_LIBRARY(libcudahalf SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(libcudahalf luaT THC TH thpp folly util ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(libcudahalf PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS libcudahalf + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cublas_wrapper +SET(src-cuda + src/BLASParameters.cpp + src/CuBLASWrapper.cpp + ) +CUDA_ADD_LIBRARY(cublas_wrapper SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cublas_wrapper luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(cublas_wrapper PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cublas_wrapper + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# torch_fb_fbcunn_mm +SET(src-cuda + src/MM.cu + ) +CUDA_ADD_LIBRARY(torch_fb_fbcunn_mm SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(torch_fb_fbcunn_mm luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(torch_fb_fbcunn_mm PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS torch_fb_fbcunn_mm + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION 
"${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# torch_fb_fbcunn_FFTIteratedConvolution +SET(src-cuda + src/fft/FFTIteratedConvolution.cu + ) +CUDA_ADD_LIBRARY(torch_fb_fbcunn_FFTIteratedConvolution SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(torch_fb_fbcunn_FFTIteratedConvolution luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(torch_fb_fbcunn_FFTIteratedConvolution PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS torch_fb_fbcunn_FFTIteratedConvolution + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# torch_fb_fbcunn_batch_norm +SET(src-cuda + src/BatchNormalization.cu + src/SpatialBatchNormalization.cu + ) +CUDA_ADD_LIBRARY(torch_fb_fbcunn_batch_norm SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(torch_fb_fbcunn_batch_norm luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(torch_fb_fbcunn_batch_norm PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS torch_fb_fbcunn_batch_norm + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# one_bit_quantization +SET(src-cuda + src/OneBitQuantization.cu + ) +CUDA_ADD_LIBRARY(one_bit_quantization SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(one_bit_quantization luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(one_bit_quantization PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS one_bit_quantization + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + + +################# feature_lp_pooling +SET(src-cuda + src/FeatureLPPooling.cu + ) +CUDA_ADD_LIBRARY(feature_lp_pooling SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(feature_lp_pooling luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(feature_lp_pooling PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS feature_lp_pooling + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cuda_tensor_utils +SET(src-cuda + src/CudaTensorUtils.cpp + ) +CUDA_ADD_LIBRARY(cuda_tensor_utils SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cuda_tensor_utils luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(cuda_tensor_utils PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cuda_tensor_utils + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbfft_wrapper +SET(src-cuda + src/fft/FBFFTHost.cpp + ) +CUDA_ADD_LIBRARY(fbfft_wrapper SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbfft_wrapper luaT THC TH thpp folly cuda_tensor_utils fbfft_lib fbcuda_kernel_timer) + +SET_TARGET_PROPERTIES(fbfft_wrapper PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbfft_wrapper + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbfft_lib +SET(src-cuda + src/fft/FBFFTDevice.cu + ) +CUDA_ADD_LIBRARY(fbfft_lib SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbfft_lib luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(fbfft_lib PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbfft_lib + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cufft_convolution_cuda +SET(src-cuda + 
src/fft/CuFFTConvolution_UpdateOutput.cu + src/fft/CuFFTConvolution_AccGradParameters.cu + src/fft/CuFFTConvolution_UpdateGradInput.cu + ) +CUDA_ADD_LIBRARY(cufft_convolution_cuda SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cufft_convolution_cuda luaT THC TH thpp folly libtorch_fb_fbcunn_convolution_bias cublas_wrapper cufft_wrapper cufft_convolution_host ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(cufft_convolution_cuda PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cufft_convolution_cuda + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cufft_convolution_host +SET(src-cuda + src/fft/CuFFTConvolution.cpp + ) +CUDA_ADD_LIBRARY(cufft_convolution_host SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cufft_convolution_host luaT THC TH thpp folly cufft_wrapper cublas_wrapper fbfft_wrapper torch_fb_fbcunn_mm) + +SET_TARGET_PROPERTIES(cufft_convolution_host PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cufft_convolution_host + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# cufft_wrapper +SET(src-cuda + src/fft/CuFFTWrapper.cu + ) +CUDA_ADD_LIBRARY(cufft_wrapper SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(cufft_wrapper luaT THC TH thpp folly ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + +SET_TARGET_PROPERTIES(cufft_wrapper PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS cufft_wrapper + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# util +SET(src-cuda + src/util/Transform.cu + src/util/AsyncCopier.cpp + src/util/GlobalAsyncCopier.cpp + src/util/Misc.cpp + ) +CUDA_ADD_LIBRARY(util SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(util luaT THC TH thpp folly fbcuda_util) + +SET_TARGET_PROPERTIES(util PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS util + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# libtorch_fb_fbcunn_convolution_bias +SET(src-cuda + src/ConvolutionBias.cu + ) +CUDA_ADD_LIBRARY(libtorch_fb_fbcunn_convolution_bias SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(libtorch_fb_fbcunn_convolution_bias luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(libtorch_fb_fbcunn_convolution_bias PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS libtorch_fb_fbcunn_convolution_bias + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbcuda_util +SET(src-cuda + cuda/util/CachedDeviceProperties.cpp + ) +CUDA_ADD_LIBRARY(fbcuda_util SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbcuda_util luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(fbcuda_util PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbcuda_util + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") + +################# fbcuda_kernel_timer +SET(src-cuda + cuda/KernelTimer.cpp + ) +CUDA_ADD_LIBRARY(fbcuda_kernel_timer SHARED ${src-cuda}) +TARGET_LINK_LIBRARIES(fbcuda_kernel_timer luaT THC TH thpp folly) + +SET_TARGET_PROPERTIES(fbcuda_kernel_timer PROPERTIES + PREFIX "" + IMPORT_PREFIX "") + +INSTALL(TARGETS fbcuda_kernel_timer + RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" + LIBRARY DESTINATION 
"${Torch_INSTALL_LUA_CPATH_SUBDIR}") diff --git a/PATENTS b/PATENTS index 4da7ff6..51e5a49 100644 --- a/PATENTS +++ b/PATENTS @@ -30,4 +30,4 @@ necessarily infringed by the Software standing alone. A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, or contributory infringement or inducement to infringe any patent, including a -cross-claim or counterclaim. \ No newline at end of file +cross-claim or counterclaim. diff --git a/cuda b/cuda index 26f01ee..8ef2af9 160000 --- a/cuda +++ b/cuda @@ -1 +1 @@ -Subproject commit 26f01ee8f8a3035cd58adb5ccf02245e58c06c04 +Subproject commit 8ef2af9b579b8610c59ab0ccf6e9075350de4320 diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md deleted file mode 100644 index 2f207eb..0000000 --- a/examples/imagenet/README.md +++ /dev/null @@ -1,2 +0,0 @@ -This example has been moved to [https://github.com/soumith/imagenet-multiGPU.torch](https://github.com/soumith/imagenet-multiGPU.torch). -It will be maintained there further. diff --git a/fbcunn/AbstractParallel.lua b/fbcunn/AbstractParallel.lua index 005e83c..c8e8167 100644 --- a/fbcunn/AbstractParallel.lua +++ b/fbcunn/AbstractParallel.lua @@ -44,6 +44,45 @@ function AbstractParallel:_freeCaches() self.gradInput_gpu = {} end +-- override nn.Module.type to handle gpu_assignments +function AbstractParallel:type(type, tensorCache) + if not type then + return self._type + end + + self:_freeCaches() + + local current_gpuid = cutorch.getDevice() + for i, module in ipairs(self.modules) do + cutorch.setDevice(self.gpu_assignments[i]) + module:type('torch.FloatTensor', {}):type(type, {}) + end + cutorch.setDevice(current_gpuid) + + for key,param in pairs(self) do + if key ~= 'modules' then + self[key] = nn.utils.recursiveType(param, type, tensorCache) + end + end + + self._type = type + return self +end + +-- override nn.Module.apply to handle gpu_assignments +function AbstractParallel:apply(callback) + + callback(self) + + local current_gpuid = cutorch.getDevice() + for i, module in ipairs(self.modules) do + cutorch.setDevice(self.gpu_assignments[i]) + module:apply(callback) + end + cutorch.setDevice(current_gpuid) +end + + --[[ This function yields the GPU id for the module to be added. @@ -238,10 +277,6 @@ function AbstractParallel:updateParameters(learningRate) end end -function AbstractParallel:share(mlp,...) - error("Share is not supported for the AbstractParallel layer.") -end - function AbstractParallel:clone() local clone = parent.clone(self) clone:cuda() @@ -255,3 +290,9 @@ function AbstractParallel:reset(stdv) end) end end + +function AbstractParallel:clearState() + self:_freeCaches() + + parent.clearState(self) +end diff --git a/fbcunn/BatchNormalization.lua b/fbcunn/BatchNormalization.lua new file mode 100644 index 0000000..1135866 --- /dev/null +++ b/fbcunn/BatchNormalization.lua @@ -0,0 +1,185 @@ +--[[ + This file implements Batch Normalization as described in the paper: + "Batch Normalization: Accelerating Deep Network Training + by Reducing Internal Covariate Shift" + by Sergey Ioffe, Christian Szegedy + + This implementation is useful for inputs NOT coming from convolution layers. + For Convolution layers, see SpatialBatchNormalization.lua + + The operation implemented is: + y = ( x - mean(x) ) + -------------------- * gamma + beta + standard-deviation(x) + where gamma and beta are learnable parameters. + + The learning of gamma and beta is optional. 
+ + Usage: + with learnable parameters: nn.BatchNormalization(N [, eps] [,momentum]) + where N = dimensionality of input + without learnable parameters: nn.BatchNormalization(0 [, eps] [,momentum]) + + eps is a small value added to the standard-deviation to avoid divide-by-zero. + Defaults to 1e-5 + + Training: this layer keeps a running estimate of it's computed mean and std. + The running sum is kept with a default momentup of 0.1 (unless over-ridden) + Testing: this running mean/std is used to normalize. +]]-- + + +local ffi = require 'ffi' + +ffi.cdef[[ + void BatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine); + void BatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine); + void BatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale); +]] + +local lib_name = 'torch_fb_fbcunn_batch_norm' +local lib_path = package.searchpath(lib_name, package.cpath) +local BNFFI = ffi.load(lib_path and lib_path or lib_name) + +local BN, parent = torch.class('fbnn.BatchNormalization', 'nn.Module') + +function BN:__init(nOutput, eps, momentum, affine) + parent.__init(self) + assert(nOutput and type(nOutput) == 'number', + 'Missing argument #1: dimensionality of input. ') + assert(nOutput ~= 0, 'To set affine=false call BatchNormalization' + .. '(nOutput, eps, momentum, false) ') + if affine ~= nil then + assert(type(affine) == 'boolean', 'affine has to be true/false') + self.affine = affine + else + self.affine = true + end + self.eps = eps or 1e-5 + self.train = true + self.momentum = momentum or 0.1 + self.running_mean = torch.zeros(nOutput):cuda() + self.running_std = torch.ones(nOutput):cuda() + + if self.affine then + self.weight = torch.CudaTensor(nOutput) + self.bias = torch.CudaTensor(nOutput) + self.gradWeight = torch.CudaTensor(nOutput) + self.gradBias = torch.CudaTensor(nOutput) + self:reset() + else + -- Give me empty tensors for proper FFI behavior + self.weight = torch.CudaTensor() + self.bias = torch.CudaTensor() + self.gradWeight = torch.CudaTensor() + self.gradBias = torch.CudaTensor() + end + + -- Initialize from input on the first updateOutput / updateGradInput + self.output = nil + self.gradInput = nil +end + +function BN:reset() + self.weight:uniform() + self.bias:zero() +end + +function BN:updateOutput(input) + assert(input:dim() == 2, 'only mini-batch supported (2D tensor), got ' + .. input:dim() .. 
'D tensor instead') + + self.std = self.std or self.running_std:clone():zero():cuda() + self.std:resizeAs(self.running_std) + self.centered = self.centered or input:clone():zero():cuda() + self.centered:resizeAs(input) + self.normalized = self.normalized or input:clone():zero():cuda() + self.normalized:resizeAs(input) + self.output = self.output or input:clone():zero():cuda() + self.output:resizeAs(input) + + BNFFI.BatchNormalizationUpdateOutputFFI(cutorch._state, + input:cdata(), + self.output:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.normalized:cdata(), + self.running_mean:cdata(), + self.running_std:cdata(), + self.weight:cdata(), + self.bias:cdata(), + self.eps, + self.momentum, + self.train, + self.affine) + + return self.output +end + +function BN:updateGradInput(input, gradOutput) + assert(input:dim() == 2, 'only mini-batch supported') + assert(gradOutput:dim() == 2, 'only mini-batch supported') + assert(self.train == true, + 'should be in training mode when self.train is true') + + self.gradInput = self.gradInput or input:clone():zero():cuda() + self.gradInput:resizeAs(input) + + BNFFI.BatchNormalizationUpdateGradInputFFI(cutorch._state, + self.gradInput:cdata(), + gradOutput:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.weight:cdata(), + self.affine) + + return self.gradInput +end + +function BN:accGradParameters(input, gradOutput, scale) + if self.affine then + scale = scale or 1.0 + BNFFI.BatchNormalizationAccGradParametersFFI(cutorch._state, + gradOutput:cdata(), + self.normalized:cdata(), + self.gradWeight:cdata(), + self.gradBias:cdata(), + scale) + end + +end + +function BN:clearState() + self.centered = nil + self.std = nil + self.normalized = nil + + parent.clearState(self) +end diff --git a/fbcunn/CuBLASWrapper.lua b/fbcunn/CuBLASWrapper.lua index 62cb6e8..5d0726d 100644 --- a/fbcunn/CuBLASWrapper.lua +++ b/fbcunn/CuBLASWrapper.lua @@ -2,17 +2,48 @@ local CuBLASWrapper = torch.class('nn.CuBLASWrapper') -function CuBLASWrapper:__init() +function CuBLASWrapper:__init(timed) self.iterDims = 0 self.batchDims = 0 self.handles = 0 self.streams = 0 + self.timed = timed or false end -function CuBLASWrapper:matmult(A, B, C, iterDims, batchDims, handles, streams) +function CuBLASWrapper:matmult( + A, B, C, iterDims, batchDims, transA, transB, scale) + self.transA = transA or 'n' + self.transB = transB or 'n' self.iterDims = table.getn(iterDims) or 0 self.batchDims = table.getn(batchDims) or 0 - self.handles = handles or 0 - self.streams = streams or 0 + self.scale = scale or 1.0 A.nn.CuBLASWrapper_matmult(self, A, B, C) end + +function CuBLASWrapper:matmultComplex( + A, B, C, iterDims, batchDims, transA, transB, scale) + self.transA = transA or 'n' + self.transB = transB or 'n' + self.iterDims = table.getn(iterDims) or 0 + self.batchDims = table.getn(batchDims) or 0 + self.scale = scale or 1.0 + A.nn.CuBLASWrapper_matmultComplex(self, A, B, C) +end + +function CuBLASWrapper:transpose( + A, B, separator, transposeMetaData, handle, stream) + self.separator = separator or 0 + self.transposeMetaData = transposeMetaData or false + self.handle = handle or 1 -- always handle 1 by default + self.stream = stream or 0 + A.nn.CuBLASWrapper_transpose(self, A, B) +end + +function CuBLASWrapper:transposeComplex( + A, B, separator, transposeMetaData, handle, stream) + self.separator = separator or 0 + self.transposeMetaData = transposeMetaData or false + self.handle = handle or 1 -- always handle 1 by default + self.stream = stream or 0 + 
A.nn.CuBLASWrapper_transposeComplex(self, A, B) +end diff --git a/fbcunn/DataParallel.lua b/fbcunn/DataParallel.lua index eb4571b..99686e0 100644 --- a/fbcunn/DataParallel.lua +++ b/fbcunn/DataParallel.lua @@ -43,8 +43,8 @@ Pictorially +--------+ ``` ]] -local DataParallel, _ = torch.class('nn.DataParallel', - 'nn.AbstractParallel') +local DataParallel, parent = torch.class('nn.DataParallel', + 'nn.AbstractParallel') -- `_distributeInput` slices the input along self.dimension -- and copies each portion into each child module. @@ -182,3 +182,9 @@ function DataParallel:accUpdateGradParameters(_input, _gradOutput, lr) -- like mixGrads, averages the weights across all GPUs error('accUpdateGradParameters not implemented for: ' .. torch.type(self)) end + +function DataParallel:clearState() + self.homeGradBuffers = {} + + parent.clearState(self) +end diff --git a/fbcunn/FFTCDefs.lua b/fbcunn/FFTCDefs.lua new file mode 100644 index 0000000..d2d72d5 --- /dev/null +++ b/fbcunn/FFTCDefs.lua @@ -0,0 +1,34 @@ +local ffi = require 'ffi' + +ffi.cdef[[ + void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); + void accGradParametersBiasFFI( + THCState*, THCudaTensor*, THCudaTensor*, float scale); + void transposeMMFFI(THCState*, + THCudaTensor* tA, + THCudaTensor* tB, + THCudaTensor* tC, + float invNorm, + bool conjugateTransposeA, + bool conjugateTransposeB, + bool accumulate); + typedef struct { + static const int FFT_UpdateOutput = 0; + static const int FFT_UpdateGradInput = 1; + static const int FFT_AccGradParameters = 2; + int pass; + } FFTConvolutionPassFFI; + typedef struct { + THCudaTensor* tensor; + int padL; + int padU; + } TiledDeviceTensorFFI; + void convolveIteratedFFI(THCState* state, + TiledDeviceTensorFFI* input, + THCudaTensor* weight, + TiledDeviceTensorFFI* output, + int numTiles, + int fftSize, + FFTConvolutionPassFFI pass, + float scale); +]] diff --git a/fbcunn/FFTWrapper.lua b/fbcunn/FFTWrapper.lua index 83c35f8..42f0c6a 100644 --- a/fbcunn/FFTWrapper.lua +++ b/fbcunn/FFTWrapper.lua @@ -1,56 +1,198 @@ -- Copyright 2004-present Facebook. All Rights Reserved. +local ffi = require 'ffi' +local package_path = package.searchpath('cufft_wrapper', package.cpath) +if not package_path then -- not OSS + package_path = 'torch_fb_fbcunn_cufft_wrapper' +end +local CuFFTFFI = ffi.load(package_path) + +ffi.cdef[[ +typedef int cufftHandle; +typedef int cufftResult; +typedef int cufftHandle; + +typedef struct { + cufftHandle handle; +} cufftHandleWrapper; + +cufftResult cufftDestroy(cufftHandle plan); +void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); +cufftHandle makeCuFFTPlanFFI(THCState* state, + THCudaTensor* realTH, + THCudaTensor* cplxTH, + bool direction, + bool normalize, + int fftVersion, + int batchDimensions); +]] + local FFTWrapper = torch.class('nn.FFTWrapper') -function FFTWrapper:__init(cufft) - self.batchDims = 0 - self.cufft = cufft or 1 +FFTWrapper.emptyBuffer = torch.CudaTensor() + +function FFTWrapper:__init(cufft, padLeft, padUp, timed) + self.batchDims = 0 + + if cufft == nil or cufft == "cufft" then + self.cufft = true + else + self.cufft = false + end + + if timed == "timed" then + self.timed = true + else + self.timed = false + end + + self.padLeft = padLeft or 0 + self.padUp = padUp or 0 end -function FFTWrapper:fft(time, frequency, batchDims) - assert(batchDims >= 1) - assert(batchDims <= 2) - self.batchDims = batchDims - -- If calling fft from lua directly, just pass a buffer in any case. 
- -- In practice it is only really needed for 2d-fft of size > 32 - local buffer = {} - if self.cufft == 1 then - if #frequency:size() == 4 then - assert(frequency:size()[2] / 2 + 1 == frequency:size()[3]) - end - -- Need to allocate explicit cufft plans, a buffer is not enough - buffer = torch.CudaTensor(torch.LongStorage({1, 1, 1, 1})) - else - if #frequency:size() == 4 then - assert(frequency:size()[3] / 2 + 1 == frequency:size()[2]) - end - buffer = frequency:clone() - end - time.nn.FFTWrapper_fft(self, time, frequency, buffer) +function FFTWrapper:fft(time, frequency, batchDims, plan) + assert(batchDims >= 1) + assert(batchDims <= 2) + assert(torch.type(time) == 'torch.CudaTensor', 'FBFFT only with CudaTensors') + self.batchDims = batchDims + -- If calling fft from lua directly, just pass a buffer in any case. + -- In practice it is only really needed for 2d-fft of size > 32 + local buffer = FFTWrapper.emptyBuffer + if not self.cufft then + -- Make full buffer to hold the whole complex tensor if needed + -- TODO: Maybe fix this don't want to manage memory here. + -- On the other hand we don't care much since we should use tiling anyway + local fftDim = (#time:size() - batchDims) + local needsBuffer = false + for i = 1, fftDim do + if time:size(self.batchDims + i) > 32 or + frequency:size(self.batchDims + i) > 32 then + needsBuffer = true + end + end + if needsBuffer then + if fbnn.SpatialConvolution.reportWarnings then + print('FFTWrapper.lua: Perf killed by on-the-fly allocation, ', + 'consider using tiling and stay under 32 FFT size') + end + buffer = frequency:clone() + end + end + local handle = -1 + if plan then + handle = plan.handle + end + time.nn.FFTWrapper_fft(self, time, frequency, buffer, handle) end -function FFTWrapper:ffti(time, frequency, batchDims) - assert(batchDims >= 1) - assert(batchDims <= 2) - self.batchDims = batchDims - -- In practice it is only really needed for 2d-fft of size > 32 - local size = frequency:size() - local bufferSize = {} - local buffer = torch.CudaTensor(torch.LongStorage({1, 1, 1, 1})) - -- Make full buffer to hold the whole complex tensor if needed - if self.cufft == 1 then - if #time:size() - batchDims == 2 then - assert(size[2] / 2 + 1 == size[3]) - end - elseif batchDims == 1 and #size == 4 then - if batchDims == 1 and #size == 4 then - assert(size[3] / 2 + 1 == size[2]) - -- - bufferSize = torch.LongStorage({size[1], size[3], size[3], size[4]}) +function FFTWrapper:ffti(time, frequency, batchDims, plan) + assert(batchDims >= 1) + assert(batchDims <= 2) + assert(torch.type(time) == 'torch.CudaTensor', 'FBFFT only with CudaTensors') + self.batchDims = batchDims + -- In practice it is only really needed for 2d-fft of size > 32 + local size = frequency:size() + local buffer = FFTWrapper.emptyBuffer + + if not self.cufft then + -- Make full buffer to hold the whole complex tensor if needed + -- TODO: Maybe fix this don't want to manage memory here. 
+ -- On the other hand we don't care much since we should use tiling anyway + local fftDim = (#time:size() - batchDims) + local needsBuffer = false + for i = 1, fftDim do + if time:size(self.batchDims + i) > 32 or + frequency:size(self.batchDims + i) > 32 then + needsBuffer = true + end + end + if needsBuffer and fftDim == 2 then + if fbnn.SpatialConvolution.reportWarnings then + print('FFTWrapper.lua: Perf killed by on-the-fly allocation, ', + 'consider using tiling and stay under 32 FFT size') + end + if batchDims == 1 then + local bufferSize = torch.LongStorage({ + size[1], size[3], size[3], size[4]}) buffer = torch.CudaTensor(bufferSize) - else - buffer = frequency:clone() - end - end - time.nn.FFTWrapper_ffti(self, time, frequency, buffer) + elseif batchDims == 2 then + local bufferSize = torch.LongStorage({ + size[1], size[2], size[4], size[4], size[5]}) + buffer = torch.CudaTensor(bufferSize) + end + end + end + + local handle = -1 + if plan then + handle = plan.handle + end + + time.nn.FFTWrapper_ffti(self, time, frequency, buffer, handle) +end + + +-- CuFFTPlan allocation occurs in here because it depends on the tensor shape +-- after transposition +function FFTWrapper:fftTranspose(tensor, bufferComplex, bufferComplexTranspose, + batchDims, handle, stream, plan) + local transposeSeparator = batchDims + cutorch.setBlasHandle(handle) + cutorch.setStream(stream) + if self.cufft and not plan then + local version = 0 + plan = ffi.new('cufftHandleWrapper') + plan.handle = CuFFTFFI.makeCuFFTPlanFFI(cutorch._state, + tensor:cdata(), + bufferComplex:cdata(), + true, + false, + version, + batchDims) + ffi.gc(plan, function(p) + CuFFTFFI.cufftDestroy(p.handle) + end) + end + self:fft(tensor, bufferComplex, batchDims, plan) + local cublasWrapper = nn.CuBLASWrapper() + cublasWrapper:transposeComplex(bufferComplex, + bufferComplexTranspose, + transposeSeparator, + false, + handle, + stream) + return plan +end + +-- CuFFTPlan allocation occurs in here because it depends on the tensor shape +-- after transposition +function FFTWrapper:transposeIFFT(tensor, bufferComplex, bufferComplexTranspose, + batchDims, handle, stream, plan) + local transposeSeparator = batchDims + cutorch.setBlasHandle(handle) + cutorch.setStream(stream) + local cublasWrapper = nn.CuBLASWrapper() + cublasWrapper:transposeComplex(bufferComplexTranspose, + bufferComplex, + transposeSeparator, + false, + handle, + stream) + + if self.cufft and not plan then + local version = 0 + plan = ffi.new('cufftHandleWrapper') + plan.handle = CuFFTFFI.makeCuFFTPlanFFI(cutorch._state, + tensor:cdata(), + bufferComplex:cdata(), + false, + false, + version, + batchDims) + ffi.gc(plan, function(p) + CuFFTFFI.cufftDestroy(p.handle) + end) + end + self:ffti(tensor, bufferComplex, batchDims, plan) + return plan end diff --git a/fbcunn/FeatureLPPooling.lua b/fbcunn/FeatureLPPooling.lua index b460407..e15cbbc 100644 --- a/fbcunn/FeatureLPPooling.lua +++ b/fbcunn/FeatureLPPooling.lua @@ -50,7 +50,7 @@ function FeatureLPPooling:__init(width, stride, power, batch_mode) end function FeatureLPPooling:updateOutput(input) - if self:type() == 'torch.CudaTensor' then + if torch.type(input) == 'torch.CudaTensor' then input.nn.FeatureLPPooling_updateOutput(self, input) else error('CUDA only supported at the moment') @@ -59,7 +59,7 @@ function FeatureLPPooling:updateOutput(input) end function FeatureLPPooling:updateGradInput(input, gradOutput) - if self:type() == 'torch.CudaTensor' then + if torch.type(input) == 'torch.CudaTensor' then 
input.nn.FeatureLPPooling_updateGradInput(self, input, gradOutput) else error('CUDA only supported at the moment') diff --git a/fbcunn/LookupTableGPU.lua b/fbcunn/LookupTableGPU.lua index 8d33f82..1aea141 100644 --- a/fbcunn/LookupTableGPU.lua +++ b/fbcunn/LookupTableGPU.lua @@ -29,7 +29,7 @@ end function LookupTableGPU:reset(stdv) stdv = stdv or 1 - self.weight:normal(stdv) + self.weight:normal(0, stdv) end function LookupTableGPU:parameters() diff --git a/fbcunn/ModelParallel.lua b/fbcunn/ModelParallel.lua index d1ded2f..4919099 100644 --- a/fbcunn/ModelParallel.lua +++ b/fbcunn/ModelParallel.lua @@ -182,3 +182,50 @@ function ModelParallel:updateGradInput(_input, gradOutput) return self.gradInput end + + +function ModelParallel:backward(_input, gradOutput, scale) + self:_distributeGradOutput(_input, gradOutput) + + scale = scale or 1 + -- update gradInput for each module + for i,module in ipairs(self.modules) do + local gpuid = self.gpu_assignments[i] + withDevice(gpuid, function() + module:backward(self.input_gpu[gpuid], + self.gradOutput_gpu[i], + scale) + end) + end + + if not self.gradInput then return end -- if gradInput is nil, do nothing + self.gradInput:resizeAs(self.input_gpu[self.container_gpuid]) + + -- add gradInputs + for i, module in ripairs(self.modules) do + if module.gradInput then + if i == 1 then + self.gradInput:copy(module.gradInput) + return self.gradInput + end + + local parent_module_idx = math.floor(i / 2) + local parent_gpuid = self.gpu_assignments[parent_module_idx] + withDevice(parent_gpuid, function() + if not self.gradInput_gpu[i] then + self.gradInput_gpu[i] = torch.CudaTensor() + end + + self.gradInput_gpu[i]:resizeAs(module.gradInput) + self:gpuSend(self.gradInput_gpu[i], module.gradInput) + self.modules[parent_module_idx].gradInput:add( + self.gradInput_gpu[i]) + end) + end + end + + -- Combine gradients for data parallel models + self:_mixGrads() + + return self.gradInput +end diff --git a/fbcunn/OneBitSGD.lua b/fbcunn/OneBitSGD.lua index cc1b903..dc47cda 100644 --- a/fbcunn/OneBitSGD.lua +++ b/fbcunn/OneBitSGD.lua @@ -4,9 +4,6 @@ OneBitSGD contains various utility functions for use in OneBitDataParallel, expo local M = {} -local _fbd = require('fb.debugger') -local _trace = require('fb.util.trace') - local pl = require('pl.import_into')() local util = require('fb.util') local withDevice = cutorch.withDevice diff --git a/fbcunn/SpatialBatchNormalization.lua b/fbcunn/SpatialBatchNormalization.lua new file mode 100644 index 0000000..b894d7a --- /dev/null +++ b/fbcunn/SpatialBatchNormalization.lua @@ -0,0 +1,188 @@ +--[[ + This file implements Batch Normalization as described in the paper: + "Batch Normalization: Accelerating Deep Network Training + by Reducing Internal Covariate Shift" + by Sergey Ioffe, Christian Szegedy + + This implementation is useful for inputs coming from convolution layers. + For Non-convolutional layers, see BatchNormalization.lua + + The operation implemented is: + y = ( x - mean(x) ) + -------------------- * gamma + beta + standard-deviation(x) + where gamma and beta are learnable parameters. + + The learning of gamma and beta is optional. + + Usage: + with learnable parameters: nn.BatchNormalization(N [,eps] [,momentum]) + where N = dimensionality of input + without learnable parameters: nn.BatchNormalization(0 [,eps] [,momentum]) + + eps is a small value added to the standard-deviation to avoid divide-by-zero. + Defaults to 1e-5 + + At training, it keeps a running estimate of its computed mean and std. 
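  A sketch of that running update, with momentum m and hypothetical per-feature
  statistics batch_mean / batch_std of the current batch (the exact update
  performed by the CUDA kernel may differ slightly):

    running_mean:mul(1 - m):add(m, batch_mean)
    running_std:mul(1 - m):add(m, batch_std)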
+ The running sum is kept with a default momentup of 0.1 (unless over-ridden) + At testing, this running mean/std is used to normalize. +--]] + +local ffi = require 'ffi' + +ffi.cdef[[ + void SpatialBatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine); + void SpatialBatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine); + void SpatialBatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale); +]] + +local lib_name = 'torch_fb_fbcunn_batch_norm' +local lib_path = package.searchpath(lib_name, package.cpath) +local BNFFI = ffi.load(lib_path and lib_path or lib_name) + +local BN, parent = torch.class('fbnn.SpatialBatchNormalization', 'nn.Module') + +function BN:__init(nFeature, eps, momentum, affine) + parent.__init(self) + assert(nFeature and type(nFeature) == 'number', + 'Missing argument #1: Number of feature planes. ') + assert(nFeature ~= 0, 'To set affine=false call SpatialBatchNormalization' + .. '(nFeature, eps, momentum, false) ') + if affine ~=nil then + assert(type(affine) == 'boolean', 'affine has to be true/false') + self.affine = affine + else + self.affine = true + end + self.eps = eps or 1e-5 + self.train = true + self.momentum = momentum or 0.1 + + self.running_mean = torch.zeros(nFeature):cuda() + self.running_std = torch.ones(nFeature):cuda() + if self.affine then + self.weight = torch.CudaTensor(nFeature) + self.bias = torch.CudaTensor(nFeature) + self.gradWeight = torch.CudaTensor(nFeature) + self.gradBias = torch.CudaTensor(nFeature) + self:reset() + else + -- Give me empty tensors for proper FFI behavior + self.weight = torch.CudaTensor() + self.bias = torch.CudaTensor() + self.gradWeight = torch.CudaTensor() + self.gradBias = torch.CudaTensor() + end + + -- Initialize from input on the first updateOutput / updateGradInput + self.output = nil + self.gradInput = nil +end + +function BN:reset() + self.weight:uniform() + self.bias:zero() +end + +function BN:updateOutput(input) + assert(input:dim() == 4, 'only mini-batch supported (4D tensor), got ' + .. input:dim() .. 
'D tensor instead') + + self.std = self.std or self.running_std:clone():zero():cuda() + self.std:resizeAs(self.running_std) + self.centered = self.centered or input:clone():zero():cuda() + self.centered:resizeAs(input) + self.normalized = self.normalized or input:clone():zero():cuda() + self.normalized:resizeAs(input) + self.output = self.output or input:clone():zero():cuda() + self.output:resizeAs(input) + + BNFFI.SpatialBatchNormalizationUpdateOutputFFI( + cutorch._state, + input:cdata(), + self.output:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.normalized:cdata(), + self.running_mean:cdata(), + self.running_std:cdata(), + self.weight:cdata(), + self.bias:cdata(), + self.eps, + self.momentum, + self.train, + self.affine) + + return self.output +end + +function BN:updateGradInput(input, gradOutput) + assert(input:dim() == 4, 'only mini-batch supported') + assert(gradOutput:dim() == 4, 'only mini-batch supported') + assert(self.train == true, + 'should be in training mode when self.train is true') + + self.gradInput = self.gradInput or input:clone():zero():cuda() + self.gradInput:resizeAs(input) + + BNFFI.SpatialBatchNormalizationUpdateGradInputFFI( + cutorch._state, + self.gradInput:cdata(), + gradOutput:cdata(), + self.centered:cdata(), + self.std:cdata(), + self.weight:cdata(), + self.affine) + + return self.gradInput +end + +function BN:accGradParameters(input, gradOutput, scale) + if self.affine then + scale = scale or 1.0 + BNFFI.SpatialBatchNormalizationAccGradParametersFFI( + cutorch._state, + gradOutput:cdata(), + self.normalized:cdata(), + self.gradWeight:cdata(), + self.gradBias:cdata(), + scale) + end + +end + + +function BN:clearState() + self.centered = nil + self.std = nil + self.normalized = nil + + parent.clearState(self) +end diff --git a/fbcunn/SpatialConvolution.lua b/fbcunn/SpatialConvolution.lua new file mode 100644 index 0000000..ed24526 --- /dev/null +++ b/fbcunn/SpatialConvolution.lua @@ -0,0 +1,501 @@ +-- Copyright 2014 - present Facebook. All Rights Reserved. + +-- This is the module that you should most likely call if you want the fastest +-- convolution available. It is a wrapper to cudnn as well as different +-- FFT-based implementations. +-- +-- Instantiate with fbnn.SpatialConvolution(nInputPlane, +-- nOutputPlane, +-- kW, +-- kH, +-- dW, [1] +-- dH, [1] +-- padLeft, [0] +-- padUp, [0] +-- maximalMemoryOverhead, [nil] +-- inferenceOnly) [false] +-- where: +-- - the first parameters have the traditional meaning, +-- - maximalMemoryOverhead: limit on the amount of memory +-- overhead you want to allow, nil meaning no limit +-- - inferenceOnly: whether the module is used for inference or training. +-- Spercifying inference only saves time in the autotuning process +-- +-- On the first call to updateOutput, a simple autotuning search kicks off +-- which compares the performance of different flavors of: +-- FBFFT + FBMM, FBFFT + cublasGemm, FBFFT Tiled sync, FBFFT Tiled async +-- and cudnn +-- In the future we can also wrap more specialized kernels (e.g. 
+-- no memory overhead FFTs, Nervana's convolutions etc) + +require 'cudnn' + +local argcheck = require 'argcheck' +local SpatialConvolution, parent = + torch.class('fbnn.SpatialConvolution', 'nn.Module') + +fbnn.SpatialConvolution.reportErrors = false +fbnn.SpatialConvolution.reportWarnings = false + +function SpatialConvolution:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + maximalMemoryOverhead, + inferenceOnly) + parent.__init(self) + self.inputPlanes = nInputPlane + self.outputPlanes = nOutputPlane + self.kW = kW + self.kH = kH + self.dW = dW or 1 + self.dH = dH or 1 + self.padLeft = padLeft or 0 + self.padUp = padUp or 0 + self.inferenceOnly = inferenceOnly + self.maximalMemoryOverhead = maximalMemoryOverhead + self.reportLevel = 0 + + -- Allocate an underlying CuDNN + self.cudnnModuleInst = + cudnn.SpatialConvolution(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp):cuda() + + -- Take its tensors as my own + self.weight = self.cudnnModuleInst.weight + self.output = self.cudnnModuleInst.output + self.bias = self.cudnnModuleInst.bias + self.gradWeight = self.cudnnModuleInst.gradWeight + self.gradBias = self.cudnnModuleInst.gradBias +end + +function SpatialConvolution:setInferenceOnly(val) + assert(type(val) == 'boolean') + self.inferenceOnly = val +end + +function SpatialConvolution:setReuseWeights(val) + assert(self.bestModuleInst, 'Must tune before reusing weights') + if self.bestModuleInst.setReuseWeights then + self.bestModuleInst:setReuseWeights(val) + end +end + +-------------------------------------------------------------------------------- +-- Detail +-------------------------------------------------------------------------------- +local function _timeFunction( + fun, mod, arg1, arg2, arg3, arg4, arg5) + local numTrials = 3 + local time = 0 + cutorch.synchronize() + for i = 1, numTrials do + local timer = torch.Timer() + fun(mod, arg1, arg2, arg3, arg4, arg5) + cutorch.synchronize() + if i > 1 then + time = time + timer:time().real + end + end + time = time / (numTrials - 1) + return time * 1000 +end + +local runModule = argcheck { + { name = "mod", type = "table" }, + -- { name = "mod", type = "nn.Module" }, + -- { name = "mod", type = "nn.SpatialConvolutionFBFFT" }, + { name = "input", type = "torch.CudaTensor"}, + { name = "gradOutput", type = "torch.CudaTensor"}, + { name = "parameters", type = "table"}, + { name = "extraParameters", type = "table"}, + { name = "inferenceOnly", type = "boolean"}, + { name = "scale", type = "number"}, + call = function( + mod, input, gradOutput, parameters, extraParameters, inferenceOnly, scale) + local params = {} + for _, v in pairs(parameters) do + table.insert(params, v) + end + for _, v in pairs(extraParameters) do + table.insert(params, v) + end + + local inst = mod(unpack(params)):cuda() + + -- Setup autotuning behavior, unused in CuDNN + inst.printDebugLevel = -1 + if inst.printDebugLevel >= 3 then + print(inst, unpack(params)) + inst.cudnnDebug = true + if inst.printDebugLevel >= 4 then + input:fill(1.0) + inst.weight:fill(1.0) + gradOutput:fill(1.0) + else + input:normal() + inst.weight:normal() + gradOutput:normal() + end + end + inst.autotuningPass = true + inst.reportErrors = fbnn.SpatialConvolution.reportErrors or false + + local timing1, timing2, timing3 = 0, 0, 0 + timing1 = timing1 + + _timeFunction(inst.updateOutput, inst, input) + if not inst.success then + inst:cleanupBuffers() + return 1e32, 0, 0, nil + end + + if inferenceOnly then + return timing1, 0, 0, inst 
+ end + + timing2 = timing2 + + _timeFunction(inst.updateGradInput, inst, input, gradOutput) + if not inst.success then + inst:cleanupBuffers() + return 1e32, 0, 0, nil + end + + timing3 = timing3 + + _timeFunction(inst.accGradParameters, inst, input, gradOutput, scale) + if not inst.success then + inst:cleanupBuffers() + return 1e32, 0, 0, nil + end + + -- Unset autotuning behavior, unused in CuDNN + inst.autotuningPass = false + inst.reportErrors = true + + return timing1, timing2, timing3, inst + end +} + +function SpatialConvolution:_tune(batchSize, + iW, + iH, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + inferenceOnly) + -- Just compare cudnn to various FFT variants and pick the best + local timings = {} + local ps = {batchSize, nInputPlane, iH, iW} + local input = torch.Tensor(torch.LongStorage(ps)):cuda() + local ps = {batchSize, + nOutputPlane, + math.floor((iH - kH + 2 * padUp) / dH) + 1, + math.floor((iW - kW + 2 * padLeft) / dW) + 1} + local gradOutput = torch.Tensor(torch.LongStorage(ps)):cuda() + local scale = torch.random(100) / 100.0 + + local preFree = cutorch.getMemoryUsage() + local timing1, timing2, timing3 = 0, 0, 0 + timing1 = timing1 + _timeFunction(self.cudnnModuleInst.updateOutput, + self.cudnnModuleInst, + input) + if not inferenceOnly then + timing2 = timing2 + _timeFunction(self.cudnnModuleInst.updateGradInput, + self.cudnnModuleInst, + input, + gradOutput) + timing3 = timing3 + _timeFunction(self.cudnnModuleInst.accGradParameters, + self.cudnnModuleInst, + input, + gradOutput, + scale) + end + local postFree = cutorch.getMemoryUsage() + local cudnnTiming = timing1 + timing2 + timing3 + timings[self.cudnnModuleInst] = { + parameters = nil, + memoryConsumption = preFree - postFree, + timing1, + timing2, + timing3 + } + + -- Only investigate FFT for stride == 1 + local bestTiming = 1e32 + if dW == 1 and dH == 1 then + local bestModule = nil + self.bestModuleInst = nil + local modules + + if iW > 32 or iH > 32 then + -- Don't waste time on inefficient 64x64 or 128x128 convolutions atm + -- TODO: Fix 3 issues: + -- 1. implement fast 64 and 128, + -- 2. drop buffer malloced at each call + -- 3. 
tune FBMM for 64x64 and 128x128 + modules = { + -- requires explicit padding and is slow + -- nn.SpatialConvolutionCuFFT, + nn.SpatialConvolutionFFTTiledSync, + nn.SpatialConvolutionFFTTiledAsync, + -- too slow atm + -- nn.SpatialConvolutionFFTTiledIterated + } + else + modules = { + -- requires explicit padding and is slow + -- nn.SpatialConvolutionCuFFT, + nn.SpatialConvolutionFBFFT, + -- only activate if fbmm perf is suspiciously low + -- nn.SpatialConvolutionFBFFTGemm, activate if suspicious fbmm perf + nn.SpatialConvolutionFFTTiledSync, + nn.SpatialConvolutionFFTTiledAsync, + -- too slow atm + -- nn.SpatialConvolutionFFTTiledIterated + } + end + + for i_mod in pairs(modules) + do + local mod = modules[i_mod] + local extraParameters = {} + if mod == nn.SpatialConvolutionFBFFT or + mod == nn.SpatialConvolutionFBFFTGemm + then + extraParameters = { + -- reuse, streams + {nn.SpatialConvolutionFFT.memoryReuseAll, 16}, + {nn.SpatialConvolutionFFT.memoryReuseNone, 16} + } + elseif mod == nn.SpatialConvolutionFFTTiledSync + or mod == nn.SpatialConvolutionFFTTiledAsync + or mod == nn.SpatialConvolutionFFTTiledIterated + then + -- tileH, tileW, reuse + if kH <= 3 and kW <= 3 then + extraParameters = { + -- Only enable 8 x 8 manually, is often too expensive by default + -- {8, 8, nn.SpatialConvolutionFFT.memoryReuseNone}, + {16, 16, nn.SpatialConvolutionFFT.memoryReuseNone}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + -- {8, 8, nn.SpatialConvolutionFFT.memoryReuseAll}, + {16, 16, nn.SpatialConvolutionFFT.memoryReuseAll}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + } + elseif kH <= 9 and kW <= 9 then + extraParameters = { + {16, 16, nn.SpatialConvolutionFFT.memoryReuseNone}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {16, 16, nn.SpatialConvolutionFFT.memoryReuseAll}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + } + else + extraParameters = { + {32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + } + end + end + + for i_params in pairs(extraParameters) + do + local preFree = cutorch.getMemoryUsage() + local timing1, timing2, timing3, inst = + runModule(mod, + input, + gradOutput, + { nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp + }, + extraParameters[i_params], + inferenceOnly, + scale + ) + + local postFree = cutorch.getMemoryUsage() + local exceedsAdmissibleMemory = true + if inst then + timings[inst] = { + parameters = extraParameters[i_params], + memoryConsumption = preFree - postFree, + timing1, + timing2, + timing3 + } + exceedsAdmissibleMemory = + (self.maximalMemoryOverhead and + (timings[inst].memoryConsumption - + timings[self.cudnnModuleInst].memoryConsumption) > + self.maximalMemoryOverhead) + + end + + if timing1 + timing2 + timing3 < bestTiming and + not exceedsAdmissibleMemory + then + bestTiming = timing1 + timing2 + timing3 + bestModule = mod + if self.bestModuleInst and self.bestModuleInst.cleanupBuffers then + self.bestModuleInst:cleanupBuffers() + end + self.bestModuleInst = inst + elseif inst then + inst:cleanupBuffers() + end + inst = nil + collectgarbage() + collectgarbage() + end + end + + if self.reportLevel >= 3 then + print('Timings: ', timings) + end + if self.reportLevel >= 1 then + print('Best FFT: ', bestTiming, ' ', self.bestModuleInst) + print('cudnn : ', cudnnTiming, ' ', self.cudnnModuleInst) + end + if self.reportLevel >= 2 then + print('FFT detail ', timings[self.bestModuleInst]) + print('CuDNN detail ', timings[self.cudnnModuleInst]) + 
end + + -- Always run correctness check atm, move later to only run when FFT wins. + if bestModule ~= cudnn.SpatialConvolution and self.bestModuleInst then + -- Fail if check fails here, don't fallback to cudnn + self.bestModuleInst.autotuningPass = true + self.bestModuleInst.cudnnDebug = true + self.bestModuleInst.printDebugLevel = -1 + input:normal() + gradOutput:normal() + self.bestModuleInst:reset() + self.bestModuleInst:updateOutput(input) + if not inferenceOnly then + self.bestModuleInst:updateGradInput(input, gradOutput) + self.bestModuleInst:accGradParameters(input, gradOutput, scale) + end + assert(self.bestModuleInst.cudnnChecks) + self.bestModuleInst.autotuningPass = false + self.bestModuleInst.cudnnDebug = false + self.bestModuleInst.printDebugLevel = -1 + end + end + + if bestTiming > cudnnTiming then + self.bestModuleInst = self.cudnnModuleInst + self.bestModuleInst:resetWeightDescriptors() + end + + -- if self.bestModuleInst == self.cudnnModuleInst, just reduces the refcount + -- otherwise prepares for collection + self.cudnnModuleInst = nil + + -- Take as my own + self.weight = self.bestModuleInst.weight + self.output = self.bestModuleInst.output + self.bias = self.bestModuleInst.bias + self.gradWeight = self.bestModuleInst.gradWeight + self.gradBias = self.bestModuleInst.gradBias + + collectgarbage() + collectgarbage() +end + +-- Update output (i.e. forward prop) +function SpatialConvolution:updateOutput(input) + assert(#input:size() == 4, 'Only supports 4-D tensors atm') + + if not self.bestModuleInst then + -- used for tuning consistency + self.batchSize = input:size(1) + self.iH = input:size(3) + self.iW = input:size(4) + self:_tune(self.batchSize, + self.iW, + self.iH, + self.inputPlanes, + self.outputPlanes, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp, + self.inferenceOnly) + end + + assert(self.batchSize == input:size(1), + 'Batches tuned for: ' .. self.batchSize .. ' VS ' .. input:size(1)) + assert(self.inputPlanes == input:size(2), + 'InputPlanes tuned for: ' .. self.inputPlanes .. + ' VS ' .. input:size(2)) + assert(self.iH == input:size(3), + 'InputH tuned for: ' .. self.iH .. ' VS ' .. input:size(3)) + assert(self.iW == input:size(4), + 'InputW tuned for: ' .. self.iW .. ' VS ' .. input:size(4)) + + -- weights are updated each iteration, pass them on + self.bestModuleInst.weight = self.weight + self.output = self.bestModuleInst:updateOutput(input) + self.bias = self.bestModuleInst.bias + + assert(self.outputPlanes == self.output:size(2), + 'OutputPlanes tuned for: ' .. self.outputPlanes .. + ' VS ' .. 
self.output:size(2)) + + assert(self.bestModuleInst) + if torch.type(self.bestModuleInst) ~= 'cudnn.SpatialConvolution' then + assert(self.bestModuleInst.cudnnChecks) + end + + return self.output +end + + +function SpatialConvolution:updateGradInput(input, gradOutput) + assert(self.bestModuleInst, 'Must have been tuned in updateOutput already!') + assert(not self.inferenceOnly, 'Inference only specified => no gradInput ') + self.bestModuleInst.gradInput = + self.bestModuleInst:updateGradInput(input, gradOutput) + self.gradInput = self.bestModuleInst.gradInput + return self.gradInput +end + + +function SpatialConvolution:accGradParameters( + input, gradOutput, scale) + assert(self.bestModuleInst, 'Must have been tuned in updateOutput already!') + assert(not self.inferenceOnly, 'Inference only specified => no accGrads ') + -- gradWeight / gradBias are updated each iteration, pass them on + self.bestModuleInst.gradWeight = self.gradWeight + self.bestModuleInst.gradBias = self.gradBias + self.bestModuleInst:accGradParameters(input, gradOutput, scale) +end + + +function SpatialConvolution:cleanupBuffers() + if self.bestModuleInst and self.bestModuleInst.cleanupBuffers then + self.bestModuleInst:cleanupBuffers() + end + self.bestModuleInst = nil +end diff --git a/fbcunn/SpatialConvolutionCuFFT.lua b/fbcunn/SpatialConvolutionCuFFT.lua index 2de69d5..7031b6b 100644 --- a/fbcunn/SpatialConvolutionCuFFT.lua +++ b/fbcunn/SpatialConvolutionCuFFT.lua @@ -1,298 +1,889 @@ -- Copyright 2004-present Facebook. All Rights Reserved. -local mk = require('multikey') - --- Hoist this in a global buffer module -cudaTensorBuffers = {} -FFTConvolution = 'FFTConvolutionBuffer' -FFTConvolutionTranspose = 'FFTConvolutionTransposeBuffer' -FFTInputBufferType = 0 -FFTInputTransposeBufferType = 1 -FFTOutputBufferType = 2 -FFTOutputTransposeBufferType = 3 -FFTWeightBufferType = 4 -FFTWeightTransposeBufferType = 5 - --- Float assumed, 4 bytes -sizeOfElem = 4 - +require 'cudnn' +local List = require 'pl.List' +local thrift = require('fb.thrift') +local ffi = require 'ffi' +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +--[[ + Actual module +--]] local SpatialConvolutionCuFFT, parent = - torch.class('nn.SpatialConvolutionCuFFT', 'nn.Module') + torch.class('nn.SpatialConvolutionCuFFT', 'nn.SpatialConvolutionFFT') + +function SpatialConvolutionCuFFT:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + + parent.fftImplementation = 'cufft' + + assert(self.padUp == 0 and + self.padDown == 0 and + self.padLeft == 0 and + self.padRight == 0, "cufft does not support implicit padding!") + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + 
assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") -local precision = 0.00002 -local printDebug = false -local debug = false + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + -- Temporary buffers + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.inputPadded) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.outputPadded) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) + assert(not self.weightPadded) + + -- CuFFT plans + assert(not self.cufftPlanInputFFT) + assert(not self.cufftPlanWeightFFT) + assert(not self.cufftPlanOutputFFT) + assert(not self.cufftPlanInputIFFT) + assert(not self.cufftPlanWeightIFFT) + assert(not self.cufftPlanOutputIFFT) +end -function SpatialConvolutionCuFFT:__init(nInputPlane, nOutputPlane, - kW, kH, dW, dH) - parent.__init(self) +--[[ + Helper function to perform explicit padding + In the case of cufft, padding must be explicit with zeros on the + inputs of the algorithm. fbfft does not need this. +--]] +function SpatialConvolutionCuFFT:isOutputOfPass(pass, tensor) + assert(pass == nn.SpatialConvolutionFFT.ForwardFFTPass or + pass == nn.SpatialConvolutionFFT.BackwardFFTPass or + pass == nn.SpatialConvolutionFFT.AccGradientFFTPass) + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + tensor == self.output + then + return true + end + if pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + tensor == self.gradInput + then + return true + end + if pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + tensor == self.gradWeight + then + return true + end + return false +end - self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kW = kW - self.kH = kH - self.dW = dW or 1 - self.dH = dH or 1 +function SpatialConvolutionCuFFT:fftPadding(tensor, pass, inputTensor) + -- Always input, weight, output + local tensorList = {} + local paddedList = {} + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + tensorList = {tensor, self.weight, self.output} + paddedList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + tensorList = {self.gradInput, self.weight, tensor} + paddedList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + tensorList = {inputTensor, self.gradWeight, tensor} + paddedList = {self.inputPadded, self.weightPadded, self.outputPadded} + end - assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + for ind = 1, #tensorList do + -- If we have a non empty padded tensor + if paddedList[ind] and paddedList[ind]:nElement() > 0 then + local _orig = tensorList[ind] + local padded = paddedList[ind] + if not self:isOutputOfPass(pass, tensorList[ind]) then + local sizes = tensorList[ind]:size() + local paddedSizes = paddedList[ind]:size() + -- resize messes up strides, I want a fortran subarray here, + -- do it manually + padded:set(padded:storage(), + padded:storageOffset(), + sizes, + padded:stride()) + 
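+          -- Strided-view trick: the padded buffer now temporarily carries the
+          -- unpadded sizes but keeps its own (larger) strides, so the copy
+          -- below lands in the top-left window of the padded storage; the
+          -- second set restores paddedSizes and exposes the full buffer,
+          -- whose untouched elements supply the zero padding (assuming the
+          -- buffer was zero-filled when it was obtained).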
padded:copy(tensorList[ind]) + -- make tensor full again, it is now contiguous and zero padded + padded:set(padded:storage(), + padded:storageOffset(), + paddedSizes, padded:stride()) + end + end + end - self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) - self.bias = torch.Tensor(nOutputPlane) - self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) - self.gradBias = torch.Tensor(nOutputPlane) + -- swap original and padded tensors to be transparent for the + -- convolution pass + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + tensor, self.inputPadded = self.inputPadded, tensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + self.output, self.outputPadded = self.outputPadded, self.output + end + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + self.gradInput, self.inputPadded = self.inputPadded, self.gradInput + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + inputTensor, self.inputPadded = self.inputPadded, inputTensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.gradWeight, self.weightPadded = self.weightPadded, self.gradWeight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end + end - self:reset() + return tensor, inputTensor end -function SpatialConvolutionCuFFT:reset(stdv) - if stdv then - stdv = stdv * math.sqrt(3) - else - stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane) + +--[[ + Helper function to undo padding + In the case of cufft, padding must be explicit with zeros on the + inputs of the algorithm. fbfft does not need this. +--]] +function SpatialConvolutionCuFFT:fftUnpadding(tensor, pass, inputTensor) + -- Always input, weight, output + local tensorList = {} + local paddedList = {} + -- Here the paddedList and tensorList are reversed compared to fftPadding + -- Only true for those tensors that are actually padded (i.e. self. 
+ -- inputPadded both non nil and not empty) + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + paddedList = {tensor, self.weight, self.output} + tensorList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + paddedList = {self.gradInput, self.weight, tensor} + tensorList = {self.inputPadded, self.weightPadded, self.outputPadded} + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + paddedList = {inputTensor, self.gradWeight, tensor} + tensorList = {self.inputPadded, self.weightPadded, self.outputPadded} end - if nn.oldSeed then - self.weight:apply(function() - return torch.uniform(-stdv, stdv) - end) - self.bias:apply(function() - return torch.uniform(-stdv, stdv) - end) - else - self.weight:uniform(-stdv, stdv) - self.bias:uniform(-stdv, stdv) + + for ind = 1, #tensorList do + -- If we have a non-empty padded tensor + if tensorList[ind] and tensorList[ind]:nElement() > 0 then + local orig = tensorList[ind] + local padded = paddedList[ind] + if self:isOutputOfPass(pass, paddedList[ind]) then + local sizes = tensorList[ind]:size() + local paddedSizes = paddedList[ind]:size() + -- resize messes up strides, I want a fortran subarray here, + -- do it manually + padded:set(padded:storage(), + padded:storageOffset(), + sizes, + padded:stride()) + orig:copy(padded) + -- make tensor full again, it is now contiguous and zero padded + padded:set(padded:storage(), + padded:storageOffset(), + paddedSizes, + padded:stride()) + end + end end -end -local function debugVSMM(pass, module, toTest, fun, param1, param2, param3) - local o = toTest:float():clone() - toTest:zero() - module.padding = 0 - module.finput = torch.CudaTensor() - module.fgradInput = torch.CudaTensor() - -- linearize weight for MM - module.gradWeight = - module.gradWeight:view(module.nOutputPlane, - module.nInputPlane * module.kH * module.kW) - module.weight = - module.weight:view(module.nOutputPlane, - module.nInputPlane * module.kH * module.kW) - local test = fun(module, param1, param2, param3) - -- reset layout of weight after MM - module.gradWeight = - module.gradWeight:view(module.nOutputPlane, - module.nInputPlane, - module.kH, - module.kW) - module.weight = - module.weight:view(module.nOutputPlane, - module.nInputPlane, - module.kH, - module.kW) - local norm = math.sqrt(test:float():dot(test:float()) + 1e-8) - if test:float():dist(o:float()) / norm > precision then - print('error ', pass, test:float():dist(o:float()) / norm, precision) - os.exit() - elseif printDebug then - print('debug vs MM check passes ', - pass, o:min(), o:max(), o:mean(), o:std(), o:sum()) + -- swap original and padded tensors to be transparent for the + -- convolution pass + if pass == nn.SpatialConvolutionFFT.ForwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + tensor, self.inputPadded = self.inputPadded, tensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + self.output, self.outputPadded = self.outputPadded, self.output + end + elseif pass == nn.SpatialConvolutionFFT.BackwardFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + self.gradInput, self.inputPadded = self.inputPadded, self.gradInput + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.weight, self.weightPadded = self.weightPadded, self.weight + end + if 
self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end + elseif pass == nn.SpatialConvolutionFFT.AccGradientFFTPass then + if self.inputPadded and self.inputPadded:nElement() > 0 then + inputTensor, self.inputPadded = self.inputPadded, inputTensor + end + if self.weightPadded and self.weightPadded:nElement() > 0 then + self.gradWeight, self.weightPadded = self.weightPadded, self.gradWeight + end + if self.outputPadded and self.outputPadded:nElement() > 0 then + tensor, self.outputPadded = self.outputPadded, tensor + end end + + return tensor, inputTensor end -function SpatialConvolutionCuFFT:updateOutput(input) - self:prepareBuffers(input:size()) - input.nn.SpatialConvolutionCuFFT_updateOutput(self, input) +function SpatialConvolutionCuFFT:prepareSizeAndBuffers(i, w, o, metaData) + return self:prepareCuFFTSizeAndBuffers(i, w, o, metaData, metaData.pass) +end - if debug == true then - debugVSMM("updateOutput", - self, - self.output, - input.nn.SpatialConvolutionMM_updateOutput, - input) +--[[ + Update output +--]] +function SpatialConvolutionCuFFT:updateOutputFFTImpl(input, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.ForwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, self.output, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local weightFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- In cufft mode, we have explicit padding tensors + input = self:fftPadding(input, nn.SpatialConvolutionFFT.ForwardFFTPass) + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. FFT + transpose input and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) + then + cutorch.setStream(inputFFTStream) + self.cufftPlanInputFFT = + fftWrapper:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + inputFFTStream, -- stream + self.cufftPlanInputFFT) end + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + cutorch.setStream(weightFFTStream) + self.cufftPlanWeightFFT = + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream, -- stream + self.cufftPlanWeightFFT) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, input) + + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- a. 
multiple GEMMs on multiple streams + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'c', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, weightFFTStream}) + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {}, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'c', + 1.0 / norm) + end + + -- 3. transpose + IFFT output + cutorch.setStream(gemmStream) + self.cufftPlanOutputIFFT = + fftWrapper:transposeIFFT(self.output, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream, -- stream + self.cufftPlanOutputIFFT) + + -- ############################################## + -- Padding / unpadding perform copies on default stream, synchronize all + cutorch.streamBarrier(self.allStreams) + + -- 4. If cufft, needs resize + self:fftUnpadding(input, nn.SpatialConvolutionFFT.ForwardFFTPass) + + -- Synchronize all: Padding / unpadding perform copies on default stream + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 5. Finally, bias update + cutorch.setStream(gemmStream) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return self.output end -function SpatialConvolutionCuFFT:explorePerformance(input, batches, - inputs, planes, inputRows, inputCols, kernelRows, kernelCols) - input.nn.SpatialConvolutionCuFFT_explorePerformance(self, batches, - inputs, planes, inputRows, inputCols, kernelRows, kernelCols) -end +--[[ + Update input gradients +--]] + + +function SpatialConvolutionCuFFT:updateGradInputFFTImpl( + input, gradOutput, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.BackwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local weightFFTStream = 1 + local gradOutputFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- If cufft, we may have padding tensors into which to copy the data + gradOutput = self:fftPadding(gradOutput, + nn.SpatialConvolutionFFT.BackwardFFTPass) + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. 
FFT + transpose gradOutput and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) + then + cutorch.setStream(gradOutputFFTStream) + self.cufftPlanOutputFFT = + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gradOutputFFTStream, -- stream + self.cufftPlanOutputFFT) + end -function SpatialConvolutionCuFFT:cleanupBuffers(input) - input.nn.SpatialConvolutionCuFFT_cleanupBuffers() -end + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(weightFFTStream) + self.cufftPlanWeightFFT = + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream, -- stream + self.cufftPlanWeightFFT) + end -function SpatialConvolutionCuFFT:updateGradInput(input, gradOutput) - self:prepareBuffers(input:size()) - input.nn.SpatialConvolutionCuFFT_updateGradInput(self, gradOutput) - - if debug == true then - debugVSMM("updateGradInput", - self, - self.gradInput, - input.nn.SpatialConvolutionMM_updateGradInput, - input, - gradOutput) + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'n', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {weightFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'n', + 1.0 / norm) end - return self.gradInput -end + -- 3. 
transpose + IFFT gradInput + cutorch.setStream(gemmStream) + self.cufftPlanInputIFFT = + fftWrapper:transposeIFFT(self.gradInput, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream, -- stream + self.cufftPlanInputIFFT) -local -function wrapMM_accGradParameters_gradWeight(module, input, gradOutput, scale) - input.nn.SpatialConvolutionMM_accGradParameters( - module, input, gradOutput, scale) - return module.gradWeight -end + -- ############################################## + -- Padding / unpadding perform copies on default stream, synchronize all + cutorch.streamBarrier(self.allStreams) -local -function wrapMM_accGradParameters_gradBias(module, input, gradOutput, scale) - input.nn.SpatialConvolutionMM_accGradParameters( - module, input, gradOutput, scale) - return module.gradBias + -- 4. If cufft, needs resize + self:fftUnpadding(gradOutput, nn.SpatialConvolutionFFT.BackwardFFTPass) + + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 5. No bias operation + + return self.gradInput end -function SpatialConvolutionCuFFT:accGradParameters(input, gradOutput, scale) + +--[[ + Accumulate weight gradients +--]] +function SpatialConvolutionCuFFT:accGradParametersFFTImpl( + input, gradOutput, scale, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") scale = scale or 1 - self:prepareBuffers(input:size()) - input.nn.SpatialConvolutionCuFFT_accGradParameters( - self, input, gradOutput, scale) - - if debug == true then - self.gradBias:zero() -- zero first to avoid accumulation - debugVSMM("accGradParameters_gradWeight", - self, - self.gradWeight, - wrapMM_accGradParameters_gradWeight, - input, - gradOutput, - scale) - local saveBias = self.gradBias:float():clone() - self.gradWeight:zero() - self.gradBias:zero() - debugVSMM("accGradParameters_gradBias", - self, - saveBias, - wrapMM_accGradParameters_gradBias, - input, - gradOutput, - scale) + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.AccGradientFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.gradWeight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local gradOutputFFTStream = 2 + local gradBiasFFTStream = 3 + local gemmStream = 4 + assert(cutorch.getNumStreams() >= gemmStream) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- If cufft, we may have padding tensors into which to copy the data + gradOutput, input = self:fftPadding( + gradOutput, nn.SpatialConvolutionFFT.AccGradientFFTPass, input) + assert(self.gradWeight:size(3) == commonSize[3]) + assert(self.gradWeight:size(4) == commonSize[4]) + + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 0. gradBIas update is independent + cutorch.setStream(gradBiasFFTStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + -- 1. 
FFT + transpose gradOutput and weights + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(gradOutputFFTStream) + self.cufftPlanOutputFFT = + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, + gradOutputFFTStream, + self.cufftPlanOutputFFT) end -end --- Type: input/gradInput, output/gradOutput or weight/gradWeight --- Could lookup bit operations in lua and do in 1 line, just use a loop atm -local function nextPowerOf2(val) - for i = 1, 10 do - if (2 ^ i) >= val then - return (2 ^ i) - end + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) + then + cutorch.setStream(inputFFTStream) + self.cufftPlanInputFFT = + fftWrapper:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 2, + inputFFTStream, + self.cufftPlanInputFFT) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'c', + 'n', + (1.0 * scale) / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'c', + 'n', + (1.0 * scale) / norm) end - assert(false, 'Too large a convolution dimensions: ', val) + + -- 3. transpose + IFFT gradInput + cutorch.setStream(gemmStream) + self.cufftPlanWeightIFFT = + fftWrapper:transposeIFFT(self.gradWeight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream, -- stream + self.cufftPlanWeightIFFT) + + -- ############################################## + -- Padding / unpadding perform copies on default stream, synchronize all + cutorch.streamBarrier(self.allStreams) + + -- 4. 
If cufft, needs resize + self:fftUnpadding( + gradOutput, nn.SpatialConvolutionFFT.AccGradientFFTPass, input) + assert(self.gradWeight:size(3) == self.kH) + assert(self.gradWeight:size(4) == self.kW) + + -- Padding / unpadding perform copies on default stream, synchronize all + -- ############################################## + cutorch.streamBarrier(self.allStreams) end -function SpatialConvolutionCuFFT:prepareBuffers(inputSize) - self.inputBuffer = getBuffer(FFTConvolution, FFTInputBufferType, inputSize) - self.inputTransposeBuffer = getBuffer( - FFTConvolutionTranspose, FFTInputTransposeBufferType, inputSize) - - bufferSizesO = torch.LongStorage(4) - bufferSizesO[1] = inputSize[1] -- batch - bufferSizesO[2] = self.nOutputPlane -- output planes - bufferSizesO[3] = inputSize[3] -- input x is always max for buffer - bufferSizesO[4] = inputSize[4] -- input y is always max for buffer - self.outputBuffer = getBuffer(FFTConvolution, FFTOutputBufferType, bufferSizesO) - self.outputTransposeBuffer = getBuffer( - FFTConvolutionTranspose, FFTOutputTransposeBufferType, bufferSizesO) - - bufferSizesW = torch.LongStorage(4) - bufferSizesW[1] = self.nOutputPlane -- output planes - bufferSizesW[2] = self.nInputPlane -- input planes - bufferSizesW[3] = inputSize[3] -- input x is always max for buffer - bufferSizesW[4] = inputSize[4] -- input y is always max for buffer - self.weightBuffer = getBuffer(FFTConvolution, FFTWeightBufferType, bufferSizesW) - self.weightTransposeBuffer = getBuffer( - FFTConvolutionTranspose, FFTWeightTransposeBufferType, bufferSizesW) - - if self.inputBuffer and self.inputTransposeBuffer and - self.outputBuffer and self.outputTransposeBuffer and - self.weightBuffer and self.weightTransposeBuffer then +--[[ + -- Buffer creation and reuse given a size and a pass. + -- Different passes use different tensors as the 'output of the pass'. + -- nn.SpatialConvolutionFFT.ForwardFFTPass -> output + -- nn.SpatialConvolutionFFT.BackwardFFTPass -> input + -- nn.SpatialConvolutionFFT.AccGradientFFTPass -> weight + -- The buffers corresponding to the tensors that is the 'output of the pass' + -- must be properly transposed in order for the CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct properly. 
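+  -- Concretely, for cufft the function below requests six buffers per pass:
+  -- three explicit zero-padding buffers (input / output / weight) sized from
+  -- commonSize, {commonSize[1], nOutputPlane, commonSize[3], commonSize[4]}
+  -- and {nOutputPlane, nInputPlane, commonSize[3], commonSize[4]}
+  -- respectively, plus the three matching complex transpose buffers consumed
+  -- by the CGEMM (commonSize[3..4] being the FFT interpolation sizes,
+  -- presumably).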
+--]] +function SpatialConvolutionCuFFT:prepareBuffers(commonSize, pass, metaData) + assert(commonSize and pass and self.fftImplementation) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + if not parent.prepareBuffers(self, commonSize, pass, metaData) + then + return false + end + + local bufferSizesO = torch.LongStorage({ + commonSize[1], self.nOutputPlane, commonSize[3], commonSize[4]}) + local bufferSizesW = torch.LongStorage({ + self.nOutputPlane, self.nInputPlane, commonSize[3], commonSize[4]}) + + self.inputPadded = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTPaddedInputBuffer, + commonSize, + false, + metaData) + self.outputPadded = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTPaddedOutputBuffer, + bufferSizesO, + false, + metaData) + self.weightPadded = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTPaddedWeightBuffer, + bufferSizesW, + false, + metaData) + + self.inputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType, + commonSize, + true, + metaData) + self.outputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType, + bufferSizesO, + true, + metaData) + self.weightTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType, + bufferSizesW, + true, + metaData) + + if self.inputTransposeBuffer and self.inputPadded and + self.outputTransposeBuffer and self.outputPadded and + self.weightTransposeBuffer and self.weightPadded then return true end - -- From here on, we should find failsafe to another SpatialConvolution - self.inputBuffer = nil + print('Not enough memory for CuFFT buffers, need to fall back') + + -- TODO: From here on, we should failsafe to another SpatialConvolution + self:cleanupBuffers() + + return false +end + +function SpatialConvolutionCuFFT:cleanupBuffers() + parent.cleanupBuffers(self) + + -- Kill cufft plans references to trigger GC + self.cufftPlanInputFFT = nil + self.cufftPlanWeightFFT = nil + self.cufftPlanOutputFFT = nil + self.cufftPlanInputIFFT = nil + self.cufftPlanWeightIFFT = nil + self.cufftPlanOutputIFFT = nil + + -- Kill local references to global buffers self.inputTransposeBuffer = nil - self.outputBuffer = nil + self.inputPadded = nil self.outputTransposeBuffer = nil - self.weightBuffer = nil + self.outputPadded = nil self.weightTransposeBuffer = nil - freeBuffer(FFTConvolution, FFTInputBufferType, inputSize) - freeBuffer(FFTConvolutionTranspose, FFTInputTransposeBufferType, inputSize) - freeBuffer(FFTConvolution, FFTOutputBufferType, bufferSizesO) - freeBuffer(FFTConvolutionTranspose, FFTOutputTransposeBufferType, bufferSizesO) - freeBuffer(FFTConvolution, FFTWeightBufferType, bufferSizesW) - freeBuffer(FFTConvolutionTranspose, FFTWeightTransposeBufferType, bufferSizesW) + self.weightPadded = nil +end - collectgarbage() - collectgarbage() - return false -end + -- TODO: CuFFT is more flexible to allow for arbitrary FFT interpolation sizes. + -- When writing the autotuner, it is easy to get different interpolation sizes + -- for the FFTs in the 3 passes, perform best. + -- For correction of reuse, reuse should only work if interpolation sizes are + -- the same between 2 passes. + -- In practice this means supporting real producer / consumer semantics in the + -- tag space. In particular we need to match any read to a unique write and + -- ensure they occur in the proper order. 
+ -- For instance, there is no reason that updateGradInput occurs before + -- accGradParameters so we need to ensure the first one writes gradOutput and + -- the second one reads it +function SpatialConvolutionCuFFT:getBufferKey(BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + -- If no reuse, we hit into the buffers discrimianted by device and + -- BufferType. These buffers are shared with all FFT convolution modules + -- and do not allow reuse for long dependences (i.e. only gradOutput can + -- only be reused from a supporting backward implementation) + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + return parent.getBufferKeyGeneric(self, BufferType) + end -function getBuffer(OperationType, tensorType, tensorSizes) - d1 = tensorSizes[1] - d2 = tensorSizes[2] - -- Preemptively resize to d1 . d2 . 2^x . 2^y - d3 = math.max(nextPowerOf2(tensorSizes[3]), nextPowerOf2(tensorSizes[4])) - d4 = d3 - assert(d3 == d4, 'Squared fft convolution to support fbfft') - numElements = d1 * d2 * d3 * (d4 / 2 + 1) * 2 - - storage = torch.LongStorage(5) - - storage[1] = d1 - storage[2] = d2 - storage[3] = d3 - storage[4] = d4 / 2 + 1 - storage[5] = 2 - - -- Conservative max buffer size, always needed at least by fbfft - -- Handle memory bloat by tiled convolutions + inplace fft - if mk.get(cudaTensorBuffers, - OperationType, - tensorType, - cutorch.getDevice()) == nil then - local free_bytes, total_bytes = cutorch.getMemoryUsage() - if numElements * sizeOfElem > free_bytes then - return nil - end + if not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + assert(false, "unknown memory reuse policy " .. self.memoryReusePolicy) + end - mk.put(cudaTensorBuffers, OperationType, tensorType, cutorch.getDevice(), - torch.CudaTensor(storage)) - else - -- Storage already exists but may need resizing. 
- -- If resizing means expanding, make sure we have enough space - t = mk.get(cudaTensorBuffers, OperationType, tensorType, cutorch.getDevice()) - if numElements > t:nElement() then - -- Don't call cuda API unless really needed - local free_bytes, total_bytes = cutorch.getMemoryUsage() - if (numElements - t:nElement()) * sizeOfElem > free_bytes then - return nil - end + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the transposed complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + isOutputOfAlgorithm = true end - t:resize(storage) + md.isOutputOfAlgorithm = isOutputOfAlgorithm end - t = mk.get(cudaTensorBuffers, OperationType, tensorType, cutorch.getDevice()) - return t -end + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + md = nil + end -function freeBuffer(OperationType, tensorType, tensorSizes) - mk.put(cudaTensorBuffers, - OperationType, - tensorType, - cutorch.getDevice(), nil) + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res end diff --git a/fbcunn/SpatialConvolutionFBFFT.lua b/fbcunn/SpatialConvolutionFBFFT.lua new file mode 100644 index 0000000..015dfcf --- /dev/null +++ b/fbcunn/SpatialConvolutionFBFFT.lua @@ -0,0 +1,433 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
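+
+--[[
+   Usage sketch (illustrative values only; the constructor signature matches
+   the __init defined below, `input` is assumed to be a 4D
+   batch x plane x height x width CUDA tensor, and the module is assumed to
+   be driven through the usual nn forward/backward interface inherited from
+   the parent class):
+
+     local conv = nn.SpatialConvolutionFBFFT(
+        64, 128,   -- nInputPlane, nOutputPlane
+        3, 3,      -- kW, kH
+        1, 1,      -- dW, dH (FFT convolutions are stride-1 only)
+        1, 1)      -- padLeft, padUp (must be smaller than the kernel)
+     local output = conv:forward(input)
+--]]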
+ +require 'cudnn' +local thrift = require('fb.thrift') +local ffi = require 'ffi' + +local lib_name = 'torch_fb_fbcunn_mm' +local lib_path = package.searchpath(lib_name, package.cpath) +local FBMMFFI = ffi.load(lib_path and lib_path or lib_name) + +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +--[[ + Actual module +--]] +local SpatialConvolutionFBFFT, parent = + torch.class('nn.SpatialConvolutionFBFFT', 'nn.SpatialConvolutionFFT') + +-- memoryReusePolicy is one of: +-- SpatialConvolutionFFT.memoryReuseNone +-- SpatialConvolutionFFT.memoryReuseWeight +-- SpatialConvolutionFFT.memoryReuseInput +-- SpatialConvolutionFFT.memoryReuseOutput +function SpatialConvolutionFBFFT:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(torch.type(padLeft) == 'number') + assert(torch.type(padUp) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init( + self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padLeft, padUp, + memoryReusePolicy, numCudaStreams) + parent.fftImplementation = 'fbfft' + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + -- Temporary buffers + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.inputPadded) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.outputPadded) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) + assert(not self.weightPadded) + + -- FBFFT plans, useless for fbfft + assert(not self.cufftPlanInputFFT) + assert(not self.cufftPlanWeightFFT) + assert(not self.cufftPlanOutputFFT) + assert(not self.cufftPlanInputIFFT) + assert(not self.cufftPlanWeightIFFT) + assert(not self.cufftPlanOutputIFFT) + + assert(self.padUp < self.kH and self.padDown < self.kH and + self.padLeft < self.kW and self.padRight < self.kW, + "Padding must be smaller than kernel") +end + +function SpatialConvolutionFBFFT:prepareSizeAndBuffers(i, w, o, metaData) + return self:prepareFBFFTSizeAndBuffers(i, w, o, metaData) +end + +function SpatialConvolutionFBFFT:updateOutputFFTImpl(input, reuseList, metaData) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + if not metaData then + metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.ForwardFFTPass + end + + local 
commonSize = + self:prepareSizeAndBuffers(input, self.weight, self.output, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local weightFFTStream = 2 + local fbmmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. FFTs + if not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTInputBufferType) + then + -- Potentially reuse buffer if so told + -- Makes sense because we could asynchronously compute these AoT + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + cutorch.setStream(inputFFTStream) + fftWrapperPadded:fft(input, self.inputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {inputFFTStream}) + end + + if not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTWeightBufferType) + then + -- Potentially reuse buffer if so told + -- Makes sense because we could asynchronously compute these AoT + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(weightFFTStream) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {weightFFTStream}) + end + + -- 2. GEMM with in place transpose + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(fbmmStream) + local norm = self:getNormalizationFactor(commonSize, input) + FBMMFFI.transposeMMFFI(cutorch._state, + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + self.outputBuffer:cdata(), + 1.0 / norm, + false, + true, + false) + + -- 3. IFFT + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(fbmmStream) + fftWrapper:ffti(self.output, self.outputBuffer, cublasBatchDims) + + -- 4. Finally, bias update + if not metaData.skipBias then + cutorch.setStream(fbmmStream) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + end + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + + +--[[ + Update input gradients +--]] +function SpatialConvolutionFBFFT:updateGradInputFFTImpl( + input, gradOutput, reuseList, metaData) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + if not metaData then + metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.BackwardFFTPass + end + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local gradInputFFTStream = 1 + local gradOutputFFTStream = 2 + local fbmmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 1. 
FFTs + if (not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTOutputBufferType)) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fft(gradOutput, self.outputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {gradOutputFFTStream}) + end + + if (not reuseList or + not reuseList:contains(nn.SpatialConvolutionFFT.FFTWeightBufferType)) + and not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(gradInputFFTStream) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {gradInputFFTStream}) + end + + -- 2. GEMM with in place transpose + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(fbmmStream) + local norm = self:getNormalizationFactor(commonSize, gradOutput) + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.weightBuffer:cdata(), + self.inputBuffer:cdata(), + 1.0 / norm, + false, + false, + false) + + -- 3. IFFT + cutorch.setStream(fbmmStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:ffti(self.gradInput, self.inputBuffer, cublasBatchDims) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.gradInput +end + + +--[[ + Accumulate weight gradients +--]] +function SpatialConvolutionFBFFT:accGradParametersFFTImpl( + input, gradOutput, scale, reuseList, metaData) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local scale = scale or 1 + + if not metaData then + metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.AccGradientFFTPass + end + + local commonSize = + self:prepareSizeAndBuffers(input, self.gradWeight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local gradOutputFFTStream = 2 + local gradBiasFFTStream = 3 + local fbmmStream = 4 + assert(cutorch.getNumStreams() >= 4) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ######################################### + cutorch.streamBarrier(self.allStreams) + + -- 0. Bias update is independent + if not metaData.skipBias then + cutorch.setStream(gradBiasFFTStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + end + + -- 1. 
FFTs + if (not reuseList or not reuseList:contains( + nn.SpatialConvolutionFFT.FFTOutputBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fft(gradOutput, self.outputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {gradOutputFFTStream}) + end + + if (not reuseList or not reuseList:contains( + nn.SpatialConvolutionFFT.FFTInputBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) + then + -- Potentially reuse buffer if so told + cutorch.setStream(inputFFTStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fft(input, self.inputBuffer, cublasBatchDims) + cutorch.setStream(fbmmStream) + cutorch.streamWaitFor(fbmmStream, {inputFFTStream}) + end + + -- 2. GEMM with in place transpose + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(fbmmStream) + local norm = self:getNormalizationFactor(commonSize, gradOutput) + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + (1.0 * scale) / norm, + true, + false, + false) + + -- 3. IFFT + cutorch.setStream(fbmmStream) + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:ffti(self.gradWeight, self.weightBuffer, cublasBatchDims) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ######################################### + cutorch.streamBarrier(self.allStreams) +end + + +function SpatialConvolutionFBFFT:getBufferKey(BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', + torch.type(metaData)) + + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) + then + return parent.getBufferKeyGeneric(self, BufferType) + end + + if not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + assert(false, "unknown memory reuse policy " .. 
self.memoryReusePolicy) + end + + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTOutputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTInputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTWeightBufferType) + then + isOutputOfAlgorithm = true + end + md.isOutputOfAlgorithm = isOutputOfAlgorithm + end + + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + md = nil + end + + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res +end + +function SpatialConvolutionFBFFT:cleanupBuffers() + parent.cleanupBuffers(self) +end diff --git a/fbcunn/SpatialConvolutionFBFFTGemm.lua b/fbcunn/SpatialConvolutionFBFFTGemm.lua new file mode 100644 index 0000000..af73204 --- /dev/null +++ b/fbcunn/SpatialConvolutionFBFFTGemm.lua @@ -0,0 +1,599 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. + +require 'cudnn' +local ffi = require 'ffi' +-- TODO: @soumith, any better way than this fully convoluted path ? 
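+-- One possibility, already used by SpatialConvolutionCuFFT.lua and
+-- SpatialConvolutionFBFFT.lua in this patch, is to resolve the library via
+-- package.searchpath and fall back to the bare name:
+--
+--   local lib_name = 'torch_fb_fbcunn_convolution_bias'
+--   local lib_path = package.searchpath(lib_name, package.cpath)
+--   local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name)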
+local ConvolutionBiasFFI = ffi.load('torch_fb_fbcunn_convolution_bias') +local thrift = require('fb.thrift') + +ffi.cdef[[ + void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); + void accGradParametersBiasFFI( + THCState*, THCudaTensor*, THCudaTensor*, float scale); +]] + +--[[ + Actual module +--]] +local SpatialConvolutionFBFFTGemm, parent = + torch.class('nn.SpatialConvolutionFBFFTGemm', 'nn.SpatialConvolutionFFT') + +function SpatialConvolutionFBFFTGemm:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + + parent.fftImplementation = 'fbfft' + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + -- Temporary buffers + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) +end + +function SpatialConvolutionFBFFTGemm:prepareSizeAndBuffers(i, w, o, metaData) + return self:prepareFBFFTGemmSizeAndBuffers(i, w, o, metaData, metaData.pass) +end + +--[[ + Update output +--]] +function SpatialConvolutionFBFFTGemm:updateOutputFFTImpl(input, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.ForwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, self.output, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local weightFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + -- 1. 
FFT + transpose input and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) + then + cutorch.setStream(inputFFTStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + inputFFTStream -- stream + ) + end + + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + cutorch.setStream(weightFFTStream) + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream -- stream + ) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, input) + + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- a. multiple GEMMs on multiple streams + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'c', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, weightFFTStream}) + cublasWrapper:matmultComplex(self.inputTransposeBuffer, + self.weightTransposeBuffer, + self.outputTransposeBuffer, + {}, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'c', + 1.0 / norm) + end + + -- 3. transpose + IFFT output + cutorch.setStream(gemmStream) + fftWrapper:transposeIFFT(self.output, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream -- stream + ) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- 4. 
Finally, bias update + cutorch.setStream(gemmStream) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + +--[[ + Update input gradients +--]] + + +function SpatialConvolutionFBFFTGemm:updateGradInputFFTImpl( + input, gradOutput, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.BackwardFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.weight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local weightFFTStream = 1 + local gradOutputFFTStream = 2 + local gemmStream = 3 + assert(cutorch.getNumStreams() >= 3) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + + -- 1. FFT + transpose gradOutput and weights + if not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) + then + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gradOutputFFTStream -- stream + ) + end + + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(weightFFTStream) + fftWrapper:fftTranspose(self.weight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 2, -- handle + weightFFTStream -- stream + ) + end + + -- 2. CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'n', + 'n', + 1.0 / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {weightFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.weightTransposeBuffer, + self.inputTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'n', + 'n', + 1.0 / norm) + end + + -- 3. 
transpose + IFFT gradInput + cutorch.setStream(gemmStream) + + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:transposeIFFT(self.gradInput, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream -- stream + ) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.gradInput +end + + +--[[ + Accumulate weight gradients +--]] +function SpatialConvolutionFBFFTGemm:accGradParametersFFTImpl( + input, gradOutput, scale, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + scale = scale or 1 + + local metaData = {} + metaData.pass = nn.SpatialConvolutionFFT.AccGradientFFTPass + + local commonSize = + self:prepareSizeAndBuffers(input, self.gradWeight, gradOutput, metaData) + + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#input:size() == cublasBatchDims + 2) + + local inputFFTStream = 1 + local gradOutputFFTStream = 2 + local gradBiasFFTStream = 3 + local gemmStream = 4 + assert(cutorch.getNumStreams() >= gemmStream) + + -- Synchronize all streams on SESE, change when we have a proper DAG impl + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + + -- 0. gradBIas update is independent + cutorch.setStream(gradBiasFFTStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + -- 1. FFT + transpose gradOutput and weights + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + -- TODO: fix this: transpose changes the TH metadata post buffer + -- get/put which screws up the tensor + cutorch.setStream(gradOutputFFTStream) + fftWrapper:fftTranspose(gradOutput, + self.outputBuffer, + self.outputTransposeBuffer, + cublasBatchDims, + 1, + gradOutputFFTStream) + end + + if (not reuseList or + not reuseList:contains( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType)) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) + then + cutorch.setStream(inputFFTStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fftTranspose(input, + self.inputBuffer, + self.inputTransposeBuffer, + cublasBatchDims, + 2, + inputFFTStream) + end + + -- 2. 
CGEMM on transposed tensors + -- This call uses all the handles and streams available + -- CuBLAS is column major and computes C' = B' * A' + local useBatchedMM = (commonSize[3] * commonSize[4] >= 128) + local cublasWrapper = nn.CuBLASWrapper() + local norm = self:getNormalizationFactor(commonSize, gradOutput) + if not useBatchedMM then + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + {0, 1}, -- iterDims == 2 + { }, -- cublasBatchDims + 'c', + 'n', + (1.0 * scale) / norm) + + -- Synchronize all streams: iterated GEMMS use all available streams + -- ############################################## + cutorch.streamBarrier(self.allStreams) + else + -- stream must match the IFFT stream for sync without waiting + -- explicitly + cutorch.setStream(gemmStream) + cutorch.streamWaitFor(gemmStream, {inputFFTStream, gradOutputFFTStream}) + + cublasWrapper:matmultComplex(self.outputTransposeBuffer, + self.inputTransposeBuffer, + self.weightTransposeBuffer, + { }, -- iterDims + {0, 1}, -- cublasBatchDims == 2 + 'c', + 'n', + (1.0 * scale) / norm) + end + + -- 3. transpose + IFFT gradInput + cutorch.setStream(gemmStream) + fftWrapper:transposeIFFT(self.gradWeight, + self.weightBuffer, + self.weightTransposeBuffer, + cublasBatchDims, + 1, -- handle + gemmStream -- stream + ) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) +end + + +--[[ + -- Buffer creation and reuse given a size and a pass. + -- Different passes use different tensors as the 'output of the pass'. + -- nn.SpatialConvolutionFFT.ForwardFFTPass -> output + -- nn.SpatialConvolutionFFT.BackwardFFTPass -> input + -- nn.SpatialConvolutionFFT.AccGradientFFTPass -> weight + -- The buffers corresponding to the tensors that is the 'output of the pass' + -- must be properly transposed in order for the CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct properly. 
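+  -- For instance, in updateOutput the CGEMM writes its result into the output
+  -- transpose buffer, which transposeIFFT then turns back into self.output;
+  -- updateGradInput and accGradParameters play the same role for gradInput
+  -- and gradWeight respectively.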
+--]] +function SpatialConvolutionFBFFTGemm:prepareBuffers(commonSize, pass, metaData) + assert(commonSize and pass and self.fftImplementation) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + if not parent.prepareBuffers(self, commonSize, pass, metaData) + then + return false + end + + local bufferSizesO = torch.LongStorage({ + commonSize[1], self.nOutputPlane, commonSize[3], commonSize[4]}) + local bufferSizesW = torch.LongStorage({ + self.nOutputPlane, self.nInputPlane, commonSize[3], commonSize[4]}) + + self.inputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType, + commonSize, + true, + metaData) + self.outputTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType, + bufferSizesO, + true, + metaData) + self.weightTransposeBuffer = self:getBuffer( + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType, + bufferSizesW, + true, + metaData) + + if self.inputTransposeBuffer and + self.outputTransposeBuffer and + self.weightTransposeBuffer then + return true + end + + print('Not enough memory for FBFFTGemm buffers, need to fall back') + + -- TODO: From here on, we should failsafe to another SpatialConvolution + self:cleanupBuffers() + + assert(false, 'Out of memory!') +end + +function SpatialConvolutionFBFFTGemm:cleanupBuffers() + parent.cleanupBuffers(self) + + -- Kill local references to global buffers + self.inputTransposeBuffer = nil + self.outputTransposeBuffer = nil + self.weightTransposeBuffer = nil +end + + +function SpatialConvolutionFBFFTGemm:getBufferKey( + BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + -- If no reuse, we hit into the buffers discrimianted by device and + -- BufferType. These buffers are shared with all FFT convolution modules + -- and do not allow reuse for long dependences (i.e. only gradOutput can + -- only be reused from a supporting backward implementation) + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + return parent.getBufferKeyGeneric(self, BufferType) + end + + if not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseWeight) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseInput) and + not self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput) + then + assert(false, "unknown memory reuse policy " .. 
self.memoryReusePolicy) + end + + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the transposed complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTOutputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTInputTransposeBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == + nn.SpatialConvolutionFFT.CuFFTWeightTransposeBufferType) + then + isOutputOfAlgorithm = true + end + md.isOutputOfAlgorithm = isOutputOfAlgorithm + end + + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + md = nil + end + + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res +end diff --git a/fbcunn/SpatialConvolutionFFT.lua b/fbcunn/SpatialConvolutionFFT.lua new file mode 100644 index 0000000..1aa195e --- /dev/null +++ b/fbcunn/SpatialConvolutionFFT.lua @@ -0,0 +1,1012 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
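+-- A minimal usage sketch (illustrative only, kept inside a comment so it is
+-- not executed; the tensor sizes and the choice of subclass are hypothetical).
+-- The concrete subclasses, e.g. nn.SpatialConvolutionFBFFT, behave like
+-- ordinary nn convolution modules:
+--[[
+   local conv = nn.SpatialConvolutionFBFFT(
+      16, 32,    -- nInputPlane, nOutputPlane
+      3, 3,      -- kW, kH
+      1, 1,      -- dW, dH (FFT convolutions only support stride 1)
+      1, 1)      -- padLeft, padUp
+   conv = conv:cuda()
+   local input = torch.CudaTensor(8, 16, 27, 27):normal() -- batch x plane x H x W
+   local output = conv:forward(input)
+   local gradInput = conv:backward(input, output:clone():normal())
+--]]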
+ +-- TODO: Catch errors in general +-- TODO: Catch errors on cufft plan creation and cleanupBuffers +-- TODO: Cleanup buffers and make them independent of tasks +-- TODO: Auto-tuning + +require 'cudnn' +local List = require 'pl.List' +local thrift = require('fb.thrift') + +-- Float assumed, 4 bytes +local sizeOfElem = 4 + +local prec = 0.00002 + +local function isnan(n) return tostring(n) == tostring((-1)^.5) end + +-- Module + +local SpatialConvolutionFFT, parent = + torch.class('nn.SpatialConvolutionFFT', 'nn.Module') + +-- multi-key map indexed by {BufferType, deviceId, [size], [metaData]} +SpatialConvolutionFFT.cudaTensorBuffers = {} +SpatialConvolutionFFT.bufferMap = {} + +-- BufferType +SpatialConvolutionFFT.FFTInputBufferType = + "FFTInputBufferType" +SpatialConvolutionFFT.FFTOutputBufferType = + "FFTOutputBufferType" +SpatialConvolutionFFT.FFTWeightBufferType = + "FFTWeightBufferType" +SpatialConvolutionFFT.CuFFTInputTransposeBufferType = + "CuFFTInputTransposeBufferType" +SpatialConvolutionFFT.CuFFTOutputTransposeBufferType = + "CuFFTOutputTransposeBufferType" +SpatialConvolutionFFT.CuFFTWeightTransposeBufferType = + "CuFFTWeightTransposeBufferType" +SpatialConvolutionFFT.CuFFTPaddedInputBuffer = + "CuFFTPaddedInputBuffer" +SpatialConvolutionFFT.CuFFTPaddedWeightBuffer = + "CuFFTPaddedWeightBuffer" +SpatialConvolutionFFT.CuFFTPaddedOutputBuffer = + "CuFFTPaddedOutputBuffer" + +-- Convenience lists +SpatialConvolutionFFT.cudaRealBufferTypes = List{ + SpatialConvolutionFFT.CuFFTPaddedInputBuffer, + SpatialConvolutionFFT.CuFFTPaddedWeightBuffer, + SpatialConvolutionFFT.CuFFTPaddedOutputBuffer} +SpatialConvolutionFFT.cudaPaddedBufferTypes = List{ + SpatialConvolutionFFT.CuFFTPaddedInputBuffer, + SpatialConvolutionFFT.CuFFTPaddedWeightBuffer, + SpatialConvolutionFFT.CuFFTPaddedOutputBuffer} + +-- Memory reuse policy +SpatialConvolutionFFT.memoryReuseNone = "none" +SpatialConvolutionFFT.memoryReuseInput = "input" +SpatialConvolutionFFT.memoryReuseOutput = "output" +SpatialConvolutionFFT.memoryReuseWeight = "weight" +SpatialConvolutionFFT.memoryReuseAll = "all" + +-- Use to uniquely identify steps of this module and to properly track +-- producer-consumer dependences in the tagspace. 
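+-- Each instance bumps this counter in __init and derives a unique moduleUID
+-- from it; when memory reuse is enabled the moduleUID is folded into the
+-- buffer keys so that different module instances do not clobber each other's
+-- cached FFT results.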
+-- TODO: increment atomically in a multi-threaded environment +SpatialConvolutionFFT.moduleInstance = 0 + +-- Debug helper functions +local function wrapCUDNN_accGradParameters_gradWeight( + module, input, gradOutput, scale) + -- Needed to initialize all cudnn state properly + module:updateOutput(input) + module.gradBias:zero() + module.gradWeight:zero() + module:accGradParameters(input, gradOutput, scale) + return module.gradWeight +end + +local function wrapCUDNN_accGradParameters_gradBias( + module, input, gradOutput, scale) + -- Needed to initialize all cudnn state properly + module:updateOutput(input) + module.gradBias:zero() + module.gradWeight:zero() + module:accGradParameters(input, gradOutput, scale) + return module.gradBias +end + +function SpatialConvolutionFFT:debugVSCUDNN( + pass, module, selfModule, toTest, fun, param1, param2, param3) + local fftRes = toTest:float():clone() + + module.weight = selfModule.weight:clone() + module.bias = selfModule.bias:clone() + module.gradWeight = selfModule.gradWeight:clone() + module.gradBias = selfModule.gradBias:clone() + module.output = selfModule.output:clone() + module.gradInput = selfModule.gradInput:clone() + + local p1 = param1:contiguous() + local p2 + if param2 then + p2 = param2:contiguous() + end + local p3 = param3 + local cudnnRes = fun(module, p1, p2, p3) + + if self.printDebugLevel >= 2 then + print('FFTRES', {fftRes}, 'CUDNN', {cudnnRes}) + end + + local norm = math.sqrt(cudnnRes:float():dot(cudnnRes:float()) + 1e-8) + if isnan(fftRes:sum()) or + cudnnRes:float():dist(fftRes:float()) / norm > prec then + print(torch.type(self), ' error', pass, + cudnnRes:float():dist(fftRes:float()) / norm, prec) + print(torch.type(self), ' error', pass, + fftRes:min(), fftRes:max(), fftRes:mean(), fftRes:sum()) + if self.printDebugLevel >= 2 then + local diff = fftRes:float() - cudnnRes:float() + print('Expected\n', cudnnRes:float()) + print('Actual\n', fftRes:float()) + print('DIFFTENSOR\n', diff) + end + return false + elseif self.printDebugLevel >= 0 then + print(torch.type(self), ' debug vs CUDNN check passes ', + pass, fftRes:min(), fftRes:max(), fftRes:mean(), fftRes:sum()) + end + return true +end + +function SpatialConvolutionFFT:initCudaResources(numHandles, numStreams) + -- Init streams, handles and synchronization groups + cutorch.reserveBlasHandles(numHandles) + cutorch.reserveStreams(numStreams) + local allStreams = {} + for stream = 0, numStreams do + table.insert(allStreams, stream) + end + local allStreamsButDefault = {} + for stream = 1, numStreams do + table.insert(allStreamsButDefault, stream) + end + return allStreams, allStreamsButDefault +end + +function SpatialConvolutionFFT:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + parent.__init(self) + + self.printDebugLevel = -1 -- override manually + self.cudnnDebug = false -- override manually + self.nInputPlane = nInputPlane + self.nOutputPlane = nOutputPlane + self.kW = kW + self.kH = kH + self.dW = dW or 1 + self.dH = dH or 1 + + self.padLeft = padLeft or 0 + self.padUp = padUp or 0 + self.padRight = self.padLeft + self.padDown = self.padUp + + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) + self.bias = torch.Tensor(nOutputPlane) + self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) + self.gradBias = torch.Tensor(nOutputPlane) + + -- Temporary buffers, would be nice to reduce code size here + 
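+  -- Three buffer families per tensor (input / output / weight):
+  --   *Buffer          : frequency-domain (complex) representation
+  --   *TransposeBuffer : same data, transposed so that the per-frequency
+  --                      (batch x plane) complex matrices are contiguous
+  --                      for the CGEMM
+  --   *Padded          : zero-padded real copy, only used by the cufft path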
self.inputBuffer = nil + self.inputTransposeBuffer = nil + self.inputPadded = nil + self.outputBuffer = nil + self.outputTransposeBuffer = nil + self.outputPadded = nil + self.weightBuffer = nil + self.weightTransposeBuffer = nil + self.weightPadded = nil + + -- CuFFT plans, useless for fbfft + self.cufftPlanInputFFT = nil + self.cufftPlanWeightFFT = nil + self.cufftPlanOutputFFT = nil + self.cufftPlanInputIFFT = nil + self.cufftPlanWeightIFFT = nil + self.cufftPlanOutputIFFT = nil + + self:reset() + + self.numCudaStreams = numCudaStreams or 16 + self.numCublasHandles = self.numCudaStreams + self.allStreams = nil + self.allStreamsButDefault = nil + self.allStreams, self.allStreamsButDefault = + self:initCudaResources(self.numCublasHandles, self.numCudaStreams) + + -- List of buffers into multikey that we need to free + self.bufferKeys = List{} + + -- Memory reuse strategy + if not memoryReusePolicy or + memoryReusePolicy == nn.SpatialConvolutionFFT.memoryReuseNone + then + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseNone} + elseif memoryReusePolicy == nn.SpatialConvolutionFFT.memoryReuseAll + then + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseWeight} + elseif torch.type(self.memoryReusePolicy) == 'table' + then + if memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseAll) + then + self.memoryReusePolicy = + List{nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseWeight} + else + self.memoryReusePolicy = memoryReusePolicy + end + else + self.memoryReusePolicy = List{memoryReusePolicy} + end + + -- Use to uniquely identify steps of this module and to properly track + -- producer-consumer dependences in the tagspace. + SpatialConvolutionFFT.moduleInstance = + SpatialConvolutionFFT.moduleInstance + 1 -- TODO: increment atomically + -- Must be a unique name + self.moduleUID = + torch.type(self) .. "--instance=" .. SpatialConvolutionFFT.moduleInstance + -- set once at the beginning of every operation to keep track of the + -- 'timestep' + self.timeSteps = + { updateOutput = 0, updateGradInput = 0, accGradParameters = 0 } + + if self.printDebugLevel >= 0 then + print('Post init ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + -- List of fallback modules, one for each function (updateOutput, + -- updateGradInput, accGradParameters) + -- When they are set, just use the specified fallback for each pass. + self.fallbackModules = nil + self.recoverFromError = true + + -- Check vs reference result + self.cudnnChecks = true + + -- Support for tuned SpatialConvolution.lua + self.success = true + self.autotuningPass = false + self.reportErrors = true +end + +function SpatialConvolutionFFT:reset(stdv) + if stdv then + stdv = stdv * math.sqrt(3) + else + stdv = 1/math.sqrt(self.kW * self.kH * self.nInputPlane) + end + + if nn.oldSeed then + self.weight:apply(function() + return torch.uniform(-stdv, stdv) + end) + self.bias:apply(function() + return torch.uniform(-stdv, stdv) + end) + else + self.weight:uniform(-stdv, stdv) + self.bias:uniform(-stdv, stdv) + end +end + +-- Update output (i.e. 
forward prop) +function SpatialConvolutionFFT:updateOutput(input) + self.timeSteps.updateOutput = self.timeSteps.updateOutput + 1 + self.originalStream = cutorch.getStream() + local res = self:wrapFallback(self.updateOutputFFT, input) + cutorch.setStream(self.originalStream) + return res +end + +function SpatialConvolutionFFT:updateGradInput(input, gradOutput) + self.timeSteps.updateGradInput = self.timeSteps.updateGradInput + 1 + self.originalStream = cutorch.getStream() + local res = self:wrapFallback(self.updateGradInputFFT, input, gradOutput) + cutorch.setStream(self.originalStream) + return res +end + +function SpatialConvolutionFFT:accGradParameters( + input, gradOutput, scale) + self.timeSteps.accGradParameters = self.timeSteps.accGradParameters + 1 + self.originalStream = cutorch.getStream() + self:wrapFallback( + self.accGradParametersFFT, input, gradOutput, scale) + cutorch.setStream(self.originalStream) +end + +-- This function wraps calls to updateOutput, updateGradInput and +-- accGradParameters. If any error is encountered it cleans after itself and +-- calls the corresponding cudnn function. This acts as a failsafe mechanism in +-- case FFT runs out of memory which is not a trivial thing to determine +-- beforehand. The overhead is only paid on the first invocations, all +-- subsequent ones will default to cudnn after the first failure. +function SpatialConvolutionFFT:wrapFallback( + fun, input, gradOutput, scale, reuseList) + + if not self.fallbackModules then + local ok, res = pcall(fun, self, input, gradOutput, scale, reuseList) + if ok then + return res + end + if not self.recoverFromError then + error(res) + end + + if self.reportErrors then + print("Error: " .. res .. " -> fallback to cudnn") + end + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + if self.reportErrors then + print('Using tuned SpatialConvolution: found an error, early exit') + end + return nil + end + end + + -- This path is the fallback path where cudnn is subsituted for our module + -- This is becoming obsolete as everyone should now use + -- tuned SpatialConvolution.lua + if not self.collectedGarbage then + self:cleanupBuffers() + collectgarbage() + collectgarbage() + self.collectedGarbage = true + end + + self.fallbackModules = {} + if not self.fallbackModules[fun] then + cutorch.synchronize() + self.fallbackModules[fun] = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + -- run updateOutput once to initialize + self.fallbackModules[fun]:updateOutput(input) + end + + -- Pass along to cudnn module + self.fallbackModules[fun].weight = self.weight + self.fallbackModules[fun].bias = self.bias + self.fallbackModules[fun].gradWeight = self.gradWeight + self.fallbackModules[fun].gradBias = self.gradBias + local res = nil + if fun == self.updateOutputFFT then + res = self.fallbackModules[fun]:updateOutput(input) + self.output = res + elseif fun == self.updateGradInputFFT then + res = self.fallbackModules[fun]:updateGradInput(input, gradOutput) + self.gradInput = res + elseif fun == self.accGradParametersFFT then + self.fallbackModules[fun]:accGradParameters(input, gradOutput, scale) + self.gradWeight = self.fallbackModules[fun].gradWeight + self.gradBias = self.fallbackModules[fun].gradBias + else + error('Unknown call ' .. 
fun) + end + return res +end + +function SpatialConvolutionFFT:getNormalizationFactor(commonSizes, input) + if self.fftImplementation == 'fbfft' then + return commonSizes[3] * commonSizes[4] + elseif self.fftImplementation then + return (input:size(3) + self.padUp + self.padDown) * + (input:size(4) + self.padLeft + self.padRight) + end + error("Unknown fftImpl: " .. self.fftImplementation) +end + +function SpatialConvolutionFFT:backward(input, gradOutput, scale) + self.originalStream = cutorch.getStream() + scale = scale or 1 + self:updateGradInput(input, gradOutput) + self:wrapFallback(self.accGradParametersFFT, + input, + gradOutput, + scale, + List{self.outputTransposeBuffer}) + cutorch.setStream(self.originalStream) + return self.gradInput +end + +function SpatialConvolutionFFT:updateOutputFFTImpl() + assert(false, 'This is an abstract class, must use a derived implementation') +end + +function SpatialConvolutionFFT:updateGradInputFFTImpl() + assert(false, 'This is an abstract class, must use a derived implementation') +end + +function SpatialConvolutionFFT:accGradParametersFFTImpl() + assert(false, 'This is an abstract class, must use a derived implementation') +end + +function SpatialConvolutionFFT:updateOutputFFT(input, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + local nBatches = input:size(1) + + -- Allocate the output for this module, only once + if not self.output or self.output:nElement() == 0 then + self.output = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nOutputPlane, + input:size(3) + self.padUp + self.padDown - self.kH + 1, + input:size(4) + self.padLeft + self.padRight - self.kW + 1})) + end + + if self.printDebugLevel >= 2 then + print('PAD ', self.padUp, 'x', self.padLeft) + print('ORIGINAL INPUT', {input}) + print('ORIGINAL WEIGHT', {self.weight}) + self.output:zero() + print('ORIGINAL OUTPUT', {self.output}) + end + + -- Call the proper Impl + self:updateOutputFFTImpl(input, reuseList) + + if self.printDebugLevel >= 0 then + print('Post updateOutput ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + if self.printDebugLevel >= 2 then + print('FINAL INPUT', {input}) + print('COMPLEX INPUT POST FFT', {self.inputBuffer}) + print('COMPLEX INPUT POST TRANSPOSE', {self.inputTransposeBuffer}) + print('ORIGINAL WEIGHT', {self.weight}) + print('COMPLEX WEIGHT POST FFT', {self.weightBuffer}) + print('COMPLEX WEIGHT POST TRANSPOSE', {self.weightTransposeBuffer}) + print('OUTPUT CPLX TRANSPOSE POST MM', {self.outputTransposeBuffer}) + print('OUTPUT COMPLEX POST TRANSPOSE', {self.outputBuffer}) + print('OUTPUT REAL', {self.output}) + end + + if self.cudnnDebug then + local sp = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("updateOutput", + sp, + self, + self.output, + sp.updateOutput, + input) + sp = nil + collectgarbage() + collectgarbage() + end + + return self.output +end + + +-- Update input gradients +function SpatialConvolutionFFT:updateGradInputFFT(input, gradOutput, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + local nBatches = input:size(1) + -- Allocate the gradInput for this module, only once + if not self.gradInput or self.gradInput:nElement() == 0 then + self.gradInput = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nInputPlane, + input:size(3), + input:size(4)})) + end + + if 
self.printDebugLevel >= 2 then + print('PAD ', self.padUp, 'x', self.padLeft) + print('ORIGINAL gradOutput', gradOutput) + print('ORIGINAL WEIGHT', self.weight) + print('ORIGINAL GRADINPUT', self.gradInput) + end + + -- Call the proper Impl + self:updateGradInputFFTImpl(input, gradOutput, reuseList) + + if self.printDebugLevel >= 0 then + print('Post updateGradInput ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + if self.printDebugLevel >= 2 then + print('COMPLEX WEIGHT POST FFT', self.weightBuffer) + print('COMPLEX WEIGHT POST TRANSPOSE', self.weightTransposeBuffer) + print('COMPLEX GRADOUTPUT POST FFT', self.outputBuffer) + print('COMPLEX GRADOUTPUT POST TRANSPOSE', self.outputTransposeBuffer) + print('GRADINPUT COMPLEX POST MM', self.inputTransposeBuffer) + print('GRADINPUT COMPLEX PRE IFFT', self.inputBuffer) + print('REAL GRADINPUT', self.gradInput) + print('REAL GRADINPUT PADDED (cufft only)', self.inputPadded) + end + + if self.cudnnDebug then + local sp = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("updateGradInput", + sp, + self, + self.gradInput, + sp.updateGradInput, + input, + gradOutput) + sp = nil + collectgarbage() + collectgarbage() + end + + return self.gradInput +end + + +-- Accumulate weight gradients +function SpatialConvolutionFFT:accGradParametersFFT( + input, gradOutput, scale, reuseList) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + if not self.gradWeight or self.gradWeight:nElement() == 0 then + assert(false, "GradWeight must already be allocated at module creation") + end + + if self.printDebugLevel >= 2 then + print('PAD ', self.padUp, 'x', self.padLeft) + print('ORIGINAL INPUT', {input}) + print('ORIGINAL OUTPUT', {gradOutput}) + print('ORIGINAL WEIGHT', {self.gradWeight}) + end + + -- Call the proper Impl + self:accGradParametersFFTImpl(input, gradOutput, scale, reuseList) + + if self.printDebugLevel >= 0 then + print('Post accGradParameters ', self.moduleUID, ' memory usage: ', + cutorch.getMemoryUsage()) + end + + if self.printDebugLevel >= 2 then + print('OUTPUT COMPLEX POST TRANSPOSE', {self.outputBuffer}) + print('OUTPUT CPLX TRANSPOSE POST MM', {self.outputTransposeBuffer}) + print('COMPLEX INPUT POST TRANSPOSE', {self.inputTransposeBuffer}) + print('COMPLEX INPUT POST FFT', {self.inputBuffer}) + print('COMPLEX WEIGHT POST FFT', {self.weightBuffer}) + print('COMPLEX WEIGHT POST TRANSPOSE', {self.weightTransposeBuffer}) + print('REAL GRADWEIGHT', {self.weightPadded}) + print('REAL GRADWEIGHT', {self.gradWeight}) + print("SCALE: " .. scale) + end + + if self.cudnnDebug then + local saveBias = self.gradBias:float():clone() + local sp = cudnn.SpatialConvolution(self.nInputPlane, + self.nOutputPlane, + self.kW, + self.kH, + self.dW, + self.dH, + self.padLeft, + self.padUp):cuda() + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("accGradParameters_gradWeight", + sp, + self, + self.gradWeight, + wrapCUDNN_accGradParameters_gradWeight, + input, + gradOutput, + scale) + + self.cudnnChecks = self.cudnnChecks and + self:debugVSCUDNN("accGradParameters_gradBias", + sp, + self, + saveBias, + wrapCUDNN_accGradParameters_gradBias, + input, + gradOutput, + scale) + sp = nil + collectgarbage() + collectgarbage() + end +end + + +-- Buffer creation and reuse given a size and a pass. 
+-- Different passes use different tensors as the 'output of the pass'. +-- SpatialConvolutionFFT.ForwardFFTPass -> output +-- SpatialConvolutionFFT.BackwardFFTPass -> input +-- SpatialConvolutionFFT.AccGradientFFTPass -> weight +-- The buffers corresponding to the tensors that is the 'output of the pass' +-- must be properly transposed in order for the CGemm call to be consistent. +-- This is a simple metadata transposition, might as well construct properly. +-- +-- This function contains the least common denominator of buffers needed for +-- all implementations. + +SpatialConvolutionFFT.ForwardFFTPass = 1 +SpatialConvolutionFFT.BackwardFFTPass = 2 +SpatialConvolutionFFT.AccGradientFFTPass = 3 + +-- Meta-data is user specific metadata which influences the lifetime of the +-- buffers. Atm this is SpatialConvolutionFFTTiled-specific but if the network +-- is not too large, especially with parallel containers, this is a good +-- opportunity to reuse FFT computations. +function SpatialConvolutionFFT:prepareBuffers(commonSize, pass, metaData) + assert(commonSize and self.fftImplementation) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + local bufferSizesO = torch.LongStorage({ + commonSize[1], self.nOutputPlane, commonSize[3], commonSize[4]}) + local bufferSizesW = torch.LongStorage({ + self.nOutputPlane, self.nInputPlane, commonSize[3], commonSize[4]}) + + self.inputBuffer = + self:getBuffer( + SpatialConvolutionFFT.FFTInputBufferType, -- buffer type + commonSize, -- buffer size + false, -- transposeLayout + metaData) -- SpatialConvolutionFFTTiled-specific + self.outputBuffer = + self:getBuffer( + SpatialConvolutionFFT.FFTOutputBufferType, + bufferSizesO, + false, + metaData) -- SpatialConvolutionFFTTiled-specific + self.weightBuffer = + self:getBuffer( + SpatialConvolutionFFT.FFTWeightBufferType, + bufferSizesW, + false, + metaData) -- SpatialConvolutionFFTTiled-specific + + if self.inputBuffer and self.outputBuffer and self.weightBuffer then + return true + end + + -- TODO: From here on, we should failsafe to another SpatialConvolution + self:cleanupBuffers() + + error('Not enough memory for FFT buffers, need to fall back') +end + + +-- Returns nil if it cannot allocate a new buffer (for error recovery cases) +function SpatialConvolutionFFT:getBuffer( + BufferType, tensorSizes, transposedLayout, metaData) + assert(torch.type(metaData) == 'table', torch.type(metaData)) + + local d1 = tensorSizes[1] + local d2 = tensorSizes[2] + local d3 = tensorSizes[3] + local d4 = tensorSizes[4] + + local numElements = 0 + local sizes = torch.LongStorage({0}) + local isRealBuffer = SpatialConvolutionFFT.cudaRealBufferTypes:contains( + BufferType) + local isComplexBuffer = not isRealBuffer + + if isComplexBuffer then + -- fbfft and cufft have different layouts + assert(self.fftImplementation) + if self.fftImplementation == 'fbfft' then + numElements = d1 * d2 * (d3 / 2 + 1) * d4 * 2 + if transposedLayout then + -- The buffers corresponding to the tensors that is the + -- 'output of the pass' must be properly transposed in order for the + -- CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct + -- properly. 
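+        -- fbfft keeps the hermitian-symmetric half along d3 (hence d3/2 + 1),
+        -- while cufft (below) halves the last dimension d4. The transposed
+        -- layout moves the two frequency dimensions to the front so that each
+        -- frequency bin exposes a contiguous (d1 x d2) complex matrix to the
+        -- batched CGEMM.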
+ sizes = torch.LongStorage({d3 / 2 + 1, d4, d1, d2, 2}) + else + sizes = torch.LongStorage({d1, d2, d3 / 2 + 1, d4, 2}) + end + else + numElements = d1 * d2 * d3 * (d4 / 2 + 1) * 2 + if transposedLayout then + -- The buffers corresponding to the tensors that is the + -- 'output of the pass' must be properly transposed in order for the + -- CGemm call to be consistent. + -- This is a simple metadata transposition, might as well construct + -- properly. + sizes = torch.LongStorage({d3, d4 / 2 + 1, d1, d2, 2}) + else + sizes = torch.LongStorage({d1, d2, d3, d4 / 2 + 1, 2}) + end + end + else + -- Real buffers, for padding purposes in first approx + if self.fftImplementation == 'cufft' and + SpatialConvolutionFFT.cudaPaddedBufferTypes:contains(BufferType) then + numElements = d1 * d2 * d3 * d4 + -- TODO: potentially wasteful if original tensor is already of + -- tensorSizes. Could clean this up but requires knowing the original + -- tensor as a model for which we pad. + sizes = torch.LongStorage({d1, d2, d3, d4}) + end + -- else allocate an empty tensor, nil is reserved for errors + end + + assert(sizes and #sizes > 0) + + -- Conservative max buffer size, always needed at least by fbfft + -- Handle memory bloat by tiled convolutions + inplace fft + local bufferKey = self:getBufferKey(BufferType, sizes, metaData) + if SpatialConvolutionFFT.bufferMap[bufferKey] == nil then + local free_bytes = cutorch.getMemoryUsage() + if numElements * sizeOfElem > free_bytes then + return nil + end + + local before = cutorch.getMemoryUsage() + SpatialConvolutionFFT.bufferMap[bufferKey] = torch.CudaTensor(sizes) + local after = cutorch.getMemoryUsage() + if self.printDebugLevel >= 1 then + print('FFT Buffer Create Allocated ', before - after) + end + else + -- Storage already exists but may need resizing. + -- If resizing means expanding, make sure we have enough space + local t = SpatialConvolutionFFT.bufferMap[bufferKey] + if numElements > t:nElement() then + -- Don't call cuda API unless really needed + local free_bytes = cutorch.getMemoryUsage() + -- Resize is not in place, need to hold both in memory at some point + -- The subsequent resize cannot fail in cuda land or we're hosed and + -- cudaGetLastError will be 2. + if (numElements + t:nElement()) * sizeOfElem > free_bytes then + assert(false, 'Out of memory: cannot hold both tensors for resize') + end + local before = cutorch.getMemoryUsage() + t:resize(sizes) + local after = cutorch.getMemoryUsage() + if self.printDebugLevel >= 1 then + print('FFT Buffer Resize Allocated ', before - after) + end + else + -- Still need to resize to make the sizes / strides as expected but + -- this does cost extra memory + t:resize(sizes) + end + end + + local t = SpatialConvolutionFFT.bufferMap[bufferKey] + assert(t, 'Tensor buffer improperly set') + + for d = 1, t:nDimension() do + if (sizes[d] ~= t:size(d)) then + print("Put / get buffer dimension mismatch! d = ", d, " expected = ", + sizes, " actual = ", {t}) + assert(sizes[d] == t:size(d)) + end + end + + return t +end + +function SpatialConvolutionFFT:freeBuffer(bufferKey) + local tensor = SpatialConvolutionFFT.bufferMap[bufferKey] + if tensor then + SpatialConvolutionFFT.bufferMap[bufferKey] = nil + end +end + +-- Returns a string key, not hashed atm. +-- For instance, in SpatialConvolutionFFTTiled, this helps the creation of +-- different buffers for various tile tensorSize, tileSize and tileIndices. +-- This is important in order to reuse frequency domain representation +-- of tiled pieces of the tensors. 
+-- This allows trading off reuse for memory consumption. +-- +-- In FBFFT and CuFFT however, memory consumption can grow quickly so one should +-- only use a single buffer per BufferType. +-- If we had some user information that the buffers remain small enough, we +-- could have per module persistent buffers that would allow reuse. +function SpatialConvolutionFFT:getBufferKey(BufferType, bufferSizes, metaData) + assert(false, "getBufferKey controls buffers lifetime: must be overridden") +end + + +-- This implementation reuses buffers and keeps memory consumption minimal +-- (but this can still be a lot). +-- In particular, we only discriminate buffers by deviceId and type of buffer +-- by default. +-- This means we only have 1 copy of each type of buffer per device. +-- The same buffers are reused across any call of any module so the only +-- possible reuse is the reuse of gradOutput in the backward function. +-- This requires that backward be properly implemented in container modules +-- to allow such reuse. +-- For more advanced reuses, a proper getBufferKey function needs to be +-- implemented, tradeoffs will be made between reuse and memory consumption. +function SpatialConvolutionFFT:getBufferKeyGeneric(BufferType) + local bufferKey = { + SpatialConvolutionFFT.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + } + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + return res +end + +function SpatialConvolutionFFT:cleanupBuffers() + -- release all local result tensors and all buffers + self.output = nil + self.gradInput = nil + + -- Kill local references to global buffers + self.inputBuffer = nil + self.outputBuffer = nil + self.weightBuffer = nil + + -- Free all buffers + local len = self.bufferKeys:len() + for i = 1, len do + self:freeBuffer(self.bufferKeys:pop()) + end + + self.fallbackModules = {} + SpatialConvolutionFFT.cudaTensorBuffers = {} +end + + +-- Type: input/gradInput, output/gradOutput or weight/gradWeight +-- Could lookup bit operations in lua and do in 1 line, just use a loop atm +local function nextPowerOf2(val) + for i = 1, 10 do + if (2 ^ i) >= val then + return (2 ^ i) + end + end + assert(false, 'Too large a convolution dimensions: ', val) +end + +function SpatialConvolutionFFT:prepareCuFFTSizeAndBuffers( + i, w, o, metaData, pass) + local commonSize = i:size() + -- If we use cufft we should use rectangular regions where the width is a + -- power of 2. This is usually good enough approximation between FFT + -- efficiency and avoiding spurious work. 
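+  -- Worked example (hypothetical sizes): a 27x27 input with 3x3 kernels and
+  -- 1 pixel of padding on each side gives an interpolation height of
+  -- max(27 + 1 + 1, 3, 27) = 29, while the width is rounded up to
+  -- nextPowerOf2(29) = 32, so the cufft buffers are 29 x 32.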
+ commonSize[3] = + math.max(i:size(3) + self.padUp + self.padDown, + w:size(3), + o:size(3)) + commonSize[4] = + nextPowerOf2(math.max(i:size(4) + self.padLeft + self.padRight, + w:size(4), + o:size(4))) + self:prepareBuffers(commonSize, pass, metaData) + + assert(self.fftImplementation == "cufft", + "CuFFT convolution module expected!") + assert(self.inputPadded and self.weightPadded and self.outputPadded, + "CuFFT requires padded input, weight and output") + + if o == self.output then + self.inputPadded:zero() + self.weightPadded:zero() + elseif w == self.weight then + self.weightPadded:zero() + self.outputPadded:zero() + else + self.inputPadded:zero() + self.outputPadded:zero() + end + + return commonSize -- needed for normalization factor +end + +function SpatialConvolutionFFT:prepareFBFFTGemmSizeAndBuffers( + i, w, o, metaData, pass) + local commonSize = i:size() + -- If we use cufft we should use rectangular regions where the width is a + -- power of 2. This is usually good enough approximation between FFT + -- efficiency and avoiding spurious work. + commonSize[3] = + nextPowerOf2(math.max(i:size(3) + self.padUp + self.padDown, + i:size(4) + self.padLeft + self.padRight, + w:size(3), + w:size(4), + o:size(3), + o:size(4))) + commonSize[4] = commonSize[3] + self:prepareBuffers(commonSize, pass, metaData) + + assert(self.fftImplementation == "fbfft", + "FBFFT convolution module expected!") + assert(not self.inputPadded and not self.weightPadded and + not self.outputPadded, + "CuFFT requires padded input, weight and output") + + return commonSize -- needed for normalization factor +end + +local NO_TRANSPOSE = nil + +-- Makes or reuses square FFT buffers up to the next power of 2 +function SpatialConvolutionFFT:prepareFBFFTSizeAndBuffers(i, w, o, metaData) + local commonSize = i:size() + commonSize[3] = + nextPowerOf2(math.max(i:size(3) + self.padUp + self.padDown, + i:size(4) + self.padLeft + self.padRight, + w:size(3), + w:size(4), + o:size(3), + o:size(4))) + commonSize[4] = commonSize[3] + self:prepareBuffers(commonSize, NO_TRANSPOSE, metaData) + assert(self.fftImplementation == "fbfft", + "FBFFT convolution module expected!") + assert(not self.inputPadded and not self.weightPadded and + not self.outputPadded, + "FBFFT does not expect padded input, weight and output") + return commonSize -- needed for normalization factor +end + +function SpatialConvolutionFFT:setReuseInputs(val) + assert(type(val) == 'boolean') + self:_setReuse(val, nn.SpatialConvolutionFFT.memoryReuseInput) +end + +function SpatialConvolutionFFT:setReuseOutputs(val) + assert(type(val) == 'boolean') + self:_setReuse(val, nn.SpatialConvolutionFFT.memoryReuseOutput) +end + +function SpatialConvolutionFFT:setReuseWeights(val) + assert(type(val) == 'boolean') + self:_setReuse(val, nn.SpatialConvolutionFFT.memoryReuseWeight) +end + +function SpatialConvolutionFFT:_setReuse(val, toReuse) + assert(type(val) == 'boolean') + assert(toReuse == nn.SpatialConvolutionFFT.memoryReuseInput or + toReuse == nn.SpatialConvolutionFFT.memoryReuseOutput or + toReuse == nn.SpatialConvolutionFFT.memoryReuseWeight, + toReuse) + + if val then + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) then + -- Override + self.memoryReusePolicy = List{toReuse} + elseif self.memoryReusePolicy:contains(toReuse) then + -- Do nothing + return + else + self.memoryReusePolicy:append(toReuse) + end + else + if self.memoryReusePolicy:contains(toReuse) then + self.memoryReusePolicy:remove_value(toReuse) + -- Set at least 
"none" + if self.memoryReusePolicy:len() == 0 then + self.memoryReusePolicy:append( + nn.SpatialConvolutionFFT.memoryReuseNone) + end + else + -- Do nothing + return + end + end +end diff --git a/fbcunn/SpatialConvolutionFFTTiled.lua b/fbcunn/SpatialConvolutionFFTTiled.lua new file mode 100644 index 0000000..bf69280 --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiled.lua @@ -0,0 +1,924 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. + +require 'cudnn' +local List = require 'pl.List' +local thrift = require('fb.thrift') + +local function errorIf(cond, msg) + if cond then + error(msg) + end +end + +local function errorIfNot(cond, msg) + errorIf(not cond, msg) +end + +--[[ + Move to Tensor.lua + + This helper funtion returns a pl.List of 2-D tiled views into the tensor + passed in input, corresponding to tiling by the specified tiles sizes, with + specfied step sizes and implicit padding sizes. + Tiling is performed on the innermost 2 dimensions so tensor:nDimension must + be >= 2. + + -- TileDescriptor "declaration" + local TiledTensorDescriptor = {} + -- Original tile sizes asked for for proper Fourier basis decomposition + TiledTensorDescriptor.tileSizeH = tileSizeH + TiledTensorDescriptor.tileSizeW = tileSizeW + -- Index of the tile in tile space + TiledTensorDescriptor.tileIndexH = tileIndexH + TiledTensorDescriptor.tileIndexW = tileIndexW + -- Actual tensor size, full tiles have tensorSize == tileSize + TiledTensorDescriptor.tensorSizeH = tensorSizeH + TiledTensorDescriptor.tensorSizeW = tensorSizeW + -- Up and Left padding for up and left boundary tile. + -- Down and Right are obtained by implicit zero padding up to + -- original tile size + TiledTensorDescriptor.padUp = padUp + TiledTensorDescriptor.padLeft = padLeft + -- The view in the original tensor + TiledTensorDescriptor.tensor = torch.Tensor() + + By default tiling returns all the subtensors, including partial tensors on + the boundaries, that have at least one element when traversed by + tileSizeH x tileSizeW with stride stepH x stepW. + When performing convolutions, tiling semantics may not be sufficient. + For consistency, the tiling of the tensor written into, informs how many + tiles we should obtain from the tensor read from; this information is + conveyed by numTilesH x numTilesW. + The consistency check is that the tiling of the tensor read from, must always + cover the full read tensor. +--]] +local function TiledView2D(tensor, + tileSizeH, + tileSizeW, + stepH, + stepW, + padLeft, + padUp, + padRight, + padDown, + numTilesH, + numTilesW) + -- Initialization + local stepH = stepH or tileSizeH + local stepW = stepW or tileSizeW + local padUp = padUp or 0 + local padLeft = padLeft or 0 + local padDown = padDown or 0 + local padRight = padRight or 0 + local dimIndexH = tensor:nDimension() - 1 + local dimIndexW = tensor:nDimension() + local numTilesH = numTilesH or 1e100 -- maxint would be nice + local numTilesW = numTilesW or 1e100 -- maxint would be nice + + local printDebugLevel = -1 + if printDebugLevel >= 1 then + print("Tile ", tensor:size(), " by ", tileSizeH, "x", tileSizeW, + " with step ", stepH, "x", stepW, " and pad ", + padUp, "x", padLeft, "x", padDown, "x", padRight) + end + + -- Input validation, reject padding larger than tile size or kernel size + assert(tensor:nDimension() >= 2) + assert(tileSizeH and tileSizeW, 'both tile sizes must be specified') + assert(padUp >= 0 and padUp < tileSizeH, "padUp = " .. padUp .. + " >= (incompatible with) tileSizeH = " .. 
tileSizeH) + assert(padLeft >= 0 and padLeft < tileSizeW, "padLeft = " .. padLeft .. + " >= (incompatible with) with tileSizeW = " .. tileSizeW) + assert(padDown >= 0 and padDown < tileSizeH, "padDown = " .. padDown .. + " >= (incompatible with) with tileSizeH = " .. tileSizeH) + assert(padRight >= 0 and padRight < tileSizeW, "padRight = " .. padRight .. + " >= (incompatible with) with tileSizeW = " .. tileSizeW) + assert(tileSizeW > 0 and tileSizeH > 0, "") + assert(stepH > 0 and stepW > 0, + "Step sizes " .. stepH .. " x " .. stepW .. " both expected > 1. " .. + "Otherwise, tileSize <= kernel size which should not occur") + assert(padUp >= 0 and padDown >= 0 and padLeft >= 0 and padRight >= 0) + errorIfNot(tileSizeH < tensor:size(dimIndexH), + "Tiling must be smaller than tensor size !") + errorIfNot(tileSizeW < tensor:size(dimIndexW), + "Tiling must be smaller than tensor size !") + assert(#tensor:size() == dimIndexW and #tensor:stride() == dimIndexW) + + -- TileDescriptor generating loop + local maxTileIndexH = 0 + local maxTileIndexW = 0 + local tensors = List{} + local tensorStrideH = tensor:stride(dimIndexH) + local tensorStrideW = tensor:stride(dimIndexW) + local tileIndexH = 0 + for y = -padUp + 1, tensor:size(dimIndexH), stepH do + + -- Continue would be nice here to avoid level of nesting ! + if tileIndexH < numTilesH then + + local tileIndexW = 0 + for x = -padLeft + 1, tensor:size(dimIndexW), stepW do + + -- Continue would be nice here to avoid level of nesting ! + if tileIndexW < numTilesW then + + -- Descriptor for each tiled tensor + local TiledTensorDescriptor = {} + + -- Handle special boundary case for partial tile along y + local tensorSizeH = 0 + if y <= 0 then + tensorSizeH = tileSizeH + (y-1) + TiledTensorDescriptor.padUp = -(y-1) -- padUp + else + -- If we generate a tile, make sure its size does not overflow + tensorSizeH = math.max( + 1, math.min(tileSizeH, tensor:size(dimIndexH) - (y-1))) + TiledTensorDescriptor.padUp = 0 + end + TiledTensorDescriptor.tensorSizeH = tensorSizeH + TiledTensorDescriptor.tileIndexH = tileIndexH + + -- Handle special boundary case for partial tile along x + local tensorSizeW = 0 + if x <= 0 then + tensorSizeW = tileSizeW + (x-1) + TiledTensorDescriptor.padLeft = -(x-1) -- padLeft + else + -- If we generate a tile, make sure its size does not overflow + tensorSizeW = math.max( + 1, math.min(tileSizeW, tensor:size(dimIndexW) - (x-1))) + TiledTensorDescriptor.padLeft = 0 + end + TiledTensorDescriptor.tensorSizeW = tensorSizeW + TiledTensorDescriptor.tileIndexW = tileIndexW + + -- Allocate tensor with partial or full size and full stride + -- for proper wraparound + local sizes = + torch.LongStorage(tensor:nDimension()):copy(tensor:size()) + sizes[#sizes - 1] = tensorSizeH + sizes[#sizes] = tensorSizeW + local tensorTiled = torch.Tensor():typeAs(tensor) + tensorTiled:set( + tensor:storage(), + tensor:storageOffset() + + math.max((y-1), 0) * tensorStrideH + + math.max((x-1), 0) * tensorStrideW, + sizes, + tensor:stride()) + + TiledTensorDescriptor.tileSizeH = tileSizeH + TiledTensorDescriptor.tileSizeW = tileSizeW + TiledTensorDescriptor.tensor = tensorTiled + + -- Handling partial til on the bottom and right sides + -- Important to get interpolation right in frequency domain + tensors:append(TiledTensorDescriptor) + + if printDebugLevel >= 1 then + print('y = ' .. y .. ' x = ' .. x .. + ' tile index = ' .. tileIndexH .. ' x '.. 
tileIndexW) + print(TiledTensorDescriptor) + if printDebugLevel >= 2 then + print(TiledTensorDescriptor.tensor) + end + end + + assert(tensor:size(dimIndexH) + padUp + padDown - + tileIndexH * stepH > 0, "Error tileIndexH = " .. + tileIndexH .. " stepH = " .. stepH) + assert(tensor:size(dimIndexW) + padLeft + padRight - + tileIndexW * stepW > 0, "Error tileIndexW = " .. + tileIndexW .. " stepW = " .. stepW) + assert(tensorSizeH > 0, 'tensorSizeH = ' .. tensorSizeH) + assert(tensorSizeW > 0, 'tensorSizeW = ' .. tensorSizeW) + assert(y <= tensor:size(dimIndexH), 'Overflow y = ' .. y .. + ' > size = ' .. tensor:size(dimIndexH)) + assert(x <= tensor:size(dimIndexW), 'Overflow x = ' .. x .. + ' > size = ' .. tensor:size(dimIndexW)) + + + if maxTileIndexW < tileIndexW then + maxTileIndexW = tileIndexW + end + tileIndexW = tileIndexW + 1 + else -- if tileIndexW < numTilesW + assert(x + tileSizeW - stepW >= tensor:size(dimIndexW)) + end -- if tileIndexW < numTilesW + end -- for x + + if maxTileIndexH < tileIndexH then + maxTileIndexH = tileIndexH + end + tileIndexH = tileIndexH + 1 + else -- if not tileIndexH < numTilesH + assert(y + tileSizeH - stepH >= tensor:size(dimIndexH)) + end -- if tileIndexH < numTilesH + end -- for y + + return tensors, maxTileIndexH, maxTileIndexW +end + +-- Not really a string but I want to print this structure +local function TiledTensorDescriptorToString(TiledTensorDescriptor) + local toPrint = {} + toPrint.td = TiledTensorDescriptor + toPrint.tensorAddress = TiledTensorDescriptor.tensor:cdata() + toPrint.storageAddress = TiledTensorDescriptor.tensor:storage():cdata() + toPrint.storageOffset = TiledTensorDescriptor.tensor:storageOffset() + return toPrint +end + +local function _printDebugAndAssert( + debugLevel, index, inputTensorList, outputTensorList) + if debugLevel == 1 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index])) + elseif debugLevel >= 2 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + inputTensorList[index].tensor, + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index]), + outputTensorList[index].tensor) + end + + -- Assert tiles are traversed in the same order otherwise + -- you can forget about correctness + assert(outputTensorList[index].tileIndexH == + inputTensorList[index].tileIndexH) + assert(outputTensorList[index].tileIndexW == + inputTensorList[index].tileIndexW) +end + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiled, parent = + torch.class('nn.SpatialConvolutionFFTTiled', 'nn.SpatialConvolutionFBFFT') + +function SpatialConvolutionFFTTiled:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + + assert(torch.type(nInputPlane) == 'number') + assert(torch.type(nOutputPlane) == 'number') + assert(torch.type(kW) == 'number') + assert(torch.type(kH) == 'number') + assert(torch.type(dW) == 'number') + assert(torch.type(dH) == 'number') + assert(padLeft == nil or torch.type(padLeft) == 'number') + assert(padUp == nil or torch.type(padUp) == 'number') + + assert(tileSizeW == nil or torch.type(tileSizeW) == 'number') + 
assert(tileSizeH == nil or torch.type(tileSizeH) == 'number') + assert(memoryReusePolicy == nil or + torch.type(memoryReusePolicy) == 'string' or + torch.type(memoryReusePolicy) == 'table') + assert(numCudaStreams == nil or torch.type(numCudaStreams) == 'number') + + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + memoryReusePolicy, + numCudaStreams) + + -- Sanity assertions + assert(self.printDebugLevel == -1) + assert(self.nInputPlane == nInputPlane) + assert(self.nOutputPlane == nOutputPlane) + assert(self.kW == kW) + assert(self.kH == kH) + assert(self.dH == 1, "fft only supports stride-1 convolutions atm") + assert(self.dW == 1, "fft only supports stride-1 convolutions atm") + + assert(self.padLeft == padLeft or self.padLeft == 0) + assert(self.padUp == padUp or self.padUp == 0) + assert(self.padRight == self.padLeft) + assert(self.padDown == self.padUp) + + assert(self.fftImplementation == 'fbfft') + + assert(self.padUp < self.kH and self.padDown < self.kH and + self.padLeft < self.kW and self.padRight < self.kW, + "Padding must be smaller than kernel") + + assert(self.weight:size(1) == nOutputPlane and + self.weight:size(2) == nInputPlane and + self.weight:size(3) == kH and + self.weight:size(4) == kW) + assert(self.bias:size(1) == nOutputPlane) + assert(self.gradWeight:size(1) == nOutputPlane and + self.gradWeight:size(2) == nInputPlane and + self.gradWeight:size(3) == kH and + self.gradWeight:size(4) == kW) + assert(self.gradBias:size(1) == nOutputPlane) + + + -- Temporary buffers, would be nice to reduce code size here + assert(not self.inputBuffer) + assert(not self.inputTransposeBuffer) + assert(not self.inputPadded) + assert(not self.outputBuffer) + assert(not self.outputTransposeBuffer) + assert(not self.outputPadded) + assert(not self.weightBuffer) + assert(not self.weightTransposeBuffer) + assert(not self.weightPadded) + + -- CuFFT plans, useless for fbfft + assert(not self.cufftPlanInputFFT) + assert(not self.cufftPlanWeightFFT) + assert(not self.cufftPlanOutputFFT) + assert(not self.cufftPlanInputIFFT) + assert(not self.cufftPlanWeightIFFT) + assert(not self.cufftPlanOutputIFFT) + + self:reset() + + -- Tiling metadata + self.tileSizeH = tileSizeH or 16 + self.tileSizeW = tileSizeW or 16 + -- updateOutput + self.inputTensorList = nil + self.outputTensorList = nil + -- updateGradInput + self.gradInputTensorList = nil + self.gradOutputTensorList = nil + -- accGradParameters + self.inputTensorList2 = nil + self.gradOutputTensorList2 = nil +end + + +local function printDebugAndAssert( + debugLevel, index, inputTensorList, outputTensorList) + if debugLevel == 1 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index])) + elseif debugLevel >= 2 then + print("Convolve input", index, " / ", + outputTensorList:len(), " :\n", + TiledTensorDescriptorToString(inputTensorList[index]), + inputTensorList[index].tensor, + '\n Convolve output\n', + TiledTensorDescriptorToString(outputTensorList[index]), + outputTensorList[index].tensor) + end + + -- Assert tiles are traversed in the same order otherwise + -- you can forget about correctness + assert(outputTensorList[index].tileIndexH == + inputTensorList[index].tileIndexH) + assert(outputTensorList[index].tileIndexW == + inputTensorList[index].tileIndexW) +end + + +function SpatialConvolutionFFTTiled:pushPadding(index, tensorList) + 
local savePadUp, savePadLeft, savePadDown, savePadRight + savePadUp, self.padUp = self.padUp, tensorList[index].padUp + savePadLeft, self.padLeft = self.padLeft, tensorList[index].padLeft + -- Complete padding up to tile size so that interpolation + -- occurs in the right Fourier basis + savePadDown, self.padDown = + self.padDown, math.max( + 0, tensorList[index].tileSizeH - + (self.padUp + tensorList[index].tensorSizeH)) + savePadRight, self.padRight = + self.padRight, math.max( + 0, tensorList[index].tileSizeW - + (self.padLeft + tensorList[index].tensorSizeW)) + + return savePadUp, savePadLeft, savePadDown, savePadRight +end + + +function SpatialConvolutionFFTTiled:pushPaddingWithCircularSymmetry( + index) + local savePadUp, savePadLeft, savePadDown, savePadRight + -- Fun with padding and circular symmetry in Fourier domain + -- This acts upon shifting the IFFT result into the proper position + -- into gradInput + savePadUp, self.padUp = + self.padUp, self.kH - 1 + self.gradInputTensorList[index].padUp - + self.gradOutputTensorList[index].padUp + savePadLeft, self.padLeft = + self.padLeft, self.kW - 1 + self.gradInputTensorList[index].padLeft - + self.gradOutputTensorList[index].padLeft + -- Complete padding up to tile size so that interpolation + -- occurs in the right Fourier basis. + -- The invariant is that the size of gradOutput and gradInput should + -- always be padded up to the tiling size. In the particular case + -- of gradInput, we must additionally consider input padding. + + assert(self.gradOutputTensorList[index].tensorSizeH) + assert(self.gradInputTensorList[index].tensorSizeH) + + savePadDown, self.padDown = + self.padDown, + math.max(0, self.tileSizeH - math.max( + self.gradOutputTensorList[index].tensorSizeH, + self.gradInputTensorList[index].tensorSizeH + self.padUp)) + savePadRight, self.padRight = + self.padRight, + math.max(0, self.tileSizeW - math.max( + self.gradOutputTensorList[index].tensorSizeW, + self.gradInputTensorList[index].tensorSizeW + self.padLeft)) + return savePadUp, savePadLeft, savePadDown, savePadRight +end + +function SpatialConvolutionFFTTiled:updateOutputFFTImpl(input) + local ok, res = + pcall(SpatialConvolutionFFTTiled.abstractUpdateOutputFFTImpl, self, input) + if ok then + return res + end + self.success = false + if self.reportErrors then + print(res .. " -> updateOutput fallback to untiled FBFFT") + end + + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + error('Using tuned SpatialConvolution and found an error, early exit') + end + + error("Bug in fallback form Tiled to FBFFT on updateOutput" .. 
+ " Drop back higher up in the food chain") + -- This path is becoming obsolete + -- Safety barrier and no reuse for error recovery + self.memoryReusePolicy = List{ + nn.SpatialConvolutionFFT.memoryReuseNone} + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return parent.updateOutputFFTImpl(self, input) +end + + +function SpatialConvolutionFFTTiled:instUpdateOutputFFTImpl( + input, gradOutput) + assert(false, "Do not call the abstract class directly!") +end + + +function SpatialConvolutionFFTTiled:abstractUpdateOutputFFTImpl(input) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local nBatches = input:size(1) + -- Allocate the output for this module, only once + if not self.output or self.output:nElement() == 0 then + self.output = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nOutputPlane, + input:size(3) + self.padUp + self.padDown - self.kH + 1, + input:size(4) + self.padLeft + self.padRight - self.kW + 1})) + end + + errorIf(self.tileSizeH > self.output:size(3) or + self.tileSizeW > self.output:size(4), + 'Tile size too large (' .. self.tileSizeH .. 'x' .. self.tileSizeW .. + ') for output (' .. self.output:size(3) .. 'x' .. + self.output:size(4) .. ')') + + -- Perform tiling on meta-tensor list + if not self.inputTensorList or + not self.outputTensorList or + not self.metaDataListUpdateOutput + then + self.inputTensorList = nil + self.outputTensorList = nil + self.metaDataListUpdateOutput = nil + local maxTileIndexH + local maxTileIndexW + -- In updateOutputTiled, the tiling of output is without overlap + -- and without padding. It informs how the tiling on padded input + -- should be performed + self.outputTensorList, maxTileIndexH, maxTileIndexW = + TiledView2D(self.output, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1) + self.inputTensorList = TiledView2D(input, + self.tileSizeH, + self.tileSizeW, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.padLeft, + self.padUp, + self.padRight, + self.padDown, + maxTileIndexH + 1, + maxTileIndexW + 1) + + self.metaDataListUpdateOutput = List{} + for i = 1, self.inputTensorList:len() do + local metaData = self:makeMetaData( + nn.SpatialConvolutionFFT.ForwardFFTPass, + self.inputTensorList[i].tileIndexW, + self.inputTensorList[i].tileIndexH, + self.outputTensorList[i].tileIndexW, + self.outputTensorList[i].tileIndexH) + -- By default skip bias when offloading computation to FBFFT + -- and do it at the very end + metaData.skipBias = true + self.metaDataListUpdateOutput:append(metaData) + end + end + + errorIfNot(self.outputTensorList:len() == self.inputTensorList:len(), + "Error in tile metadata: not the same sizes input = " .. + self.inputTensorList:len() .. " VS output = " .. + self.outputTensorList:len()) + + -- At this point tiles / metadata for buffer management / reuse are available + -- in self.xyz just call the actual instantiation + + return self:instUpdateOutputFFTImpl(input) +end + + +function SpatialConvolutionFFTTiled:updateGradInputFFTImpl(input, gradOutput) + local ok, res = + pcall(SpatialConvolutionFFTTiled.abstractUpdateGradInputFFTImpl, + self, + input, + gradOutput) + if ok then + return res + end + self.success = false + if self.reportErrors then + print(res .. 
" -> updateGradInput fallback to untiled FBFFT") + end + + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + error('Using tuned SpatialConvolution and found an error, early exit') + end + + error("Bug in fallback form Tiled to FBFFT on updateGradInput" .. + " Drop back higher up in the food chain") + -- Safety barrier and no reuse for error recovery + self.memoryReusePolicy = List{ + nn.SpatialConvolutionFFT.memoryReuseNone} + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return parent.updateGradInputFFTImpl(self, input, gradOutput) +end + +function SpatialConvolutionFFTTiled:instUpdateGradInputFFTImpl( + input, gradOutput) + assert(false, "Do not call the abstract class directly!") +end + +function SpatialConvolutionFFTTiled:abstractUpdateGradInputFFTImpl( + input, gradOutput) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local nBatches = input:size(1) + + -- Allocate the gradInput for this module, only once + if not self.gradInput or self.gradInput:nElement() == 0 then + self.gradInput = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nInputPlane, + input:size(3), + input:size(4)})) + else + errorIfNot(self.gradInput:size(1) == input:size(1)) + errorIfNot(self.gradInput:size(2) == input:size(2)) + errorIfNot(self.gradInput:size(3) == input:size(3)) + errorIfNot(self.gradInput:size(4) == input:size(4)) + end + + errorIf(self.tileSizeH > gradOutput:size(3) or + self.tileSizeW > gradOutput:size(4), + 'Tile size too large (' .. self.tileSizeH .. 'x' .. self.tileSizeW .. + ') for gradOutput (' .. gradOutput:size(3) .. 'x' .. + gradOutput:size(4) .. ')') + + -- Perform tiling on meta-tensor list + if not self.gradOutputTensorList or + not self.gradInputTensorList or + not self.metaDataListUpdateGradInput + then + self.gradOutputTensorList = nil + self.gradInputTensorList = nil + self.metaDataListUpdateGradInput = nil + local maxTileIndexH + local maxTileIndexW + -- In updateGradInputTiled, the tiling of gradInput is without overlap + -- and with padding. It informs how the tiling on padded gradOutput + -- should be performed. + self.gradInputTensorList, maxTileIndexH, maxTileIndexW = + TiledView2D(self.gradInput, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.padLeft, + self.padUp, + self.padRight, + self.padDown) + self.gradOutputTensorList = TiledView2D(gradOutput, + self.tileSizeH, + self.tileSizeW, + self.tileSizeH - self.kH + 1, + self.tileSizeW - self.kW + 1, + self.kW - 1, + self.kH - 1, + self.kW - 1, + self.kH - 1, + maxTileIndexH + 1, + maxTileIndexW + 1) + self.metaDataListUpdateGradInput = List{} + for i = 1, self.gradInputTensorList:len() do + local metaData = self:makeMetaData( + nn.SpatialConvolutionFFT.BackwardFFTPass, + self.gradInputTensorList[i].tileIndexW, + self.gradInputTensorList[i].tileIndexH, + self.gradOutputTensorList[i].tileIndexW, + self.gradOutputTensorList[i].tileIndexH) + self.metaDataListUpdateGradInput:append(metaData) + end + end + + errorIfNot(self.gradInputTensorList:len() == self.gradOutputTensorList:len(), + "Not the same sizes input = " .. self.gradOutputTensorList:len() .. + " VS output = " .. 
self.gradInputTensorList:len()) + + + -- At this point tiles / metadata for buffer management / reuse are available + -- in self.xyz just call the actual instantiation + + return self:instUpdateGradInputFFTImpl(input, gradOutput) +end + + +function SpatialConvolutionFFTTiled:accGradParametersFFTImpl( + input, gradOutput, scale) + local ok, res = + pcall(SpatialConvolutionFFTTiled.abstractAccGradParametersFFTImpl, + self, + input, + gradOutput, + scale) + if ok then + return res + end + self.success = false + if self.reportErrors then + print(res .. " -> accGradParameters fallback to untiled FBFFT") + end + + -- This path exits early for tuned SpatialConvolution.lua + self.success = false + if self.autotuningPass then + error('Using tuned SpatialConvolution and found an error, early exit') + end + + error("Bug in fallback form Tiled to FBFFT on accGradParametersFFTImpl" .. + " Drop back higher up in the food chain") + -- Safety barrier and no reuse for error recovery + self.memoryReusePolicy = List{ + nn.SpatialConvolutionFFT.memoryReuseNone} + -- ############################################## + cutorch.streamBarrier(self.allStreams) + parent.accGradParametersFFTImpl(self, input, gradOutput, scale) +end + + +function SpatialConvolutionFFTTiled:instAccGradParametersFFTImpl( + input, gradOutput) + assert(false, "Do not call the abstract class directly!") +end + + +function SpatialConvolutionFFTTiled:abstractAccGradParametersFFTImpl( + input, gradOutput, scale) + assert(torch.type(input) == 'torch.CudaTensor', "CUDA support only!") + + local scale = scale or 1 + local nBatches = input:size(1) + + -- Allocate the gradWeight for this module, only once + if not self.gradWeight or self.gradWeight:nElement() == 0 then + errorIfNot(false, + "GradWeight must already be allocated at module creation") + self.gradWeight = torch.CudaTensor(torch.LongStorage({ + nBatches, + self.nInputPlane, + self.kH, + self.kW})) + end + + errorIf(self.tileSizeH > gradOutput:size(3) or + self.tileSizeW > gradOutput:size(4), + 'Tile size too large (' .. self.tileSizeH .. 'x' .. self.tileSizeW .. + ') for gradOutput (' .. gradOutput:size(3) .. 'x' .. + gradOutput:size(4) .. ')') + + -- Perform tiling on meta-tensor list + if not self.gradOutputTensorList2 or + not self.inputTensorList2 or + not self.metaDataListAccGrad then + self.gradOutputTensorList2 = nil + self.inputTensorList2 = nil + self.metaDataListAccGrad = nil + local maxTileIndexH + local maxTileIndexW + errorIfNot(self.tileSizeH >= self.kH, + 'Tiling cannot be smaller than kernel !') + errorIfNot(self.tileSizeW >= self.kW, + 'Tiling cannot be smaller than kernel !') + -- In updateGradInputTiled, the tiling of gradOutput is without overlap + -- and without padding. It informs how the tiling on padded input + -- should be performed. 
+ self.gradOutputTensorList2, maxTileIndexH, maxTileIndexW = + TiledView2D(gradOutput, + self.tileSizeH - (self.kH - 1), + self.tileSizeW - (self.kW - 1), + self.tileSizeH - (self.kH - 1), + self.tileSizeW - (self.kW - 1)) + self.inputTensorList2 = TiledView2D(input, + self.tileSizeH, + self.tileSizeW, + self.tileSizeH - (self.kH - 1), + self.tileSizeW - (self.kW - 1), + self.padLeft, + self.padUp, + self.padRight, + self.padDown, + maxTileIndexH + 1, + maxTileIndexW + 1) + + self.metaDataListAccGrad = List{} + for i = 1, self.inputTensorList2:len() do + local metaData = self:makeMetaData( + nn.SpatialConvolutionFFT.AccGradientFFTPass, + self.inputTensorList2[i].tileIndexW, + self.inputTensorList2[i].tileIndexH, + self.gradOutputTensorList2[i].tileIndexW, + self.gradOutputTensorList2[i].tileIndexH) + self.metaDataListAccGrad:append(metaData) + end + end + + errorIfNot(self.inputTensorList2:len() == self.gradOutputTensorList2:len(), + "Not the same sizes input = " .. self.gradOutputTensorList2:len() .. + " VS output = " .. self.inputTensorList2:len()) + + -- At this point tiles / metadata for buffer management / reuse are available + + self:instAccGradParametersFFTImpl(input, gradOutput, scale) +end + +-- Makes or reuses square FFT buffers up to the next power of 2 +function SpatialConvolutionFFTTiled:prepareSizeAndBuffers(i, w, o, metaData) + return parent.prepareSizeAndBuffers(self, i, w, o, metaData) +end + +function SpatialConvolutionFFTTiled:makeMetaData( + pass, + inputTileIndexW, inputTileIndexH, + outputTileIndexW, outputTileIndexH, + weightTileIndexW, weightTileIndexH) + local metaData = {} + metaData.pass = pass + metaData.input = {} + metaData.input.tileIndexH = inputTileIndexH + metaData.input.tileIndexW = inputTileIndexW + metaData.output = {} + metaData.output.tileIndexH = outputTileIndexH + metaData.output.tileIndexW = outputTileIndexW + metaData.weight = {} + metaData.weight.tileIndexH = weightTileIndexH + metaData.weight.tileIndexW = weightTileIndexW + return metaData +end + +-- Discriminated buffers based on bufferType, bufferSize, tileIndex and +-- whether it is an input or an output "of the algorithm" +function SpatialConvolutionFFTTiled:getBufferKey( + BufferType, bufferSizes, metaData) + assert(torch.type(bufferSizes) == 'torch.LongStorage', + torch.type(bufferSizes)) + assert(torch.type(metaData) == 'table', + torch.type(metaData)) + + -- TODO: needs semantics for proper producer consumer dependences and + -- ordering for RAW dependences by using self.moduleTimeStep properly + local md = {} + if metaData then + if BufferType == nn.SpatialConvolutionFFT.FFTInputBufferType then + md.tileIndices = metaData.input + elseif BufferType == nn.SpatialConvolutionFFT.FFTOutputBufferType then + md.tileIndices = metaData.output + else + md.tileIndices = metaData.weight + end + + -- This is an adhoc way to discriminate between + -- updateOutput / updateGradInput / accGradParameters + -- input (false) / gradInput (true) / input (false) + -- output (true) / gradOutput (false) / input (false) + -- weight (false) / weight (false) / gradWeight (true) + -- + local isOutputOfAlgorithm = false + -- In cufft mode, the tiled complex buffers are reused + if (metaData.pass == nn.SpatialConvolutionFFT.ForwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTOutputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.BackwardFFTPass and + BufferType == nn.SpatialConvolutionFFT.FFTInputBufferType) or + (metaData.pass == nn.SpatialConvolutionFFT.AccGradientFFTPass and + BufferType == 
nn.SpatialConvolutionFFT.FFTWeightBufferType) + then + isOutputOfAlgorithm = true + end + md.isOutputOfAlgorithm = isOutputOfAlgorithm + end + + -- If no memory reuse, all modules must use the same buffers, only + -- discriminate by buffer type and device id. + local moduleDiscr = self.moduleUID + if self.memoryReusePolicy:contains(nn.SpatialConvolutionFFT.memoryReuseNone) + then + moduleDiscr = nil + bufferSizes = nil + if torch.type(self) ~= "nn.SpatialConvolutionFFTTiledAsync" then + -- if we run async we must have multiple tiles live at the same time, + -- just let all tiles be live at the same time + md = nil + end + end + + local bufferKey = { + self.cudaTensorBuffers, + cutorch.getDevice(), + BufferType, + bufferSizes, + moduleDiscr, + -- Be sure to put a counter for buffer and reuse btw timesteps or + -- memory will be blown (i.e. full DSA = ouch) + -- self.moduleTimeStep, + md + } + + local res = thrift.to_string(bufferKey) + if not self.bufferKeys:contains(res) then + self.bufferKeys:append(res) + end + + if self.printDebugLevel >= 3 then + print("BufferKey: ", bufferKey) + print("Serialized to : ", res) + end + + return res +end + +function SpatialConvolutionFFTTiled:cleanupBuffers() + parent.cleanupBuffers(self) + + -- Tiling metadata + -- updateOutput + self.inputTensorList = nil + self.outputTensorList = nil + self.metaDataListUpdateOutput = nil + -- updateGradInput + self.gradInputTensorList = nil + self.gradOutputTensorList = nil + self.metaDataListUpdateGradInput = nil + -- accGradParameters + self.inputTensorList2 = nil + self.gradOutputTensorList2 = nil + self.metaDataListAccGrad = nil + +end diff --git a/fbcunn/SpatialConvolutionFFTTiledAsync.lua b/fbcunn/SpatialConvolutionFFTTiledAsync.lua new file mode 100644 index 0000000..55f6198 --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiledAsync.lua @@ -0,0 +1,369 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
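The asynchronous variant defined in this new file pipelines one tile per CUDA stream: each tile's FFTs, the transposed complex GEMM, and the IFFT are queued on the current stream, the stream index is rotated round-robin, and stream barriers join everything before the bias is applied. Below is a minimal sketch of just that scheduling skeleton, assuming a few extra streams have been reserved beforehand; `forEachTileAsync`, `workOnTile` and `allStreams` are placeholder names for illustration, not identifiers from this patch.

-- Round-robin stream scheduling sketch (illustration only).
local function forEachTileAsync(tiles, workOnTile)
   -- assumes extra streams were reserved at startup, e.g. cutorch.reserveStreams(4)
   local allStreams = {}
   for s = 0, cutorch.getNumStreams() do
      table.insert(allStreams, s)              -- stream 0 is the default stream
   end
   cutorch.streamBarrier(allStreams)           -- start from a clean slate
   local currentStream = 1
   for i = 1, #tiles do
      cutorch.setStream(currentStream)         -- kernels issued below land on this stream
      workOnTile(tiles[i])                     -- e.g. FFT + transposeMM + IFFT for one tile
      currentStream = currentStream % cutorch.getNumStreams() + 1
   end
   cutorch.streamBarrier(allStreams)           -- join before reading any result
   cutorch.setStream(0)                        -- back to the default stream
end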
+ +require 'cudnn' +local List = require 'pl.List' +local ffi = require 'ffi' + +local lib_name = 'torch_fb_fbcunn_mm' +local lib_path = package.searchpath(lib_name, package.cpath) +local FBMMFFI = ffi.load(lib_path and lib_path or lib_name) + +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +local function errorIf(cond, msg) + if cond then + error(msg) + end +end + +local function errorIfNot(cond, msg) + errorIf(not cond, msg) +end + +local function equalsTiledTensorDescriptor(td1, td2) + local res = true + if td1.tileSizeH ~= td2.tileSizeH then + res = res and false + end + if td1.tileSizeW ~= td2.tileSizeW then + res = res and false + end + if td1.tileIndexH ~= td2.tileIndexH then + res = res and false + end + if td1.tileIndexW ~= td2.tileIndexW then + res = res and false + end + if td1.tensorSizeH ~= td2.tensorSizeH then + res = res and false + end + if td1.tensorSizeW ~= td2.tensorSizeW then + res = res and false + end + if td1.padUp ~= td2.padUp then + res = res and false + end + if td1.padLeft ~= td2.padLeft then + res = res and false + end + if td1.tensor:storage() ~= td2.tensor:storage() then + res = res and false + end + if td1.tensor:storageOffset() ~= td2.tensor:storageOffset() then + res = res and false + end + return res +end + + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiledAsync, parent = + torch.class('nn.SpatialConvolutionFFTTiledAsync', + 'nn.SpatialConvolutionFFTTiled') + +function SpatialConvolutionFFTTiledAsync:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) +end + + +function SpatialConvolutionFFTTiledAsync:instUpdateOutputFFTImpl(input) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList) + assert(self.outputTensorList) + + local currentStream = 1 + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + for i = 1, self.outputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.inputTensorList[i].tensor:size() .. " vs " .. 
+ #input:size()) + errorIfNot(#self.outputTensorList[i].tensor:size() == #self.output:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList) + + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + -- Whatever the memory reuse policy, when tiling, we can reuse + -- the computed FFT(weight), this is one of the points of tiling + reuseList:append(self.FFTWeightBufferType) + end + local inputLocal = self.inputTensorList[i].tensor + local outputLocal = self.outputTensorList[i].tensor + local metaData = self.metaDataListUpdateOutput[i] + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#inputLocal:size() == cublasBatchDims + 2) + + local commonSize = self:prepareSizeAndBuffers( + inputLocal, self.weight, outputLocal, metaData) + + -- Run all under this currentStream + cutorch.setStream(currentStream) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fft(inputLocal, self.inputBuffer, cublasBatchDims) + if not reuseList or not reuseList:contains(self.FFTWeightBufferType) then + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + -- Since we're running async, everyone must wait on my mighty buffers + -- ############################################## + cutorch.streamBarrier(self.allStreams) + end + local norm = self:getNormalizationFactor(commonSize, inputLocal) + FBMMFFI.transposeMMFFI(cutorch._state, + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + self.outputBuffer:cdata(), + 1.0 / norm, + false, + true, + false) + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:ffti(outputLocal, self.outputBuffer, cublasBatchDims) + currentStream = currentStream % cutorch.getNumStreams() + 1 + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + cutorch.setStream(1) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + + +function SpatialConvolutionFFTTiledAsync:instUpdateGradInputFFTImpl( + input, gradOutput) + -- Make sure tiling information has been precomputed + assert(self.gradInputTensorList) + assert(self.gradOutputTensorList) + + local currentStream = 1 + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + for i = 1, self.gradInputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.gradInputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.gradInputTensorList[i].tensor:size() .. + " vs " .. 
#self.gradInput:size()) + errorIfNot( + #self.gradOutputTensorList[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + -- Need additional padding for circular symmetry in Fourier domain + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPaddingWithCircularSymmetry(i, self.tileSizeH, self.tileSizeW) + + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + -- Whatever the memory reuse policy, when tiling, we can reuse + -- the computed FFT(weight), this is one of the points of tiling + reuseList:append(self.FFTWeightBufferType) + end + + local inputLocal = self.gradInputTensorList[i].tensor + local outputLocal = self.gradOutputTensorList[i].tensor + local metaData = self.metaDataListUpdateGradInput[i] + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#inputLocal:size() == cublasBatchDims + 2) + + local commonSize = self:prepareSizeAndBuffers( + inputLocal, self.weight, outputLocal, metaData) + + -- Run all under this currentStream + cutorch.setStream(currentStream) + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(outputLocal, self.outputBuffer, cublasBatchDims) + if not reuseList or not reuseList:contains(self.FFTWeightBufferType) then + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(self.weight, self.weightBuffer, cublasBatchDims) + -- Since we're running async, everyone must wait on my mighty buffers + -- ############################################## + cutorch.streamBarrier(self.allStreams) + end + local norm = self:getNormalizationFactor(commonSize, outputLocal) + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.weightBuffer:cdata(), + self.inputBuffer:cdata(), + 1.0 / norm, + false, + false, + false) + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:ffti(inputLocal, self.inputBuffer, cublasBatchDims) + currentStream = currentStream % cutorch.getNumStreams() + 1 + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.gradInput +end + + +function SpatialConvolutionFFTTiledAsync:instAccGradParametersFFTImpl( + input, gradOutput, scale) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList2) + assert(self.gradOutputTensorList2) + + -- At this point tiles / metadata for buffer management / reuse are available + local previousStream + local currentStream = 1 + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- Run ahead + cutorch.setStream(currentStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + previousStream, currentStream = + currentStream, currentStream % cutorch.getNumStreams() + 1 + + for i = 1, self.inputTensorList2:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList2[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.inputTensorList2[i].tensor:size() .. + " vs " .. 
#input:size()) + errorIfNot( + #self.gradOutputTensorList2[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList2) + + local firstWrite = (i == 1) + local lastWrite = (i == self.inputTensorList2:len()) + -- Interestingly, tiled input is reusable but has a long liveness + -- If we don't reuse it we can reclaim the memory for something else + -- This is all controlled by the bufferKey + -- local reuseList = List{} + local reuseList = List{} + if self.inputTensorList and -- not cleaned earlier -> may want to reuse + equalsTiledTensorDescriptor(self.inputTensorList[i], + self.inputTensorList2[i]) then + reuseList:append(self.FFTInputBufferType) + end + + local inputLocal = self.inputTensorList2[i].tensor + local outputLocal = self.gradOutputTensorList2[i].tensor + local metaData = self.metaDataListAccGrad[i] + local cublasBatchDims = 2 + -- 2D convolutions on 4D tensors atm + assert(#inputLocal:size() == cublasBatchDims + 2) + + local commonSize = self:prepareSizeAndBuffers( + inputLocal, self.gradWeight, outputLocal, metaData) + + -- Run all under this currentStream + cutorch.setStream(currentStream) + if not reuseList or not reuseList:contains(self.FFTOutputBufferType) + then + -- Potentially reuse buffer if so told + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:fft(outputLocal, self.outputBuffer, cublasBatchDims) + else + error('UpdateGradInput and AccGradParameter tiled padded ' .. + 'gradOuput cannot be shared atm') + end + if not reuseList or not reuseList:contains(self.FFTInputBufferType) + then + -- Potentially reuse buffer if so told + local fftWrapperPadded = nn.FFTWrapper( + self.fftImplementation, self.padLeft, self.padUp) + fftWrapperPadded:fft( + inputLocal, self.inputBuffer, cublasBatchDims) + end + + -- Because we accumulate into C, we must synchronize with the + -- previous transposeMMFFI call. We statically know by construction + -- that it leaves on previousStream and by transitivity of + -- dependences we're good to go + cutorch.streamWaitFor(currentStream, {previousStream}) + local lastWriteNorm = 1.0 + if lastWrite then + local norm = self:getNormalizationFactor(commonSize, outputLocal) + lastWriteNorm = (1.0 * scale) / norm + end + FBMMFFI.transposeMMFFI(cutorch._state, + self.outputBuffer:cdata(), + self.inputBuffer:cdata(), + self.weightBuffer:cdata(), + lastWriteNorm, + true, -- conjugate A + false, -- B + not firstWrite) -- accumulate into C + + -- 3. Accumulate in the frequency domain, IFFT on last write + if lastWrite then + local fftWrapper = nn.FFTWrapper(self.fftImplementation) + fftWrapper:ffti( + self.gradWeight, self.weightBuffer, cublasBatchDims) + end + + if self.printDebugLevel >= 3 then + print('Step ASYNC gradWeight: ', self.gradWeight) + end + previousStream, currentStream = + currentStream, currentStream % cutorch.getNumStreams() + 1 + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) +end diff --git a/fbcunn/SpatialConvolutionFFTTiledIterated.lua b/fbcunn/SpatialConvolutionFFTTiledIterated.lua new file mode 100644 index 0000000..77d1cca --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiledIterated.lua @@ -0,0 +1,231 @@ +-- Copyright 2004-present Facebook. 
All Rights Reserved. + +require 'cudnn' +local List = require 'pl.List' +local ffi = require 'ffi' + +local lib_name = 'torch_fb_fbcunn_convolution_bias' +local lib_path = package.searchpath(lib_name, package.cpath) +local ConvolutionBiasFFI = ffi.load(lib_path and lib_path or lib_name) + +local lib_name = 'torch_fb_fbcunn_FFTIteratedConvolution' +local lib_path = package.searchpath(lib_name, package.cpath) +local FFTIteratedConvolution = ffi.load(lib_path and lib_path or lib_name) + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiledIterated, parent = + torch.class('nn.SpatialConvolutionFFTTiledIterated', + 'nn.SpatialConvolutionFFTTiled') + +function SpatialConvolutionFFTTiledIterated:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + + -- Override any memory reuse scheme: just no reuse + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseNone} +end + +-- Adjustment needed for updateGradInput since we don't do circular +-- shifts in the Fourier domain, just shift in time. +local function buildTiledDeviceTensorFFI( + inputTensorList, outputTensorList, adjustInputShiftW, adjustInputShiftH) + local adjustInputShiftW = adjustInputShiftW or 0 + local adjustInputShiftH = adjustInputShiftH or 0 + local size = inputTensorList:len() + assert(outputTensorList:len() == size) + local inputTiledDeviceTensorFFI = + ffi.new("TiledDeviceTensorFFI[?]", size) + local outputTiledDeviceTensorFFI = + ffi.new("TiledDeviceTensorFFI[?]", size) + for i = 1, size do + inputTiledDeviceTensorFFI[i - 1].tensor = + inputTensorList[i].tensor:cdata() + inputTiledDeviceTensorFFI[i - 1].padL = + inputTensorList[i].padLeft + adjustInputShiftW + inputTiledDeviceTensorFFI[i - 1].padU = + inputTensorList[i].padUp + adjustInputShiftH + outputTiledDeviceTensorFFI[i - 1].tensor = + outputTensorList[i].tensor:cdata() + outputTiledDeviceTensorFFI[i - 1].padL = outputTensorList[i].padLeft + outputTiledDeviceTensorFFI[i - 1].padU = outputTensorList[i].padUp + end + return inputTiledDeviceTensorFFI, outputTiledDeviceTensorFFI, size +end + +function SpatialConvolutionFFTTiledIterated:instUpdateOutputFFTImpl(input) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList) + assert(self.outputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + local inputTiledDeviceTensorFFI, outputTiledDeviceTensorFFI, numTiles = + buildTiledDeviceTensorFFI(self.inputTensorList, self.outputTensorList) + + + for _, actualTileSize in ipairs({8, 16, 32}) do + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) and + self.tileSizeH <= actualTileSize and self.tileSizeW <= actualTileSize + then + -- Only do iterated convolutions if there is no reuse + self.output:zero() + -- ############################################## + cutorch.streamBarrier(self.allStreams) + local convolutionPassFFI = + ffi.new("FFTConvolutionPassFFI") + convolutionPassFFI.pass = convolutionPassFFI.FFT_UpdateOutput + + FFTIteratedConvolution.convolveIteratedFFI( + cutorch._state, + inputTiledDeviceTensorFFI, + self.weight:cdata(), + 
outputTiledDeviceTensorFFI, + numTiles, + actualTileSize, + convolutionPassFFI, + 1.0) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return self.output + end + end + + error('updateOutputIterated tiling by ' .. self.tileSizeW .. 'x' .. + self.tileSizeH .. ' not supported') +end + + + +function SpatialConvolutionFFTTiledIterated:instUpdateGradInputFFTImpl( + input, gradOutput) + -- Make sure tiling information has been precomputed + assert(self.gradInputTensorList) + assert(self.gradOutputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + local gradInputTiledDeviceTensorFFI, + gradOutputTiledDeviceTensorFFI, + numTiles = + buildTiledDeviceTensorFFI(self.gradInputTensorList, + self.gradOutputTensorList, + -- Adjust for no circular rotation in + -- Fourier domain + self.kW - 1, + self.kH - 1 + ) + + for _, actualTileSize in ipairs({8, 16, 32}) do + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) and + self.tileSizeH <= actualTileSize and self.tileSizeW <= actualTileSize + then + -- Only do iterated convolutions if there is not reuse + self.gradInput:zero() + -- ############################################## + cutorch.streamBarrier(self.allStreams) + local convolutionPassFFI = + ffi.new("FFTConvolutionPassFFI") + convolutionPassFFI.pass = convolutionPassFFI.FFT_UpdateGradInput + FFTIteratedConvolution.convolveIteratedFFI( + cutorch._state, + gradInputTiledDeviceTensorFFI, + self.weight:cdata(), + gradOutputTiledDeviceTensorFFI, + numTiles, + actualTileSize, + convolutionPassFFI, + 1.0) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return self.gradInput + end + end + + error('updateGradInputIterated tiling by ' .. self.tileSizeW .. 'x' .. + self.tileSizeH .. 
' not supported') +end + + +function SpatialConvolutionFFTTiledIterated:instAccGradParametersFFTImpl( + input, gradOutput, scale) + + -- Make sure tiling information has been precomputed + assert(self.inputTensorList2) + assert(self.gradOutputTensorList2) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + local inputTiledDeviceTensorFFI, + gradOutputTiledDeviceTensorFFI, + numTiles = + buildTiledDeviceTensorFFI(self.inputTensorList2, + self.gradOutputTensorList2) + + for _, actualTileSize in ipairs({8, 16, 32}) do + if self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone) and + self.tileSizeH <= actualTileSize and self.tileSizeW <= actualTileSize + then + -- Only do iterated convolutions if there is no reuse + self.gradWeight:zero() + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- Run ahead + cutorch.setStream(1) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + cutorch.setStream(2) + local convolutionPassFFI = + ffi.new("FFTConvolutionPassFFI") + convolutionPassFFI.pass = convolutionPassFFI.FFT_AccGradParameters + FFTIteratedConvolution.convolveIteratedFFI( + cutorch._state, + inputTiledDeviceTensorFFI, + self.gradWeight:cdata(), + gradOutputTiledDeviceTensorFFI, + numTiles, + actualTileSize, + convolutionPassFFI, + scale) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + return + end + end + + error('accGradParametersIterated tiling by ' .. self.tileSizeW .. 'x' .. + self.tileSizeH .. ' not supported') +end diff --git a/fbcunn/SpatialConvolutionFFTTiledSync.lua b/fbcunn/SpatialConvolutionFFTTiledSync.lua new file mode 100644 index 0000000..decac43 --- /dev/null +++ b/fbcunn/SpatialConvolutionFFTTiledSync.lua @@ -0,0 +1,247 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
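All of these tiled modules (synchronous, asynchronous and iterated) share the same overlap-save geometry: for a stride-1 convolution with a kernel of size k, a tile of size tileSize yields tileSize - k + 1 valid outputs, so input tiles advance by that step and overlap by k - 1, while output tiles are disjoint, exactly as in the TiledView2D calls above. The small helper below spells out that arithmetic; the function name is illustrative and not part of the patch.

-- Overlap-save tile geometry along one dimension, stride-1 convolution.
-- inputSize, padBefore, padAfter : logical input extent and its zero padding
-- tileSize, k                    : FFT tile size and kernel size (tileSize >= k)
local function tileGeometry1D(inputSize, padBefore, padAfter, tileSize, k)
   assert(tileSize >= k, "tile must be at least as large as the kernel")
   local step      = tileSize - k + 1            -- valid outputs produced per tile
   local overlap   = k - 1                       -- input overlap between adjacent tiles
   local outputLen = inputSize + padBefore + padAfter - k + 1
   local numTiles  = math.ceil(outputLen / step) -- disjoint output tiles covering outputLen
   return step, overlap, numTiles
end

-- e.g. a 128-wide map, 3x3 kernel, 16-point tiles: step 14, overlap 2, 10 tiles
-- print(tileGeometry1D(128, 1, 1, 16, 3))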
+ +require 'cudnn' +local List = require 'pl.List' +local ffi = require 'ffi' +local ConvolutionBiasFFI = ffi.load('torch_fb_fbcunn_convolution_bias') + +local function errorIf(cond, msg) + if cond then + error(msg) + end +end + +local function errorIfNot(cond, msg) + errorIf(not cond, msg) +end + +------------------------------------------------------------------------------ +-- Actual Module +------------------------------------------------------------------------------ +local SpatialConvolutionFFTTiledSync, parent = + torch.class('nn.SpatialConvolutionFFTTiledSync', + 'nn.SpatialConvolutionFFTTiled') + +function SpatialConvolutionFFTTiledSync:__init(nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + parent.__init(self, + nInputPlane, + nOutputPlane, + kW, + kH, + dW, + dH, + padLeft, + padUp, + tileSizeW, + tileSizeH, + memoryReusePolicy, + numCudaStreams) + + -- Override any memory reuse scheme: just no reuse + self.memoryReusePolicy = List{nn.SpatialConvolutionFFT.memoryReuseNone} +end + +function SpatialConvolutionFFTTiledSync:instUpdateOutputFFTImpl(input) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList) + assert(self.outputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + -- Push / pop the local tensor, we're calling a parent in sync mode + local saveOutput = self.output + for i = 1, self.outputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.inputTensorList[i].tensor:size() .. " vs " .. + #input:size()) + errorIfNot(#self.outputTensorList[i].tensor:size() == #self.output:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList) + + -- Even in the absence of reuse we can compute the weight buffers only + -- once. 
This is one of the points of tiling in the first place + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + reuseList:append(self.FFTWeightBufferType) + end + self.output = self.outputTensorList[i].tensor + -- Go up 2 levels, 'cast' as SpatialConvolutionFBFFT + nn.SpatialConvolutionFBFFT.updateOutputFFTImpl( + self, + self.inputTensorList[i].tensor, + reuseList, + self.metaDataListUpdateOutput[i]) + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + self.output = saveOutput + -- ############################################## + cutorch.streamBarrier(self.allStreams) + cutorch.setStream(1) + ConvolutionBiasFFI.updateOutputBiasFFI( + cutorch._state, self.output:cdata(), self.bias:cdata()) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + return self.output +end + + + +function SpatialConvolutionFFTTiledSync:instUpdateGradInputFFTImpl( + input, gradOutput) + -- Make sure tiling information has been precomputed + assert(self.gradInputTensorList) + assert(self.gradOutputTensorList) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + -- Push / pop the local tensor, we're calling a parent in sync mode + local saveGradInput = self.gradInput + for i = 1, self.gradInputTensorList:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.gradInputTensorList[i].tensor:size() == #input:size(), + "Tensor size mismatch: " .. + #self.gradInputTensorList[i].tensor:size() .. + " vs " .. #self.gradInput:size()) + errorIfNot( + #self.gradOutputTensorList[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + -- Need additional padding for circular symmetry in Fourier domain + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPaddingWithCircularSymmetry(i, self.tileSizeH, self.tileSizeW) + + local firstIteration = (i == 1) + local reuseList = List{} + if not firstIteration then + reuseList:append(self.FFTWeightBufferType) + end + + self.gradInput = self.gradInputTensorList[i].tensor + -- Go up 2 levels, 'cast' as SpatialConvolutionFBFFT + nn.SpatialConvolutionFBFFT.updateGradInputFFTImpl( + self, + self.gradInput, -- used only as model + self.gradOutputTensorList[i].tensor, + -- weight buffers can always be reused + -- since we enforce that tiles are larger + -- than weights + reuseList, + self.metaDataListUpdateGradInput[i]) + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + self.gradInput = saveGradInput + return self.gradInput +end + + +function SpatialConvolutionFFTTiledSync:instAccGradParametersFFTImpl( + input, gradOutput, scale) + -- Make sure tiling information has been precomputed + assert(self.inputTensorList2) + assert(self.gradOutputTensorList2) + assert(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseNone)) + + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + -- Run ahead + local currentStream = 0 + cutorch.setStream(currentStream) + ConvolutionBiasFFI.accGradParametersBiasFFI( + cutorch._state, gradOutput:cdata(), self.gradBias:cdata(), scale) + + for i = 1, self.inputTensorList2:len() do + -- Assert consistency of tensor dimensions + errorIfNot(#self.inputTensorList2[i].tensor:size() == #input:size(), + 
"Tensor size mismatch: " .. + #self.inputTensorList2[i].tensor:size() .. + " vs " .. #input:size()) + errorIfNot( + #self.gradOutputTensorList2[i].tensor:size() == #gradOutput:size()) + + -- Set padding for this tile which can be partial and on the boundary + local savePadUp, savePadLeft, savePadDown, savePadRight = + self:pushPadding(i, self.inputTensorList2) + + local firstWrite = (i == 1) + local lastWrite = (i == self.inputTensorList2:len()) + + -- We accumulate in this thing, make sure it is zero + self.gradWeight:zero() + self.gradBias:zero() + + if firstWrite then + self.gradWeightAcc = self.gradWeight:clone() + self.gradBiasAcc = self.gradBias:clone() + end + + -- Can't reuse tiled gradOutput without extra work + errorIf(self.memoryReusePolicy:contains( + nn.SpatialConvolutionFFT.memoryReuseOutput), + "Reuse output in tiled accGradParameters is not supproted") + + if self.printDebugLevel >= 3 then + print('Pre step synchronous gradWeight @', + self.gradWeight:cdata(), ': ', self.gradWeight:float()) + end + + -- Go up 2 levels, 'cast' as SpatialConvolutionFBFFT + nn.SpatialConvolutionFBFFT.accGradParametersFFTImpl( + self, + self.inputTensorList2[i].tensor, + self.gradOutputTensorList2[i].tensor, + scale, + List{}, -- reuseList + self.metaDataListAccGrad[i]) + + -- Super heavy, need to clear this up + -- ############################################## + cutorch.streamBarrier(self.allStreams) + self.gradWeightAcc:add(self.gradWeight) + self.gradBiasAcc:add(self.gradBias) + -- ############################################## + cutorch.streamBarrier(self.allStreams) + + if self.printDebugLevel >= 3 then + print('Step synchronous gradWeight @', + self.gradWeight:cdata(), ': ', self.gradWeight:float()) + end + + if lastWrite then + self.gradWeight:copy(self.gradWeightAcc) + self.gradBias:copy(self.gradBiasAcc) + end + + -- Pop back saved padding values + self.padUp, self.padLeft, self.padDown, self.padRight = + savePadUp, savePadLeft, savePadDown, savePadRight + end + + -- ############################################## + cutorch.streamBarrier(self.allStreams) +end diff --git a/fbcunn/TemporalKMaxPooling.lua b/fbcunn/TemporalKMaxPooling.lua index 2fe82ff..34fb88d 100644 --- a/fbcunn/TemporalKMaxPooling.lua +++ b/fbcunn/TemporalKMaxPooling.lua @@ -1,5 +1,11 @@ -- Copyright 2004-present Facebook. All Rights Reserved. +-- TemporalKmaxPooling +-- Input : (bsize x) width x height +-- Output : (bisze x) k_out x height +-- with k_out = max(k_out_prop, inputSeqLen) +-- where k_out_prop = max(k, ceil(k_dynamic*inputSeqLen)) + require 'cutorch' require 'nn' @@ -10,8 +16,21 @@ function TemporalKMaxPooling:__init(k, k_dynamic) parent.__init(self) self.k = k + if k_dynamic then + assert(k_dynamic <= 1 and k_dynamic >=0, + 'k_dynamic must be between 0 and 1') + end self.k_dynamic = k_dynamic or -1 + -- k_dynamic is an optional scalar parameter between 0 and 1 + -- that makes k a fraction of the input sequence size. + + -- To follow Kalchbrenner et al's architecture on Dynamic k-Max Pooling: + -- Use (k = k_top, kDynamic = (L - l)/L), with + -- L : total number of conv layers, + -- l : current convolutional layer to which the pooling is applied, + -- k_top : fixed pooling parameter for the topmost convolutional layer. 
+ self.output = torch.CudaTensor() self.gradInput = torch.CudaTensor() self.indices = torch.CudaTensor() diff --git a/fbcunn/init.lua b/fbcunn/init.lua index 67ab302..913cae9 100644 --- a/fbcunn/init.lua +++ b/fbcunn/init.lua @@ -2,20 +2,31 @@ require 'nn' require 'fbnn' require 'cunn' require 'libfbcunn' -require 'libfbcunnlayers' +require 'fbcunn.cuda_ext' include('AbstractParallel.lua') +include('BatchNormalization.lua') include('CuBLASWrapper.lua') include('DataParallel.lua') include('FeatureLPPooling.lua') include('FFTWrapper.lua') --- include('HalfPrecision.lua') +include('HalfPrecision.lua') include('LookupTableGPU.lua') include('ModelParallel.lua') include('OneBitDataParallel.lua') include('OneBitQuantization.lua') include('OneBitSGD.lua') -include('SpatialConvolutionCuFFT.lua') +include('FFTCDefs.lua') +include('SpatialBatchNormalization.lua') +-- include('SpatialConvolutionFFT.lua') +-- include('SpatialConvolutionCuFFT.lua') +-- include('SpatialConvolutionFBFFT.lua') +-- include('SpatialConvolutionFBFFTGemm.lua') +-- include('SpatialConvolutionFFTTiled.lua') +-- include('SpatialConvolutionFFTTiledSync.lua') +-- include('SpatialConvolutionFFTTiledAsync.lua') +-- include('SpatialConvolutionFFTTiledIterated.lua') +-- include('SpatialConvolution.lua') include('TemporalConvolutionFB.lua') include('TemporalKMaxPooling.lua') @@ -65,11 +76,11 @@ function nn.Module:getParametersByDevice() return nil end if dev == 0 then - return nn.Module._gather(params), nn.Module._gather(grads) + return nn.Module.flatten(params), nn.Module.flatten(grads) end return cutorch.withDevice(dev, - function() return nn.Module._gather(params), - nn.Module._gather(grads) + function() return nn.Module.flatten(params), + nn.Module.flatten(grads) end) end diff --git a/src/BLASParameters.cpp b/src/BLASParameters.cpp index f6b4c7b..6b4e164 100644 --- a/src/BLASParameters.cpp +++ b/src/BLASParameters.cpp @@ -16,8 +16,11 @@ std::ostream& operator<<(ostream& os, const BLASParameters& params) { os << " batchStepC = " << params.batchStepC; os << " #handles = " << params.handles.size(); os << " #streams = " << params.streams.size(); - os << " transposeA = " << (params.transposeA == CUBLAS_OP_T); - os << " transposeB = " << (params.transposeB == CUBLAS_OP_T); + os << " transposeA = " << ((params.transposeA == CUBLAS_OP_T) ? "t " : + (params.transposeA == CUBLAS_OP_C) ? "c " : "n"); + os << " transposeB = " << ((params.transposeB == CUBLAS_OP_T) ? "t " : + (params.transposeB == CUBLAS_OP_C) ? "c " : "n"); + os << " scale = (" << params.scaleRe << ", " << params.scaleIm << ")"; return os; } diff --git a/src/BLASParameters.h b/src/BLASParameters.h index b9890c4..abe06b6 100644 --- a/src/BLASParameters.h +++ b/src/BLASParameters.h @@ -33,12 +33,14 @@ struct BLASParameters { iterDims = i; return *this; } + // After iterDims, remaining outermost dimensions to be treated as batch // dimensions, for instance, in a gemmbatched call. BLASParameters& withBatchDims(int i) { batchDims = i; return *this; } + // Force running on a particular handle / stream index in the handle / // stream vectors. The actual handle / stream we will end up running on is // recovered by modulo indexing into the vector, default handle / stream if @@ -47,6 +49,7 @@ struct BLASParameters { resourceIndex = i; return *this; } + // Distance between two batches of A, used in batched mode, in case we want // to compute one entry every k. Step of zerom means the same matrix A will // be read over and over again. 
@@ -54,6 +57,7 @@ struct BLASParameters { batchStepA = i; return *this; } + // Distance between two batches of B, used in batched mode, in case we want // to compute one entry every k. Step of zerom means the same matrix B will // be read over and over again. @@ -61,6 +65,7 @@ struct BLASParameters { batchStepB = i; return *this; } + // Distance between two batches of C, used in batched mode, in case we want // to compute one entry every k. Step of zerom means the same matrix C will // be written over and over again. @@ -68,47 +73,69 @@ struct BLASParameters { batchStepC = i; return *this; } + // Sets real scale in C += alpha * C + scale * A * B BLASParameters& withScaleReal(float f) { scaleRe = f; return *this; } + // Sets imaginary scale in C += alpha * C + scale * A * B BLASParameters& withScaleImaginary(float f) { scaleIm = f; return *this; } + // Use cgemm instead of sgemm BLASParameters& withComplex(bool b) { asComplex = b; return *this; } + // If true, computes C += scale * A * B. Default is C = scale * A * B. BLASParameters& withAccumulate(bool b) { accumulate = b; return *this; } + // Set vector of handle resources BLASParameters& withHandles(const std::vector& h) { handles = h; return *this; } + // Set vector of stream resources BLASParameters& withStreams(const std::vector& s) { streams = s; return *this; } + // Transpose A BLASParameters& withTransposeA(cublasOperation_t t) { transposeA = t; return *this; } + // Transpose B BLASParameters& withTransposeB(cublasOperation_t t) { transposeB = t; return *this; } + // Transpose A + BLASParameters& withTransposeA(char c) { + transposeA = (c == 't') ? CUBLAS_OP_T : + ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N); + return *this; + } + + // Transpose B + BLASParameters& withTransposeB(char c) { + transposeB = (c == 't') ? CUBLAS_OP_T : + ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N); + return *this; + } + unsigned int iterDims; unsigned int batchDims; unsigned int resourceIndex; diff --git a/src/BatchNormalization.cu b/src/BatchNormalization.cu new file mode 100644 index 0000000..2a40d7d --- /dev/null +++ b/src/BatchNormalization.cu @@ -0,0 +1,460 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
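In training mode, the kernels in this new file compute, per feature: the batch mean, the (biased) inverse standard deviation 1 / sqrt(var + epsilon), exponential running estimates of both, and an optionally affine-scaled normalized output. The plain-Torch sketch below, written for a 2-D batch x feature input, is an editorial reference for reading the CUDA code that follows; the function and variable names are illustrative and not part of this patch.

-- CPU reference for the training path of BatchNormalizationUpdateOutput (2-D input).
local function batchNormUpdateOutputRef(input, runningMean, runningStd,
                                        weight, bias, epsilon, momentum)
   local mean     = input:mean(1)                            -- 1 x nFeature
   local centered = input - mean:expandAs(input)
   local var      = torch.cmul(centered, centered):mean(1)   -- biased variance (1/N)
   local invStd   = torch.pow(var + epsilon, -0.5)           -- the kernel stores 1 / stddev
   runningMean:mul(1 - momentum):add(momentum, mean:squeeze())
   runningStd:mul(1 - momentum):add(momentum, invStd:squeeze())
   local output = torch.cmul(centered, invStd:expandAs(input))
   if weight and bias then                                    -- affine == true
      output:cmul(weight:view(1, -1):expandAs(output))
      output:add(bias:view(1, -1):expandAs(output))
   end
   return output
end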
+ +#include "src/DeviceTensorUtils.h" +#include "THCTensor.h" + +#include "cuda/CudaUtils.cuh" +#include "cuda/DeviceTensor.cuh" +#include "cuda/MemoryAccess.cuh" +#include "cuda/util/CachedDeviceProperties.h" + +#define ENABLE_CUDA_DEBUG +#include "cuda/CudaDebugUtils.cuh" + +#include +#include + +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +#define LOG_TARGET VLOG(1) // LOG(INFO) + +template +__global__ void BatchNormalizationUpdateOutputInferenceUnrolled_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias) { + + static_assert(std::is_same::value , "type"); + + auto batch = blockIdx.y; + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= input.getSize(1)) { + return; + } + + // stddev is actually 1 / stddev + ComputeT stddev = runningStddev[x].ldg(); + ComputeT mean = runningMean[x].ldg(); + ComputeT inp = input[batch][x].ldg(); + if (affine) { + // multiply with gamma and add beta + // TODO: everyone pulling this, optimize by reusing better + ComputeT beta = bias[x].ldg(); + ComputeT gamma = weight[x].ldg(); + output[batch][x] = gamma * (inp - mean) * (stddev) + beta; + } else { + output[batch][x] = (inp - mean) * (stddev); + } +} + +template +__global__ void BatchNormalizationUpdateOutput_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum) { + + static_assert(std::is_same::value , "type"); + + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= output.getSize(1)) { + return; + } + + ComputeT norm = (ComputeT)1 / input.getSize(0); + + ComputeT batchMean = (ComputeT)0; + for (auto batch = 0; batch < output.getSize(0); ++batch) { + ComputeT b = input[batch][x].ldg(); + batchMean += b; + } + batchMean *= norm; + runningMean[x] = (1 - momentum) * runningMean[x] + momentum * batchMean; + + ComputeT stdMean = (ComputeT)0; + for (auto batch = 0; batch < output.getSize(0); ++batch) { + ComputeT inp = input[batch][x].ldg() ; + centered[batch][x] = inp - batchMean; + stdMean += (inp - batchMean) * (inp - batchMean); + } + stdMean = 1 / sqrt(stdMean * norm + epsilon); + + std[x] = stdMean; + runningStddev[x] = (1 - momentum) * runningStddev[x] + momentum * stdMean; + + for (auto batch = 0; batch < output.getSize(0); ++batch) { + output[batch][x] = centered[batch][x] * stdMean; + normalized[batch][x] = centered[batch][x] * stdMean; + if (affine) { + ComputeT beta = bias[x]; + ComputeT gamma = weight[x]; + output[batch][x] = gamma * output[batch][x] + beta; + } + } +} + + +template +void BatchNormalizationUpdateOutput( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + static_assert(ImageDims == 0, "ImageDims == 0 only atm"); + + dim3 threads(128); + // auto prop = getCurrentDeviceProperties(); + if (!train) { + dim3 blocks(ceil(input.getSize(1), 128), input.getSize(0)); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << 
threads.z; + BatchNormalizationUpdateOutputInferenceUnrolled_kernel + + <<>> + (input, output, runningMean, runningStddev, weight, bias); + } else { + dim3 blocks(ceil(input.getSize(1), 128)); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + BatchNormalizationUpdateOutput_kernel + <<>>(input, + output, + centered, + std, + normalized, + runningMean, + runningStddev, + weight, + bias, + epsilon, + momentum); + } + +} + +extern "C" void BatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine) +{ + // The BatchNormalization lua module is designed for + // 2-D only: batch, plane + constexpr int BatchDims = 2; + constexpr int ImageDims = 0; + typedef double ComputeT; + if (!train) { + if (!affine) { + // Collapse + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + // Collapse + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } else { + if (!affine) { + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + BatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void BatchNormalizationUpdateGradInput_kernel( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight) { + + static_assert(std::is_same::value , "type"); + + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= gradOutput.getSize(1)) { + return; + } + + ComputeT norm = (ComputeT)1 / gradInput.getSize(0); + ComputeT gradMean = (ComputeT)0; + ComputeT centeredGradMean = (ComputeT)0; + for (auto batch = 0; batch < gradOutput.getSize(0); ++batch) { + ComputeT g = gradOutput[batch][x].ldg(); + ComputeT c = centered[batch][x].ldg(); + gradMean += g; + centeredGradMean += c * g; + } + gradMean *= norm; + centeredGradMean 
*= norm; + + ComputeT stdVal = std[x]; + ComputeT weightVal = (ComputeT)0; + if (affine) { + weightVal = weight[x]; + } + for (auto batch = 0; batch < gradOutput.getSize(0); ++batch) { + if (affine) { + gradInput[batch][x] = + ( + - centeredGradMean * centered[batch][x] * stdVal * stdVal + + gradOutput[batch][x] + - gradMean + ) * stdVal * weightVal; + } else { + gradInput[batch][x] = + ( + - centeredGradMean * centered[batch][x] * stdVal * stdVal + + gradOutput[batch][x] + - gradMean + ) * stdVal; + } + } +} + +template +void BatchNormalizationUpdateGradInput( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + static_assert(ImageDims == 0, "ImageDims == 0 only atm"); + + dim3 blocks(ceil(gradOutput.getSize(1), 128)); + dim3 threads(128); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + BatchNormalizationUpdateGradInput_kernel + <<>>(gradInput, + gradOutput, + centered, + std, + weight); +} + +extern "C" void BatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine) { + + // The BatchNormalization lua module is designed for + // 2-D only: batch, plane + constexpr int BatchDims = 2; + constexpr int ImageDims = 0; + typedef double ComputeT; + if (!affine) { + // Collapse + BatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + DeviceTensor(), + THCState_getCurrentStream(state) + ); + } else { + // Collapse + BatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, weight), + THCState_getCurrentStream(state) + ); + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void BatchNormalizationAccGradParameters_kernel( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale) +{ + + static_assert(std::is_same::value , "type"); + + auto x = blockIdx.x * blockDim.x + threadIdx.x; + if (x >= gradOutput.getSize(1)) { + return; + } + + ComputeT gradMean = (ComputeT)0; + ComputeT normalizedGradMean = (ComputeT)0; + for (auto batch = 0; batch < gradOutput.getSize(0); ++batch) { + ComputeT g = gradOutput[batch][x].ldg(); + ComputeT n = normalized[batch][x].ldg(); + gradMean += g; + normalizedGradMean += n * g; + } + gradBias[x] += scale * gradMean; + gradWeight[x] += scale * normalizedGradMean; +} + +template +void BatchNormalizationAccGradParameters( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + static_assert(ImageDims == 0, "ImageDims == 0 only atm"); + + dim3 blocks(ceil(gradOutput.getSize(1), 128)); + dim3 threads(128); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + BatchNormalizationAccGradParameters_kernel + <<>>(gradOutput, + normalized, + gradWeight, + gradBias, + scale); + +} + 
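For reference (an editorial gloss, not part of the patch): with m the batch size, g = gradOutput, and std holding the reciprocal standard deviation s = 1/sqrt(sigma^2 + epsilon) as the comments above note, the kernels in this new file implement the standard batch-norm equations:

  \hat{x}_i = (x_i - \mu)\,s, \qquad y_i = \gamma\,\hat{x}_i + \beta \quad (\text{or } y_i = \hat{x}_i \text{ when affine is false})

  \frac{\partial L}{\partial x_i} = \gamma\,s\Big(g_i - \tfrac{1}{m}\sum_j g_j - \hat{x}_i\,\tfrac{1}{m}\sum_j g_j\,\hat{x}_j\Big)

  \Delta\gamma \mathrel{+}= \mathrm{scale}\sum_i g_i\,\hat{x}_i, \qquad \Delta\beta \mathrel{+}= \mathrm{scale}\sum_i g_i

with the gamma factor dropped from the gradInput expression in the non-affine case, and running statistics blended as (1 - momentum) * running + momentum * batch (note that runningStddev stores the reciprocal standard deviation).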
+extern "C" void BatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale) { + // The BatchNormalization lua module is designed for + // 2-D only: batch, plane + constexpr int BatchDims = 2; + constexpr int ImageDims = 0; + typedef double ComputeT; + // Collapse + BatchNormalizationAccGradParameters + + ( + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, gradWeight), + torchToDeviceTensor(state, gradBias), + scale, + THCState_getCurrentStream(state) + ); + + THCudaCheck(cudaGetLastError()); +} + + +}}} diff --git a/src/ConvolutionBias.cu b/src/ConvolutionBias.cu index f738b9d..37ebb6e 100644 --- a/src/ConvolutionBias.cu +++ b/src/ConvolutionBias.cu @@ -1,19 +1,19 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "ConvolutionBias.cuh" +#include "src/ConvolutionBias.cuh" #include "cuda/ComputeCapabilities.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" #include "cuda/WarpReductions.cuh" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" #include #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; // This layer computes the following: // @@ -252,4 +252,31 @@ accGradParametersTemporalBias(THCState* state, 0, THCState_getCurrentStream(state)>>>(gradBias, output, biasScale); } + +extern "C" void updateOutputBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* biasTH) { + updateOutputBias(state, outputTH, biasTH); +} + +extern "C" void updateOutputTemporalBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* biasTH) { + updateOutputTemporalBias(state, outputTH, biasTH); +} + +extern "C" void accGradParametersBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* gradBiasTH, + float biasScale) { + accGradParametersBias(state, outputTH, gradBiasTH, biasScale); +} + +extern "C" void accGradParametersTemporalBiasFFI(THCState* state, + THCudaTensor* outputTH, + THCudaTensor* gradBiasTH, + float biasScale) { + accGradParametersTemporalBias(state, outputTH, gradBiasTH, biasScale); +} + } } } } // namespace diff --git a/src/CrossMapNormalization.cu b/src/CrossMapNormalization.cu index 9a69f7d..db53866 100644 --- a/src/CrossMapNormalization.cu +++ b/src/CrossMapNormalization.cu @@ -3,7 +3,7 @@ * @author Tudor Bosman (tudorb@fb.com) */ -#include "CrossMapNormalization.cuh" +#include "src/CrossMapNormalization.cuh" namespace facebook { namespace deeplearning { namespace torch { diff --git a/src/CrossMapNormalizationHost.cpp b/src/CrossMapNormalizationHost.cpp index 535b1b6..43ea2d4 100644 --- a/src/CrossMapNormalizationHost.cpp +++ b/src/CrossMapNormalizationHost.cpp @@ -4,8 +4,8 @@ */ #include "THC.h" -#include "CrossMapNormalization.cuh" -#include "Utils.h" +#include "src/CrossMapNormalization.cuh" +#include "src/Utils.h" #include #include diff --git a/src/CuBLASWrapper.cpp b/src/CuBLASWrapper.cpp index 4339eb2..f5f0065 100644 --- a/src/CuBLASWrapper.cpp +++ b/src/CuBLASWrapper.cpp @@ -1,10 +1,10 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "CuBLASWrapper.h" +#include "src/CuBLASWrapper.h" #include "cuda/DeviceTensor.cuh" #include "THCTensor.h" -#include "BLASParameters.h" +#include "src/BLASParameters.h" #include #include @@ -33,8 +33,9 @@ const cuFloatComplex kOneComplex = make_cuComplex(1.0f, 0.0f); template void transpose(const DeviceTensor& in, DeviceTensor& out, - int sep, + int separator, bool asComplex, + bool transposeMetaData, cublasHandle_t handle, cudaStream_t stream) { cublasHandle_t localHandle; @@ -55,18 +56,35 @@ void transpose(const DeviceTensor& in, CHECK_EQ(true, in.isContiguousDim(i)) << "Not contiguous dim = " << i; CHECK_EQ(true, out.isContiguousDim(i)) << "Not contiguous dim = " << i; } - for (int i = 0; i < Dim; ++i) { - CHECK_EQ(in.getSize(i), out.getSize(i)) << - "Not eq dim = " << i << " in = " << in << " out = " << out; + if (transposeMetaData) { + for (int i = 0; i < Dim; ++i) { + CHECK_EQ(in.getSize(i), out.getSize(i)) << + "Not eq dim = " << i << " in = " << in << " out = " << out; + } + } else { + auto upper = (asComplex) ? Dim - 2 : Dim - 1; + if (!asComplex) { + for (int i = 0; i < separator; ++i) { + CHECK_EQ(in.getSize(i), out.getSize(Dim - separator + i)) << + "Not eq dim, in(" << i << ") = " << in << " out(" << + (Dim - separator + i) << ") = " << out; + } + for (int i = separator; i < upper; ++i) { + CHECK_EQ(in.getSize(i), out.getSize(i - separator)) << + "Not eq dim, in(" << i << ") = " << in << " out(" << + (i - separator) << ") = " << out; + } + } } + int rows = 1; - for (int i = 0; i < sep; ++i) { + for (int i = 0; i < separator; ++i) { rows *= in.getSize(i); } int cols = 1; - for (int i = sep; i < Dim; ++i) { + for (int i = separator; i < Dim; ++i) { cols *= in.getSize(i); } @@ -122,32 +140,33 @@ void transpose(const DeviceTensor& in, } CHECK_EQ(CUBLAS_STATUS_SUCCESS, res); - // Permute the sizes to keep the CudaTensor consistent. - // This only works because all dims are contiguous. - std::vector permDims; - permDims.reserve(Dim); - if (!asComplex) { - // Non-complex case is easy - for (int i = sep; i < Dim; ++i) { - permDims.push_back(i); + if (transposeMetaData) { + // Permute the sizes to keep the CudaTensor consistent. + // This only works because all dims are contiguous. 
+ std::vector permDims; + permDims.reserve(Dim); + if (!asComplex) { + // Non-complex case is easy + for (int i = separator; i < Dim; ++i) { + permDims.push_back(i); + } + for (int i = 0; i < separator; ++i) { + permDims.push_back(i); + } + } else { + // Complex case is trickier since it is float[2] that must stay in + // horizontal order whatever happens + for (int i = separator; i < Dim - 1; ++i) { + permDims.push_back(i); + } + for (int i = 0; i < separator; ++i) { + permDims.push_back(i); + } + permDims.push_back(Dim - 1); } - for (int i = 0; i < sep; ++i) { - permDims.push_back(i); - } - } else { - // Complex case is trickier since it is float[2] that must stay in - // horizontal order whatever happens - for (int i = sep; i < Dim - 1; ++i) { - permDims.push_back(i); - } - for (int i = 0; i < sep; ++i) { - permDims.push_back(i); - } - permDims.push_back(Dim - 1); + out.permuteDims(permDims); } - out.permuteDims(permDims); - THCudaCheck(cudaGetLastError()); CHECK_EQ(CUBLAS_STATUS_SUCCESS, res); } @@ -155,25 +174,28 @@ void transpose(const DeviceTensor& in, template void transposeAsComplex(const DeviceTensor& in, DeviceTensor& out, - int sep, + int separator, + bool transposeMetaData, cublasHandle_t handle, cudaStream_t stream) { - transpose(in, out, sep, true, handle, stream); + transpose(in, out, separator, true, transposeMetaData, handle, stream); } #define TRANSPOSE_INSTANTIATION(DIM) \ template void transpose(const DeviceTensor& in, \ DeviceTensor& out, \ - int sep, \ + int separator, \ bool asComplex, \ + bool transposeMetaData, \ cublasHandle_t handle, \ cudaStream_t stream); -#define TRANSPOSE_AS_COMPLEX_INSTANTIATION(DIM) \ +#define TRANSPOSE_AS_COMPLEX_INSTANTIATION(DIM) \ template void transposeAsComplex(const DeviceTensor& in, \ - DeviceTensor& out, \ - int sep, \ - cublasHandle_t handle, \ + DeviceTensor& out, \ + int separator, \ + bool transposeMetaData, \ + cublasHandle_t handle, \ cudaStream_t stream); TRANSPOSE_INSTANTIATION(2); @@ -516,11 +538,11 @@ struct matmultBatchedStruct { #define BATCHEDMM_TAIL_INSTANTIATION(DIM1, DIM2) \ template <> \ struct matmultBatchedStruct { \ - void run(DeviceTensor& C, \ - DeviceTensor& A, \ - DeviceTensor& B, \ + void run(DeviceTensor& C, \ + DeviceTensor& A, \ + DeviceTensor& B, \ const BLASParameters& params) { \ - throw invalid_argument("BatchedMM needs at least 3 dimensions"); \ + THError("BatchedMM needs at least 3 dimensions"); \ } \ } \ @@ -586,7 +608,7 @@ void matmultBatched(DeviceTensor& C, matmultBatchedStruct().run(C, A, B, params); break; default: - throw invalid_argument("At most 2 outer sequential dimensions supported"); + THError("At most 2 outer sequential dimensions supported"); }; } @@ -628,15 +650,15 @@ struct matmultIterStruct { } }; -#define ITERATEDMM_TAIL_INSTANTIATION(DIM1, DIM2) \ - template <> \ - struct matmultIterStruct { \ +#define ITERATEDMM_TAIL_INSTANTIATION(DIM1, DIM2) \ + template <> \ + struct matmultIterStruct { \ void run(DeviceTensor& C, \ DeviceTensor& A, \ DeviceTensor& B, \ - const BLASParameters& params) { \ - CHECK(false) << "Should not be here"; \ - } \ + const BLASParameters& params) { \ + CHECK(false) << "Should not be here"; \ + } \ }; ITERATEDMM_TAIL_INSTANTIATION(3, 1); @@ -676,15 +698,14 @@ void matmultIter(DeviceTensor& C, break; default: - throw invalid_argument( - "At most 2 outer sequential and 2 batch dimensions supported"); + THError("At most 2 outer sequential and 2 batch dimensions supported"); }; } #define MATMULT_ITER_INSTANTIATION(DIM) \ - template void 
matmultIter(DeviceTensor& C, \ - DeviceTensor& A, \ - DeviceTensor& B, \ + template void matmultIter(DeviceTensor& C, \ + DeviceTensor& A, \ + DeviceTensor& B, \ const BLASParameters& params); MATMULT_ITER_INSTANTIATION(2); diff --git a/src/CuBLASWrapper.h b/src/CuBLASWrapper.h index 8df3763..434251c 100644 --- a/src/CuBLASWrapper.h +++ b/src/CuBLASWrapper.h @@ -3,7 +3,7 @@ #include "cuda/DeviceTensor.cuh" -#include "BLASParameters.h" +#include "src/BLASParameters.h" #include "cublas_v2.h" #include @@ -15,9 +15,10 @@ namespace facebook { namespace deeplearning { namespace torch { // // This transposition wrapper implements quick device-side transpositions. // Consider tensor dimensions are collapsed into a 2-D 'y'-by-'x'. -// The wrapper takes a sep integer and considers dimensions (0 .. sep - 1) as -// being collapsed to form the 'y' dimension. Dimensions (sep .. Dim - 1) -// are collapsed to form the 'x' dimension. +// The wrapper takes a separator integer and considers dimensions +// (0 .. separator - 1) as being collapsed to form the 'y' +// dimension. Dimensions (separator .. Dim - 1) are collapsed to form the 'x' +// dimension. // // The complex case is a bit trickier since Torch does not natively support // complex numbers, we emulate them with float[2]. In that case, 'x' is @@ -33,15 +34,17 @@ namespace facebook { namespace deeplearning { namespace torch { template void transpose(const cuda::DeviceTensor& in, cuda::DeviceTensor& out, - int sep, + int separator, bool asComplex = false, + bool transposeMetaData = true, cublasHandle_t handle = NULL, cudaStream_t stream = NULL); template void transposeAsComplex(const cuda::DeviceTensor& in, cuda::DeviceTensor& out, - int sep, + int separator, + bool transposeMetaData = true, cublasHandle_t handle = NULL, cudaStream_t stream = NULL); diff --git a/src/CuBLASWrapperLua.cpp b/src/CuBLASWrapperLua.cpp index 8133ee3..f544b1f 100644 --- a/src/CuBLASWrapperLua.cpp +++ b/src/CuBLASWrapperLua.cpp @@ -1,12 +1,12 @@ // Copyright 2014 Facebook #include "cuda/KernelTimer.h" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" #include "THCTensor.h" -#include "CuBLASWrapper.h" -#include "util/Misc.h" +#include "src/CuBLASWrapper.h" #include #include @@ -17,30 +17,33 @@ #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; using namespace std; namespace facebook { namespace deeplearning { namespace torch { namespace { -#define MATMULT_CASE(DIM) \ - case DIM: \ - CHECK_EQ(DIM, iterDims + batchDims + 2); \ - { \ - DeviceTensor A = torchToDeviceTensor(state, thA); \ - DeviceTensor B = torchToDeviceTensor(state, thB); \ - DeviceTensor C = torchToDeviceTensor(state, thC); \ - matmultIter(C, A, B, params); \ - } \ +#define LOG_TARGET VLOG(3) + +#define MATMULT_CASE(DIM) \ + case DIM: \ + CHECK_EQ(DIM, iterDims + batchDims + 2 + ((asComplex) ? 
1 : 0)); \ + { \ + DeviceTensor A = torchToDeviceTensor(state, thA); \ + DeviceTensor B = torchToDeviceTensor(state, thB); \ + DeviceTensor C = torchToDeviceTensor(state, thC); \ + matmultIter(C, A, B, params); \ + } \ break; int matmult(lua_State* L, bool asComplex = false) { THCState* state = getCutorchState(L); + auto transA = luaT_getfieldcheckstring(L, 1, "transA"); + auto transB = luaT_getfieldcheckstring(L, 1, "transB"); auto iterDims = luaT_getfieldcheckint(L, 1, "iterDims"); auto batchDims = luaT_getfieldcheckint(L, 1, "batchDims"); - auto numHandles = luaT_getfieldcheckint(L, 1, "handles"); - auto numStreams = luaT_getfieldcheckint(L, 1, "streams"); + auto scale = luaT_getfieldchecknumber(L, 1, "scale"); + auto timed = luaT_getfieldcheckboolean(L, 1, "timed"); auto thA = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); auto thB = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); auto thC = (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); @@ -52,26 +55,28 @@ int matmult(lua_State* L, bool asComplex = false) { CHECK_EQ(THCudaTensor_nDimension(state, thC), THCudaTensor_nDimension(state, thB)); + int device; + THCudaCheck(cudaGetDevice(&device)); + std::vector handles; - for (auto i = 0; i < numHandles; ++i) { - handles.push_back(cublasHandle_t()); - cublasCreate(&(handles.back())); + // Skip NULL handle + for (auto i = 1; i <= THCState_getNumBlasHandles(state); ++i) { + handles.push_back(THCState_getDeviceBlasHandle(state, device, i)); } std::vector streams; - for (auto i = 0; i < numStreams; ++i) { - streams.push_back(cudaStream_t()); - cudaStreamCreate(&(streams.back())); + // Skip default stream + for (auto i = 1; i <= THCState_getNumStreams(state); ++i) { + streams.push_back(THCState_getDeviceStream(state, device, i)); } - auto time = 0.0f; - constexpr long kNumTrials = 5; int dims = THCudaTensor_nDimension(state, thA); BLASParameters p; auto& params = p.withIterDims(iterDims).withBatchDims(batchDims). - withComplex(asComplex).withHandles(handles).withStreams(streams); - for (int i = 0; i < kNumTrials; ++i) { - cuda::KernelTimer timer; + withComplex(asComplex).withHandles(handles).withStreams(streams). 
+ withTransposeA(transA[0]).withTransposeB(transB[0]).withScaleReal(scale); + + if (!timed) { switch (dims) { MATMULT_CASE(2); MATMULT_CASE(3); @@ -79,39 +84,54 @@ int matmult(lua_State* L, bool asComplex = false) { MATMULT_CASE(5); MATMULT_CASE(6); default: - throw invalid_argument("Unsupported dims"); + THError("GEMM Unsupported dims"); }; - auto timeMS = timer.stop(); - if (i > 0) { - time += timeMS; + } else { + auto time = 0.0f; + constexpr long kNumTrials = 5; + for (int i = 0; i < kNumTrials; ++i) { + cuda::KernelTimer timer; + switch (dims) { + MATMULT_CASE(2); + MATMULT_CASE(3); + MATMULT_CASE(4); + MATMULT_CASE(5); + MATMULT_CASE(6); + default: + THError("GEMM Unsupported dims"); + }; + auto timeMS = timer.stop(); + if (i > 0) { + time += timeMS; + } } - } - time /= kNumTrials - 1; + time /= kNumTrials - 1; - long iters = 1; - for (int i = 0; i < iterDims; ++i) { - iters *= THCudaTensor_size(state, thA, i); - } - long batch = 1; - for (int i = iterDims; i < iterDims + batchDims; ++i) { - batch *= THCudaTensor_size(state, thA, i); - } + long iters = 1; + for (int i = 0; i < iterDims; ++i) { + iters *= THCudaTensor_size(state, thA, i); + } + long batch = 1; + for (int i = iterDims; i < iterDims + batchDims; ++i) { + batch *= THCudaTensor_size(state, thA, i); + } - auto GOut = (THCudaTensor_size(state, thC, 0) * - THCudaTensor_stride(state, thC, 0) * - THCudaTensor_size(state, thA, dims - 1)) / - 1e9; - LOG(INFO) << folly::format( - " Running mxm ({}x{}x{}): {} iterations (parallel over streams)," \ - " {} batches, GReductions(virtual fmas)/s = {:.5f}" \ - " time = {:.2f}ms", - THCudaTensor_size(state, thC, dims - 2), - THCudaTensor_size(state, thC, dims - 1), - THCudaTensor_size(state, thA, dims - 1), - iters, - batch, - (GOut / time) * 1e3, - time).str(); + auto GOut = (THCudaTensor_size(state, thC, 0) * + THCudaTensor_stride(state, thC, 0) * + THCudaTensor_size(state, thA, dims - 1)) / + 1e9; + LOG_TARGET << folly::format( + " Running mxm ({}x{}x{}): {} iterations (parallel over streams)," \ + " {} batches, GReductions(virtual fmas)/s = {:.5f}" \ + " time = {:.2f}ms", + THCudaTensor_size(state, thC, (asComplex) ? dims - 3 : dims - 2), + THCudaTensor_size(state, thC, (asComplex) ? dims - 2 : dims - 1), + THCudaTensor_size(state, thA, (asComplex) ? 
dims - 2 : dims - 1), + iters, + batch, + (GOut / time) * 1e3, + time).str(); + } return 0; } @@ -124,9 +144,64 @@ int matmultComplex(lua_State* L) { return matmult(L, true); } +#define TRANSPOSE_CASE(DIM) \ + if (dim == DIM) { \ + DeviceTensor A = torchToDeviceTensor(state, thA); \ + DeviceTensor tA = torchToDeviceTensor(state, thB); \ + facebook::deeplearning::torch::transpose( \ + A, tA, separator, asComplex, transposeMetaData, handle, stream); \ + if (transposeMetaData) { \ + /* Also transpose the metadata */ \ + for (auto i = 0; i < dim; ++i) { \ + thB->size[i] = tA.getSize(i); \ + thB->stride[i] = tA.getStride(i); \ + } \ + } \ + done = true; \ + } + +int transpose(lua_State* L, bool asComplex = false) { + THCState* state = getCutorchState(L); + auto separator = luaT_getfieldcheckint(L, 1, "separator"); + auto transposeMetaData = luaT_getfieldcheckboolean(L, 1, "transposeMetaData"); + auto handleIndex = luaT_getfieldcheckint(L, 1, "handle"); + auto streamIndex = luaT_getfieldcheckint(L, 1, "stream"); + auto thA = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); + auto thB = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); + int dim = THCudaTensor_nDimension(state, thA); + + CHECK_EQ(THCudaTensor_nDimension(state, thA), + THCudaTensor_nDimension(state, thB)); + + int device; + THCudaCheck(cudaGetDevice(&device)); + + auto handle = THCState_getDeviceBlasHandle(state, device, handleIndex); + auto stream = THCState_getDeviceStream(state, device, streamIndex); + + auto done = false; + TRANSPOSE_CASE(2); + TRANSPOSE_CASE(3); + TRANSPOSE_CASE(4); + TRANSPOSE_CASE(5); + if (!done) { THError("Transpose Unsupported dims"); } + + return 0; +} + +int transpose(lua_State* L) { + return transpose(L, false); +} + +int transposeComplex(lua_State* L) { + return transpose(L, true); +} + const luaL_Reg functions[] = { {"CuBLASWrapper_matmult", matmult}, {"CuBLASWrapper_matmultComplex", matmultComplex}, + {"CuBLASWrapper_transpose", transpose}, + {"CuBLASWrapper_transposeComplex", transposeComplex}, {nullptr, nullptr}, }; diff --git a/src/CudaTensorUtils.cpp b/src/CudaTensorUtils.cpp index 295f722..0edaa4b 100644 --- a/src/CudaTensorUtils.cpp +++ b/src/CudaTensorUtils.cpp @@ -1,5 +1,5 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
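The transpose wrapper and its new Lua bindings above collapse dimensions [0, separator) into rows and [separator, Dim) into columns, then hand a single 2-D transpose to cuBLAS; in the real (non-complex) case with transposeMetaData == false, the destination tensor is expected to already carry the swapped sizes. A standalone sketch of just that collapsing arithmetic, with made-up shapes:

  #include <array>
  #include <cstdio>

  int main() {
    const std::array<int, 4> sizes = {2, 3, 4, 5};  // hypothetical 4-D tensor
    const int separator = 2;
    int rows = 1, cols = 1;
    for (int i = 0; i < separator; ++i) rows *= sizes[i];  // 2 * 3 = 6
    for (int i = separator; i < 4; ++i) cols *= sizes[i];  // 4 * 5 = 20
    // The wrapper transposes this 6 x 20 view, so the result is laid out as
    // 20 x 6, i.e. a tensor of sizes {4, 5, 2, 3}.
    std::printf("%d x %d -> %d x %d\n", rows, cols, cols, rows);
    return 0;
  }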
-#include "CudaTensorUtils.h" +#include "src/CudaTensorUtils.h" #include "THC.h" using namespace std; @@ -107,10 +107,10 @@ makeAliasedTHCudaTensorFull(THCState* state, } auto szTH = LongStorage::wrap( - makeMutable(LongRange(sizesTH, sizes.size()))).moveAsTH(); + folly::Range(sizesTH, sizes.size())).moveAsTH(); SCOPE_EXIT { THLongStorage_free(szTH); }; auto strTH = LongStorage::wrap( - makeMutable(LongRange(stridesTH, sizes.size()))).moveAsTH(); + folly::Range(stridesTH, sizes.size())).moveAsTH(); SCOPE_EXIT { THLongStorage_free(strTH); }; auto tensor = THCudaTensor_newWithStorage( @@ -148,9 +148,9 @@ Tensor copyFromCuda(THCState* state, const THCudaTensor* ctensor) { return Tensor( Storage(dataTH), tensor->storageOffset, LongStorage::wrap( - makeMutable(LongRange(tensor->size, tensor->nDimension))), + folly::Range(tensor->size, tensor->nDimension)), LongStorage::wrap( - makeMutable(LongRange(tensor->stride, tensor->nDimension)))); + folly::Range(tensor->stride, tensor->nDimension))); } unique_ptr diff --git a/src/CudaTensorUtils.h b/src/CudaTensorUtils.h index c3b8ec8..016c0cc 100644 --- a/src/CudaTensorUtils.h +++ b/src/CudaTensorUtils.h @@ -2,7 +2,7 @@ #pragma once #include "THCTensor.h" -#include "folly/Optional.h" +#include #include "thpp/Tensor.h" #include diff --git a/src/DeviceTensorUtils.h b/src/DeviceTensorUtils.h index c74f15b..e4207f3 100644 --- a/src/DeviceTensorUtils.h +++ b/src/DeviceTensorUtils.h @@ -51,4 +51,4 @@ torchToDeviceTensorCast(THCState* state, THCudaTensor* t) { } } } // namespace -#include "DeviceTensorUtils-inl.h" +#include "src/DeviceTensorUtils-inl.h" diff --git a/src/FeatureLPPooling.cu b/src/FeatureLPPooling.cu index 02806b3..73e1480 100644 --- a/src/FeatureLPPooling.cu +++ b/src/FeatureLPPooling.cu @@ -1,11 +1,11 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "FeatureLPPooling.cuh" +#include "src/FeatureLPPooling.cuh" #include "cuda/DeviceTensor.cuh" #include "cuda/CudaStaticAssert.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/RegisterUtils.cuh" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" #include "THC.h" #include @@ -349,7 +349,7 @@ runFeatureLPPoolingUpdateOutput(cudaStream_t stream, DeviceTensor& output, float power, int width, int stride) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); const int outputFeatures = ((input.getSize(1) - width) / stride) + 1; assert(input.getSize(0) == output.getSize(0)); @@ -442,7 +442,7 @@ runFeatureLPPoolingUpdateGradInput(cudaStream_t stream, DeviceTensor& gradInput, float power, int width, int stride) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); for (int i = 0; i < 4; ++i) { assert(gradOutput.getSize(i) == output.getSize(i)); @@ -463,7 +463,7 @@ runFeatureLPPoolingUpdateGradInput(cudaStream_t stream, // Different threads are potentially adding into overlapping input // points, so we must clear out gradInput before continuing. - gradInput.fillAsync(0.0f, stream); + gradInput.zero(); // Split non-features among threads and grid x int totalNonFeatureSize = input.getSize(2) * input.getSize(3); diff --git a/src/FeatureLPPoolingHost.cpp b/src/FeatureLPPoolingHost.cpp index e1d3a30..f7fb493 100644 --- a/src/FeatureLPPoolingHost.cpp +++ b/src/FeatureLPPoolingHost.cpp @@ -1,10 +1,10 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
#include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" -#include "FeatureLPPooling.cuh" +#include "src/FeatureLPPooling.cuh" #include #include @@ -264,7 +264,7 @@ int featureLPPooling_updateGradInput(lua_State *L) { gradOutput = *gradOutputUpcast; output = *outputUpcast; - if (!output.isSameSizeAndStride(gradOutput)) { + if (!output.isSameSize(gradOutput)) { luaL_error(L, "output and gradOutput sizes do not match"); } diff --git a/src/HSMHost.cpp b/src/HSMHost.cpp index 671fab3..2537e43 100644 --- a/src/HSMHost.cpp +++ b/src/HSMHost.cpp @@ -3,7 +3,7 @@ * @author Michael Mathieu (myrhev@fb.com) */ -#include "Utils.h" +#include "src/Utils.h" #include #include #include "THC.h" diff --git a/src/HalfPrec.cpp b/src/HalfPrec.cpp index 673119e..bd6a802 100644 --- a/src/HalfPrec.cpp +++ b/src/HalfPrec.cpp @@ -1,13 +1,13 @@ // Copyright 2004-, Facebook, Inc. All Rights Reserved. -#include "HalfPrec.h" +#include "src/HalfPrec.h" #include #include #include -#include "Utils.h" -#include "Tensor.h" -#include "LuaUtils.h" +#include "src/Utils.h" +#include "src/Tensor.h" +#include "src/LuaUtils.h" #include "THC.h" using namespace std; diff --git a/src/HalfPrecKernels.cu b/src/HalfPrecKernels.cu index 8c13c6e..9c03878 100644 --- a/src/HalfPrecKernels.cu +++ b/src/HalfPrecKernels.cu @@ -4,10 +4,10 @@ #include #include -#include "HalfPrec.h" -#include "util/Transform.cuh" +#include "src/HalfPrec.h" +#include "src/util/Transform.cuh" -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; void halfprec_ToHalf(cudaStream_t stream, const float* input, half_t* output, diff --git a/src/HalfPrecTest.cpp b/src/HalfPrecTest.cpp index 18155c1..c2c2899 100644 --- a/src/HalfPrecTest.cpp +++ b/src/HalfPrecTest.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/src/InitCuda.cpp b/src/InitCuda.cpp index d8246e9..99bdd68 100644 --- a/src/InitCuda.cpp +++ b/src/InitCuda.cpp @@ -5,6 +5,11 @@ #include +#ifdef FB_INTERNAL +#else +#define LUAOPEN(x) luaopen_fbcunn_cuda_ext(x) +#endif + namespace facebook { namespace deeplearning { namespace torch { void initCrossMapNormalizationCuda(lua_State* L); @@ -17,14 +22,15 @@ void initOneBitQuantizationCuda(lua_State* L); void initSparseNLLCriterionCuda(lua_State* L); void initFeatureLPPoolingCuda(lua_State* L); void initCuBLASWrapper(lua_State *L); -void initFFTWrapper(lua_State *L); -void initSpatialConvolutionCuFFT(lua_State *L); +// void initFFTWrapper(lua_State *L); +// void initSpatialConvolutionCuFFT(lua_State *L); +void initWeightedLookupTableCuda(lua_State *L); }}} // namespace using namespace facebook::deeplearning::torch; -extern "C" int luaopen_libfbcunnlayers(lua_State* L) { +extern "C" int LUAOPEN(lua_State* L) { initCrossMapNormalizationCuda(L); initLocallyConnectedCuda(L); initLookupTableGPUCuda(L); @@ -35,8 +41,9 @@ extern "C" int luaopen_libfbcunnlayers(lua_State* L) { initSparseNLLCriterionCuda(L); initFeatureLPPoolingCuda(L); initCuBLASWrapper(L); - initFFTWrapper(L); - initSpatialConvolutionCuFFT(L); + // initFFTWrapper(L); + // initSpatialConvolutionCuFFT(L); + initWeightedLookupTableCuda(L); return 0; } diff --git a/src/LocallyConnected.cuh b/src/LocallyConnected.cuh index c913d2d..99baab8 100644 --- a/src/LocallyConnected.cuh +++ b/src/LocallyConnected.cuh @@ -4,7 +4,7 @@ #pragma once #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include namespace facebook { 
namespace deeplearning { namespace torch { diff --git a/src/LocallyConnectedHost.cpp b/src/LocallyConnectedHost.cpp index 40459c6..0b775f8 100644 --- a/src/LocallyConnectedHost.cpp +++ b/src/LocallyConnectedHost.cpp @@ -4,7 +4,7 @@ */ #include "THC.h" -#include "Utils.h" +#include "src/Utils.h" #include "LocallyConnected.cuh" #include #include @@ -77,14 +77,6 @@ void initializeParams(THCState* state, } } -void narrowTensors(THCState* state, - THCudaTensor* in, THCudaTensor* in1, - THCudaTensor* out, THCudaTensor* out1, - int index, int size) { - THCudaTensor_narrow(state, in1, in, 0, index, size); - THCudaTensor_narrow(state, out1, out, 0, index, size); -} - // Updates a cache in cuda layout. // // The input tensor is in standard Torch layout and the resulting diff --git a/src/LookupTableGPUHost.cpp b/src/LookupTableGPUHost.cpp index 3eaa5e1..b9c7115 100644 --- a/src/LookupTableGPUHost.cpp +++ b/src/LookupTableGPUHost.cpp @@ -4,8 +4,8 @@ */ #include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" #include diff --git a/src/MM.cu b/src/MM.cu index 78df619..56de949 100644 --- a/src/MM.cu +++ b/src/MM.cu @@ -1,33 +1,76 @@ // Copyright 2004-present Facebook. All Rights Reserved. +#include "DeviceTensorUtils.h" +#include "THCTensor.h" + #include "cuda/DeviceTensor.cuh" #include "cuda/MM.cuh" + using namespace facebook::cuda; namespace facebook { namespace deeplearning { namespace torch { -template +template + void transposeMM(DeviceTensor& A, DeviceTensor& B, DeviceTensor& C, float invNorm, cudaStream_t s = 0) { - facebook::cuda::transposeMM( - A, B, C, invNorm, s); + facebook::cuda::transposeMM + ( + A, B, C, invNorm, s); } -#define INSTANTIATE_TRANSPOSE_MM(DIM, CONJA, CONJB) \ - template void transposeMM( \ - DeviceTensor& A, \ - DeviceTensor& B, \ - DeviceTensor& C, \ - float invNorm, \ +#define INSTANTIATE_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ + template void transposeMM( \ + DeviceTensor& A, \ + DeviceTensor& B, \ + DeviceTensor& C, \ + float invNorm, \ cudaStream_t s); -INSTANTIATE_TRANSPOSE_MM(5, true, false); -INSTANTIATE_TRANSPOSE_MM(5, false, true); -INSTANTIATE_TRANSPOSE_MM(5, false, false); +INSTANTIATE_TRANSPOSE_MM(5, true, false, true); +INSTANTIATE_TRANSPOSE_MM(5, false, true, true); +INSTANTIATE_TRANSPOSE_MM(5, false, false, true); +INSTANTIATE_TRANSPOSE_MM(5, true, false, false); +INSTANTIATE_TRANSPOSE_MM(5, false, true, false); +INSTANTIATE_TRANSPOSE_MM(5, false, false, false); + +#define CALL_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ + if (THCudaTensor_nDimension(state, tA) == DIM && \ + conjugateTransposeA == CONJA && \ + conjugateTransposeB == CONJB && \ + accumulate == ACC) { \ + DeviceTensor A = torchToDeviceTensor(state, tA); \ + DeviceTensor B = torchToDeviceTensor(state, tB); \ + DeviceTensor C = torchToDeviceTensor(state, tC); \ + facebook::deeplearning::torch::transposeMM( \ + A, B, C, invNorm, THCState_getCurrentStream(state)); \ + return; \ + } + +extern "C" void transposeMMFFI(THCState* state, + THCudaTensor* tA, + THCudaTensor* tB, + THCudaTensor* tC, + float invNorm, + bool conjugateTransposeA, + bool conjugateTransposeB, + bool accumulate) { + CHECK_EQ(THCudaTensor_nDimension(state, tA), + THCudaTensor_nDimension(state, tB)); + CHECK_EQ(THCudaTensor_nDimension(state, tA), + THCudaTensor_nDimension(state, tC)); + + CALL_TRANSPOSE_MM(5, true, false, true); + CALL_TRANSPOSE_MM(5, false, true, true); + CALL_TRANSPOSE_MM(5, false, false, true); + 
CALL_TRANSPOSE_MM(5, true, false, false); + CALL_TRANSPOSE_MM(5, false, true, false); + CALL_TRANSPOSE_MM(5, false, false, false); +} #undef INSTANTIATE_TRANSPOSE_MM diff --git a/src/MM.h b/src/MM.h index 1b43a18..1dd43f5 100644 --- a/src/MM.h +++ b/src/MM.h @@ -8,7 +8,8 @@ namespace facebook { namespace deeplearning { namespace torch { -template +template + void transposeMM(facebook::cuda::DeviceTensor& A, facebook::cuda::DeviceTensor& B, facebook::cuda::DeviceTensor& C, diff --git a/src/OneBitQuantization.cu b/src/OneBitQuantization.cu index 1ac6149..a5e6953 100644 --- a/src/OneBitQuantization.cu +++ b/src/OneBitQuantization.cu @@ -1,6 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "OneBitQuantization.cuh" +#include "src/OneBitQuantization.cuh" #include "cuda/ComputeCapabilities.cuh" #include "cuda/CudaDebugUtils.cuh" diff --git a/src/OneBitQuantizationHost.cpp b/src/OneBitQuantizationHost.cpp index 9b56eb9..37f395f 100644 --- a/src/OneBitQuantizationHost.cpp +++ b/src/OneBitQuantizationHost.cpp @@ -1,11 +1,11 @@ // Copyright 2014 Facebook #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" -#include "Utils.h" +#include "src/DeviceTensorUtils.h" +#include "src/Utils.h" #include "THC.h" #include "THCTensor.h" -#include "OneBitQuantization.cuh" +#include "src/OneBitQuantization.cuh" #include #include diff --git a/src/SparseNLLCriterion.cu b/src/SparseNLLCriterion.cu index c381e20..05870ea 100644 --- a/src/SparseNLLCriterion.cu +++ b/src/SparseNLLCriterion.cu @@ -5,7 +5,7 @@ #include "cuda/CudaUtils.cuh" #include "cuda/WarpReductions.cuh" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" #include "SparseNLLCriterion.cuh" @@ -76,7 +76,7 @@ void runSparseNLLCriterion_updateOutput( DeviceTensor& output) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); const int maxThreads = deviceProperties.maxThreadsPerBlock; const int batchSize = targetP.getSize(0); @@ -97,7 +97,7 @@ void runSparseNLLCriterion_updateGradInput( DeviceTensor& gradInput) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); const int batchSize = targetP.getSize(0); const int K = targetP.getSize(1); diff --git a/src/SparseNLLCriterionHost.cpp b/src/SparseNLLCriterionHost.cpp index 4e888e2..9ae80f4 100644 --- a/src/SparseNLLCriterionHost.cpp +++ b/src/SparseNLLCriterionHost.cpp @@ -1,11 +1,11 @@ // Copyright 2014 Facebook #include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" #include "THCTensor.h" -#include "SparseNLLCriterion.cuh" +#include "src/SparseNLLCriterion.cuh" #include #include diff --git a/src/SpatialBatchNormalization.cu b/src/SpatialBatchNormalization.cu new file mode 100644 index 0000000..743b693 --- /dev/null +++ b/src/SpatialBatchNormalization.cu @@ -0,0 +1,791 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
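transposeMMFFI in the MM.cu hunk above bridges a runtime FFI call to compile-time template instantiations through a macro-generated if-chain, so each (conjA, conjB, accumulate) combination gets its own compiled kernel path. A self-contained sketch of that dispatch pattern; the names here are illustrative, not part of fbcunn:

  #include <cstdio>

  template <bool ConjA, bool ConjB, bool Accumulate>
  void runKernel() {
    std::printf("conjA=%d conjB=%d accumulate=%d\n", ConjA, ConjB, Accumulate);
  }

  // Each expansion compares the runtime flags against one combination and,
  // on a match, calls the matching full specialization.
  #define DISPATCH(CA, CB, ACC)                            \
    if (conjA == CA && conjB == CB && accumulate == ACC)   \
      return runKernel<CA, CB, ACC>();

  void dispatch(bool conjA, bool conjB, bool accumulate) {
    DISPATCH(true,  false, true);
    DISPATCH(false, true,  true);
    DISPATCH(false, false, true);
    DISPATCH(true,  false, false);
    DISPATCH(false, true,  false);
    DISPATCH(false, false, false);
  }

  int main() { dispatch(false, true, false); return 0; }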
+ +#include "src/DeviceTensorUtils.h" +#include "THCTensor.h" + +#include "cuda/CudaUtils.cuh" +#include "cuda/DeviceTensor.cuh" +#include "cuda/MemoryAccess.cuh" +#include "cuda/util/CachedDeviceProperties.h" + +#define ENABLE_CUDA_DEBUG +#include "cuda/CudaDebugUtils.cuh" + +#include +#include + +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +#define LOG_TARGET VLOG(1) // LOG(INFO) + +template +__global__ void SpatialBatchNormalizationUpdateOutputInferenceUnrolled_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias) { + + static_assert(std::is_same::value , "type"); + + auto x = threadIdx.x; + auto y = threadIdx.y; + auto plane = blockIdx.x; + auto batch = blockIdx.y; + + // stddev is actually 1 / stddev + auto stddev = runningStddev[plane].ldg(); + auto mean = runningMean[plane].ldg(); + auto inp = input[batch][plane][y][x].ldg(); + if (affine) { + // multiply with gamma and add beta + // TODO: everyone pulling this, optimize by reusing better + auto beta = bias[plane].ldg(); + auto gamma = weight[plane].ldg(); + output[batch][plane][y][x] = gamma * (inp - mean) * (stddev) + beta; + } else { + output[batch][plane][y][x] = (inp - mean) * (stddev); + } +} + +template +__global__ void SpatialBatchNormalizationUpdateOutputInference_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias) { + + static_assert(std::is_same::value , "type"); + + auto x = threadIdx.x; + auto plane = blockIdx.x; + auto batch = blockIdx.y; + + // stddev is actually 1 / stddev + auto stddev = runningStddev[plane].ldg(); + auto mean = runningMean[plane].ldg(); + T beta, gamma; + if (affine) { + beta = bias[plane].ldg(); + gamma = weight[plane].ldg(); + } + + for (auto y = threadIdx.y; y < output.getSize(2); y += blockDim.y) { + auto inp = input[batch][plane][y][x].ldg(); + if (affine) { + // multiply with gamma and add beta + // TODO: everyone pulling this, optimize by reusing better + output[batch][plane][y][x] = gamma * (inp - mean) * (stddev) + beta; + } else { + output[batch][plane][y][x] = (inp - mean) * (stddev); + } + } + +} + +template +__global__ void SpatialBatchNormalizationUpdateOutput_kernel( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum) { + + static_assert(std::is_same::value , "type"); + + // Assert powers of 2 for proper intra-warp shuffle reduction + assert(blockDim.x == NumThreads); + assert(blockDim.y == NumThreads); + static_assert((NumThreads & (NumThreads - 1)) == 0, + "NumThreads must be a power of 2 for proper warp shuffling"); + auto plane = blockIdx.x; + auto numBatches = input.getSize(0); + + auto norm = (T)0; + if (threadIdx.y == 0) { + norm = input.getSize(0) * input.getSize(2) * input.getSize(3); + norm = (T)1 / norm; + } + + // 1. 
Compute the mean across (batch, y, x), save it and update the + // runningMean with momentum + auto batchMeanGlobal = (T)0; + for (int y = threadIdx.y; y < input.getSize(2); y += NumThreads) { + auto batchMeanLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < input.getSize(3); x += NumThreads) { + auto inp = (inBounds(y, x, input)) ? + input[batch][plane][y][x].ldg() : 0.0f; + batchMeanLocal += inp; + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchMeanLocal += __shfl_xor(batchMeanLocal, 1 << i, NumThreads); + } + // thread 0 has it + batchMeanGlobal += batchMeanLocal; + } + + __shared__ T shared[NumThreads]; + // thx == 0 stores into smem + if (threadIdx.x == 0) { + shared[threadIdx.y] = batchMeanGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto batchMeanLocal = shared[threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchMeanLocal += __shfl_xor(batchMeanLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. + batchMeanGlobal = batchMeanLocal * norm; + // Save the non momentum-altered version to share with everyone + shared[threadIdx.x] = batchMeanGlobal; + } + __syncthreads(); + + // Everyone picks it up + batchMeanGlobal = shared[threadIdx.x]; + if (threadIdx.y == 0 && threadIdx.x == 0) { + // Momentum based writeback + runningMean[plane] = + (1 - momentum) * runningMean[plane] + momentum * batchMeanGlobal; + } + + + // 2. Compute the stddev across (batch, y, x), + // save it + // update the runningStddev with momentum + // save a copy + // All threads have the batchMean now, compute the stddev + auto batchStddevGlobal = (T)0; + for (int y = threadIdx.y; y < input.getSize(2); y += NumThreads) { + auto batchStddevLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < input.getSize(3); x += NumThreads) { + auto inp = 0.0f; + if (inBounds(y, x, input)) { + inp = input[batch][plane][y][x].ldg(); + batchStddevLocal += + (inp - batchMeanGlobal) * (inp - batchMeanGlobal); + centered[batch][plane][y][x] = inp - batchMeanGlobal; + } + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchStddevLocal += __shfl_xor(batchStddevLocal, 1 << i, NumThreads); + } + // thread 0 has it + batchStddevGlobal += batchStddevLocal; + } + + // thx == 0 stores into smem, reuse the same smem region, be sure to kill + // WAR / WAW dependences even if they are extremely unlikely. + __syncthreads(); + if (threadIdx.x == 0) { + shared[threadIdx.y] = batchStddevGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto batchStddevLocal = shared[threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + batchStddevLocal += __shfl_xor(batchStddevLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. 
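// Editor's sketch (not part of the patch): the reduction idiom used throughout
// these kernels is a __shfl_xor butterfly. After log2(width) exchanges every
// lane of the warp holds the full sum, which is why the code can stage one
// value per warp through shared memory and reuse it without a broadcast step.
// Assumes a warp width of 32 and the pre-CUDA-9 __shfl_xor intrinsic that this
// patch targets (sm_35 era); compile with nvcc.
//
//   #include <cstdio>
//
//   __device__ float warpAllReduceSum(float val, int width) {
//     for (int offset = 1; offset < width; offset <<= 1) {
//       val += __shfl_xor(val, offset, width);   // butterfly exchange
//     }
//     return val;                                // every lane holds the sum
//   }
//
//   __global__ void demo(float* out) {
//     float v = static_cast<float>(threadIdx.x); // lane id as the value
//     out[threadIdx.x] = warpAllReduceSum(v, 32);
//   }
//
//   int main() {
//     float* d = nullptr;
//     cudaMalloc(&d, 32 * sizeof(float));
//     demo<<<1, 32>>>(d);
//     float h[32];
//     cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
//     std::printf("lane 0 sum = %f (expected 496)\n", h[0]);
//     cudaFree(d);
//     return 0;
//   }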
+ batchStddevLocal *= norm; + batchStddevGlobal = 1 / sqrt(batchStddevLocal + epsilon); + // Save the non momentum-altered version to share with everyone + shared[threadIdx.x] = batchStddevGlobal; + } + __syncthreads(); + + // Everyone picks it up + batchStddevGlobal = shared[threadIdx.x]; + // Momentum based writeback + if (threadIdx.y == 0 && threadIdx.x == 0) { + std[plane] = batchStddevGlobal; + runningStddev[plane] = + (1 - momentum) * runningStddev[plane] + momentum * batchStddevGlobal; + } + + // Write normalized and update the output + auto beta = bias[plane]; + auto gamma = weight[plane]; + for (int y = threadIdx.y; y < input.getSize(2); y += NumThreads) { + for (int x = threadIdx.x; x < input.getSize(3); x += NumThreads) { + if(inBounds(y, x, output)) { + for (auto batch = 0; batch < numBatches; ++batch) { + auto inp = input[batch][plane][y][x].ldg(); + normalized[batch][plane][y][x] = + (inp - batchMeanGlobal) * (batchStddevGlobal); + if (affine) { + // multiply with gamma and add beta + output[batch][plane][y][x] = + gamma * (inp - batchMeanGlobal) * (batchStddevGlobal) + beta; + } else { + output[batch][plane][y][x] = + (inp - batchMeanGlobal) * (batchStddevGlobal); + } + } + } + } + } + +} + + +template +void SpatialBatchNormalizationUpdateOutput( + const DeviceTensor input, + DeviceTensor output, + DeviceTensor centered, + DeviceTensor std, + DeviceTensor normalized, + DeviceTensor runningMean, + DeviceTensor runningStddev, + const DeviceTensor weight, + const DeviceTensor bias, + T epsilon, + T momentum, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + + auto prop = getCurrentDeviceProperties(); + if (!train) { + if (input.getSize(3) * input.getSize(2) < prop.maxThreadsPerBlock) { + dim3 blocks(input.getSize(1), input.getSize(0)); + dim3 threads(input.getSize(3), input.getSize(2)); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutputInferenceUnrolled_kernel + + <<>> + (input, output, runningMean, runningStddev, weight, bias); + } else { + CHECK_GE(prop.maxThreadsPerBlock, input.getSize(3)) << + "Need a rolled version across both threadIdx.x and y"; + dim3 blocks(input.getSize(1), + input.getSize(0)); + dim3 threads(input.getSize(3), + min(input.getSize(2), + floor(prop.maxThreadsPerBlock, input.getSize(3))) + ); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutputInference_kernel + + <<>> + (input, output, runningMean, runningStddev, weight, bias); + } + } else { + dim3 blocks(input.getSize(1)); + if (input.getSize(3) >= 16 && input.getSize(2) >= 16) { + dim3 threads(16, 16); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutput_kernel + + <<>>(input, + output, + centered, + std, + normalized, + runningMean, + runningStddev, + weight, + bias, + epsilon, + momentum); + } else { + dim3 threads(8, 8); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateOutput_kernel + + <<>>(input, + output, + centered, + std, + normalized, + runningMean, + runningStddev, + weight, + bias, + epsilon, + momentum); + } + } + +} + +extern "C" void SpatialBatchNormalizationUpdateOutputFFI( + THCState* state, + THCudaTensor* 
input, + THCudaTensor* output, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* normalized, + THCudaTensor* runningMean, + THCudaTensor* runningStddev, + THCudaTensor* weight, + THCudaTensor* bias, + float epsilon, + float momentum, + bool train, + bool affine) +{ + // The SpatialBatchNormalization lua module is designed for + // 4-D only: batch, plane, y, x + constexpr int BatchDims = 2; + constexpr int ImageDims = 2; + typedef double ComputeT; + if (!train) { + if (!affine) { + // Collapse + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + // Collapse + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + DeviceTensor(), + DeviceTensor(), + DeviceTensor(), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } else { + if (!affine) { + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + DeviceTensor(), + DeviceTensor(), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } else { + SpatialBatchNormalizationUpdateOutput + + ( + torchToDeviceTensor(state, input), + torchToDeviceTensor(state, output), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, runningMean), + torchToDeviceTensor(state, runningStddev), + torchToDeviceTensor(state, weight), + torchToDeviceTensor(state, bias), + epsilon, + momentum, + THCState_getCurrentStream(state) + ); + } + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void SpatialBatchNormalizationUpdateGradInput_kernel( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight) { + + static_assert(std::is_same::value , "type"); + + // Assert powers of 2 for proper intra-warp shuffle reduction + assert(blockDim.x == NumThreads); + assert(blockDim.y == NumThreads); + static_assert((NumThreads & (NumThreads - 1)) == 0, + "NumThreads must be a power of 2 for proper warp shuffling"); + auto plane = blockIdx.x; + auto numBatches = gradInput.getSize(0); + + auto norm = (T)0; + if (threadIdx.y == 0) { + norm = gradInput.getSize(0) * gradInput.getSize(2) * gradInput.getSize(3); + norm = (T)1 / norm; + } + + // 1. Compute means across (batch, y, x) + auto gradMeanGlobal = (T)0; + auto centeredGradMeanGlobal = (T)0; + for (int y = threadIdx.y; y < gradInput.getSize(2); y += NumThreads) { + auto gradMeanLocal = (T)0; + auto centeredGradMeanLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < gradInput.getSize(3); x += NumThreads) { + auto g = (inBounds(y, x, gradOutput)) ? + gradOutput[batch][plane][y][x].ldg() : 0.0f; + auto c = (inBounds(y, x, centered)) ? 
+ centered[batch][plane][y][x].ldg() : 0.0f; + gradMeanLocal += g; + centeredGradMeanLocal += c * g; + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + centeredGradMeanLocal += + __shfl_xor(centeredGradMeanLocal, 1 << i, NumThreads); + } + // thread 0 has it + gradMeanGlobal += gradMeanLocal; + centeredGradMeanGlobal += centeredGradMeanLocal; + } + + __shared__ T shared[2][NumThreads]; + // thx == 0 stores into smem + if (threadIdx.x == 0) { + shared[0][threadIdx.y] = gradMeanGlobal; + shared[1][threadIdx.y] = centeredGradMeanGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto gradMeanLocal = shared[0][threadIdx.x]; + auto centeredGradMeanLocal = shared[1][threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + centeredGradMeanLocal += + __shfl_xor(centeredGradMeanLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. + gradMeanGlobal = gradMeanLocal * norm; + centeredGradMeanGlobal = centeredGradMeanLocal * norm; + // Save the non momentum-altered version to share with everyone + shared[0][threadIdx.x] = gradMeanGlobal; + shared[1][threadIdx.x] = centeredGradMeanGlobal; + } + __syncthreads(); + + // Everyone picks it up, should be broadcast into the whole gradInput + gradMeanGlobal = shared[0][threadIdx.x]; + centeredGradMeanGlobal = shared[1][threadIdx.x]; + + auto stdVal = std[plane]; + for (int y = threadIdx.y; y < gradInput.getSize(2); y += NumThreads) { + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < gradInput.getSize(3); x += NumThreads) { + if (affine) { + gradInput[batch][plane][y][x] = + ( - centeredGradMeanGlobal * + centered[batch][plane][y][x] * + stdVal * + stdVal + + + gradOutput[batch][plane][y][x] + - + gradMeanGlobal + ) + * stdVal * weight[plane]; + } else { + gradInput[batch][plane][y][x] = + ( - centeredGradMeanGlobal * + centered[batch][plane][y][x] * + stdVal * + stdVal + + + gradOutput[batch][plane][y][x] + - + gradMeanGlobal + ) + * stdVal; + } + } + } + } + +} + +template +void SpatialBatchNormalizationUpdateGradInput( + DeviceTensor gradInput, + const DeviceTensor gradOutput, + DeviceTensor centered, + DeviceTensor std, + const DeviceTensor weight, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + + dim3 blocks(gradInput.getSize(1)); + if (gradInput.getSize(3) >= 16 && gradInput.getSize(2) >= 16) { + dim3 threads(16, 16); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateGradInput_kernel + + <<>>(gradInput, + gradOutput, + centered, + std, + weight); + } else { + dim3 threads(8, 8); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationUpdateGradInput_kernel + + <<>>(gradInput, + gradOutput, + centered, + std, + weight); + } + +} + +extern "C" void SpatialBatchNormalizationUpdateGradInputFFI( + THCState* state, + THCudaTensor* gradInput, + THCudaTensor* gradOutput, + THCudaTensor* centered, + THCudaTensor* std, + THCudaTensor* weight, + bool affine) { + + // The SpatialBatchNormalization lua module is designed for + // 4-D only: batch, plane, y, x + constexpr 
int BatchDims = 2; + constexpr int ImageDims = 2; + typedef double ComputeT; + if (!affine) { + // Collapse + SpatialBatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + DeviceTensor(), + THCState_getCurrentStream(state) + ); + } else { + // Collapse + SpatialBatchNormalizationUpdateGradInput + + ( + torchToDeviceTensor(state, gradInput), + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, centered), + torchToDeviceTensor(state, std), + torchToDeviceTensor(state, weight), + THCState_getCurrentStream(state) + ); + } + + THCudaCheck(cudaGetLastError()); +} + + +template +__global__ void SpatialBatchNormalizationAccGradParameters_kernel( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale) +{ + + static_assert(std::is_same::value , "type"); + + // Assert powers of 2 for proper intra-warp shuffle reduction + assert(blockDim.x == NumThreads); + assert(blockDim.y == NumThreads); + static_assert((NumThreads & (NumThreads - 1)) == 0, + "NumThreads must be a power of 2 for proper warp shuffling"); + auto plane = blockIdx.x; + auto numBatches = gradOutput.getSize(0); + + // 1. Compute sums across (batch, y, x) + auto gradMeanGlobal = (T)0; + auto normalizedGradMeanGlobal = (T)0; + for (int y = threadIdx.y; y < gradOutput.getSize(2); y += NumThreads) { + auto gradMeanLocal = (T)0; + auto normalizedGradMeanLocal = (T)0; + for (auto batch = 0; batch < numBatches; ++batch) { + for (int x = threadIdx.x; x < gradOutput.getSize(3); x += NumThreads) { + auto g = (inBounds(y, x, gradOutput)) ? + gradOutput[batch][plane][y][x].ldg() : 0.0f; + auto n = (inBounds(y, x, normalized)) ? + normalized[batch][plane][y][x].ldg() : 0.0f; + gradMeanLocal += g; + normalizedGradMeanLocal += n * g; + } + } + // Reduce within warp + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + normalizedGradMeanLocal += + __shfl_xor(normalizedGradMeanLocal, 1 << i, NumThreads); + } + // thread 0 has it + gradMeanGlobal += gradMeanLocal; + normalizedGradMeanGlobal += normalizedGradMeanLocal; + } + + __shared__ T shared[2][NumThreads]; + // thx == 0 stores into smem + if (threadIdx.x == 0) { + shared[0][threadIdx.y] = gradMeanGlobal; + shared[1][threadIdx.y] = normalizedGradMeanGlobal; + } + + __syncthreads(); + // 'transpose', and reduce within warp again + if (threadIdx.y == 0) { + auto gradMeanLocal = shared[0][threadIdx.x]; + auto normalizedGradMeanLocal = shared[1][threadIdx.x]; + // Reduce within warp again + for (auto i = 0; i < getMSB(NumThreads); ++i) { + gradMeanLocal += + __shfl_xor(gradMeanLocal, 1 << i, NumThreads); + normalizedGradMeanLocal += + __shfl_xor(normalizedGradMeanLocal, 1 << i, NumThreads); + } + // We did an allreduce with xors, this should reduce contention on + // shared memory. 
+ gradMeanGlobal = gradMeanLocal; + normalizedGradMeanGlobal = normalizedGradMeanLocal; + + // thread 0 has it + if (threadIdx.x == 0) { + gradBias[plane] += scale * gradMeanGlobal; + gradWeight[plane] += scale * normalizedGradMeanGlobal; + } + } +} + +template +void SpatialBatchNormalizationAccGradParameters( + const DeviceTensor gradOutput, + const DeviceTensor normalized, + DeviceTensor gradWeight, + DeviceTensor gradBias, + T scale, + cudaStream_t s) +{ + static_assert(BatchDims == 2, "BatchDims == 2 only atm"); + + dim3 blocks(gradOutput.getSize(1)); + if (gradOutput.getSize(3) >= 16 && gradOutput.getSize(2) >= 16) { + dim3 threads(16, 16); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationAccGradParameters_kernel + <<>>(gradOutput, + normalized, + gradWeight, + gradBias, + scale); + } else { + dim3 threads(8, 8); + LOG_TARGET << blocks.x << " " << blocks.y << " " << blocks.z << " " + << threads.x << " " << threads.y << " " << threads.z; + SpatialBatchNormalizationAccGradParameters_kernel + <<>>(gradOutput, + normalized, + gradWeight, + gradBias, + scale); + } + +} + +extern "C" void SpatialBatchNormalizationAccGradParametersFFI( + THCState* state, + THCudaTensor* gradOutput, + THCudaTensor* normalized, + THCudaTensor* gradWeight, + THCudaTensor* gradBias, + float scale) { + // The SpatialBatchNormalization lua module is designed for + // 4-D only: batch, plane, y, x + constexpr int BatchDims = 2; + constexpr int ImageDims = 2; + typedef double ComputeT; + // Collapse + SpatialBatchNormalizationAccGradParameters + + ( + torchToDeviceTensor(state, gradOutput), + torchToDeviceTensor(state, normalized), + torchToDeviceTensor(state, gradWeight), + torchToDeviceTensor(state, gradBias), + scale, + THCState_getCurrentStream(state) + ); + + THCudaCheck(cudaGetLastError()); +} + + +}}} diff --git a/src/TemporalConvolutionFBHost.cpp b/src/TemporalConvolutionFBHost.cpp index 0339f8b..80ffd83 100644 --- a/src/TemporalConvolutionFBHost.cpp +++ b/src/TemporalConvolutionFBHost.cpp @@ -1,13 +1,13 @@ // Copyright 2014 Facebook #include "cuda/DeviceTensor.cuh" +#include "cuda/util/CachedDeviceProperties.h" #include "THC.h" #include "THCTensor.h" -#include "Utils.h" -#include "CuBLASWrapper.h" -#include "ConvolutionBias.cuh" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" +#include "src/Utils.h" +#include "src/CuBLASWrapper.h" +#include "src/ConvolutionBias.cuh" +#include "src/DeviceTensorUtils.h" #include #include diff --git a/src/TemporalKMaxPooling.cu b/src/TemporalKMaxPooling.cu index 6e42296..f64b2df 100644 --- a/src/TemporalKMaxPooling.cu +++ b/src/TemporalKMaxPooling.cu @@ -3,7 +3,7 @@ #include "cuda/DeviceTensor.cuh" #include "cuda/TopKElements.cuh" #include "cuda/DeviceTensor.cuh" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" #include "THC.h" using namespace facebook::cuda; @@ -88,7 +88,7 @@ runTemporalKMaxPoolingUpdateOutput(cudaStream_t stream, DeviceTensor& output, int k) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); // We aim to run with 4 warps. 
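
For readers checking the SpatialBatchNormalizationAccGradParameters kernel above, this is the per-plane quantity it accumulates, written as a plain host-side reference (a verification sketch only; layout is batch x plane x y x x, and the double accumulators mirror the ComputeT = double used by the FFI entry point):

    #include <cstddef>

    // gradBias[c]   += scale * sum_{b,y,x} gradOutput[b][c][y][x]
    // gradWeight[c] += scale * sum_{b,y,x} gradOutput[b][c][y][x] * normalized[b][c][y][x]
    void accGradParametersRef(const float* gradOutput, const float* normalized,
                              float* gradWeight, float* gradBias, float scale,
                              size_t B, size_t C, size_t H, size_t W) {
      for (size_t c = 0; c < C; ++c) {
        double gSum = 0.0, ngSum = 0.0;
        for (size_t b = 0; b < B; ++b) {
          for (size_t y = 0; y < H; ++y) {
            for (size_t x = 0; x < W; ++x) {
              const size_t i = ((b * C + c) * H + y) * W + x;
              gSum += gradOutput[i];
              ngSum += normalized[i] * gradOutput[i];
            }
          }
        }
        gradBias[c]   += scale * static_cast<float>(gSum);
        gradWeight[c] += scale * static_cast<float>(ngSum);
      }
    }
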
const int numWarps = std::min(input.getSize(2), 4); @@ -107,7 +107,7 @@ runTemporalKMaxPoolingUpdateGradInput(cudaStream_t stream, DeviceTensor& gradInput, int k) { const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); // We aim to run with 4 warps. const int numThreads = diff --git a/src/TemporalKMaxPoolingHost.cpp b/src/TemporalKMaxPoolingHost.cpp index 1ea1678..fe30821 100644 --- a/src/TemporalKMaxPoolingHost.cpp +++ b/src/TemporalKMaxPoolingHost.cpp @@ -1,14 +1,15 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include "cuda/DeviceTensor.cuh" -#include "Utils.h" -#include "DeviceTensorUtils.h" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" #include "THC.h" -#include "TemporalKMaxPooling.cuh" +#include "src/TemporalKMaxPooling.cuh" #include #include #include +#include using namespace facebook::cuda; @@ -17,12 +18,12 @@ namespace facebook { namespace deeplearning { namespace torch { namespace { int checkAndAdjustK(lua_State* L, int k, double kDynamic, long sequenceLength) { - if (kDynamic > 0) { - k = std::max(k, (int) (kDynamic * sequenceLength)); + if (kDynamic != -1) { + k = std::max(k, (int) (std::ceil(kDynamic * sequenceLength))); } if (k > sequenceLength) { - luaL_error(L, "k (%d) must be less than sequence length (%d) ", k, sequenceLength); + luaL_error(L, "k: k must be less than the sequence length"); } return k; diff --git a/src/TemporalMaxPooling.cu b/src/TemporalMaxPooling.cu index ac16412..1230278 100644 --- a/src/TemporalMaxPooling.cu +++ b/src/TemporalMaxPooling.cu @@ -1,12 +1,9 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" -#include "Utils.h" -#include "lua.h" -#include "luaT.h" -#include "THC.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" +#include "src/Utils.h" using namespace facebook::cuda; using namespace facebook::deeplearning::torch; @@ -141,7 +138,7 @@ static int fbcunn_TemporalMaxPooling_updateOutput(lua_State *L) { // be limited by smem or register count, so no need to use the // occupancy calculator. const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); dim3 block(min(input.getSize(2), deviceProperties.maxThreadsPerBlock)); dim3 grid(input.getSize(0), // batch size @@ -205,7 +202,7 @@ static int fbcunn_TemporalMaxPooling_updateGradInput(lua_State *L) { // be limited by smem or register count, so no need to use the // occupancy calculator. 
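
The repeated switch to cuda/util/CachedDeviceProperties.h replaces per-call property lookups with a cached query, which the pooling kernels above use to size launches (e.g. clamping a block to maxThreadsPerBlock). A minimal sketch of such a cache using only the standard CUDA runtime API; the function names here are illustrative, not the library's:

    #include <cuda_runtime.h>
    #include <stdexcept>
    #include <vector>

    // Query every device once, then hand out cached references.
    inline const cudaDeviceProp& cachedDeviceProperties(int device) {
      static const std::vector<cudaDeviceProp> props = [] {
        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess) { count = 0; }
        std::vector<cudaDeviceProp> p(count);
        for (int i = 0; i < count; ++i) {
          if (cudaGetDeviceProperties(&p[i], i) != cudaSuccess) {
            throw std::runtime_error("cudaGetDeviceProperties failed");
          }
        }
        return p;
      }();
      return props.at(device);
    }

    inline const cudaDeviceProp& currentDeviceProperties() {
      int device = 0;
      if (cudaGetDevice(&device) != cudaSuccess) {
        throw std::runtime_error("cudaGetDevice failed");
      }
      return cachedDeviceProperties(device);
    }
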
const cudaDeviceProp& deviceProperties = - facebook::CUDAUtil::getCurrentDeviceProperties(); + facebook::cuda::getCurrentDeviceProperties(); dim3 block(min(gradOutput.getSize(2), deviceProperties.maxThreadsPerBlock)); dim3 grid(gradOutput.getSize(0), // batch size diff --git a/src/WeightedLookupTable.cu b/src/WeightedLookupTable.cu new file mode 100644 index 0000000..730f90c --- /dev/null +++ b/src/WeightedLookupTable.cu @@ -0,0 +1,51 @@ +/** + * Copyright 2015 Facebook + */ + +#include "cuda/CudaUtils.cuh" +#include "cuda/DeviceTensor.cuh" +#include "cuda/WarpReductions.cuh" + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { +namespace detail { + +namespace { + +__global__ void scaleByWeight(DeviceTensor output, + DeviceTensor input, + DeviceTensor weights) { + // Values computed per thread + const int VT = 4; + + // Each block computes a 4x128 section of the output, with each + // warp handling a 1x128 section. + + int rowIdx = blockIdx.x * blockDim.y + threadIdx.y; + if (rowIdx < weights.getSize(0)) { + float weight = weights[rowIdx]; + + #pragma unroll + for (int i = 0; i < VT; i++) { + int colIdx = blockDim.x * (VT * blockIdx.y + i) + threadIdx.x; + if (colIdx < input.getSize(1)) { + output[rowIdx][colIdx] = input[rowIdx][colIdx] * weight; + } + } + } +} + +} + +void launchWeightedLookupTableScaleByWeightKernel(cudaStream_t stream, + DeviceTensor& output, + DeviceTensor& input, + DeviceTensor& weight) { + dim3 grid(cuda::ceil(output.getSize(0), 4), cuda::ceil(output.getSize(1), 128)); + dim3 block(32, 4); + + scaleByWeight<<>>(output, input, weight); +} + +}}}} diff --git a/src/WeightedLookupTableHost.cpp b/src/WeightedLookupTableHost.cpp new file mode 100644 index 0000000..84b8d08 --- /dev/null +++ b/src/WeightedLookupTableHost.cpp @@ -0,0 +1,58 @@ +/** + * Copyright 2015 Facebook + */ + +#include "cuda/DeviceTensor.cuh" +#include "src/Utils.h" +#include "src/DeviceTensorUtils.h" +#include "THC.h" + +#include +#include +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +namespace detail { +void launchWeightedLookupTableScaleByWeightKernel( + cudaStream_t stream, + DeviceTensor& output, + DeviceTensor& input, + DeviceTensor& weight); +} + +namespace { + +int scaleByWeight(lua_State* L) { + THCState* state = getCutorchState(L); + auto output = (THCudaTensor*)luaT_checkudata(L, 1, "torch.CudaTensor"); + const auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); + const auto weight = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); + + DeviceTensor cudaOutput = torchToDeviceTensor(state, output); + DeviceTensor cudaInput = torchToDeviceTensor(state, input); + DeviceTensor cudaWeight = torchToDeviceTensor(state, weight); + + detail::launchWeightedLookupTableScaleByWeightKernel( + THCState_getCurrentStream(state), + cudaOutput, cudaInput, cudaWeight); + + return 0; +} + +const luaL_Reg functions[] = { + {"WeightedLookupTable_scaleByWeight", scaleByWeight}, + {nullptr, nullptr}, +}; + +} // namespace + +void initWeightedLookupTableCuda(lua_State* L) { + luaT_pushmetatable(L, "torch.CudaTensor"); + luaT_registeratname(L, functions, "nn"); + lua_pop(L, 1); +} + +}}} // namespaces diff --git a/src/fft/CuFFTConvolution.cpp b/src/fft/CuFFTConvolution.cpp index be77a5a..5e670d7 100644 --- a/src/fft/CuFFTConvolution.cpp +++ b/src/fft/CuFFTConvolution.cpp @@ -1,17 +1,17 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
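
The launch geometry in launchWeightedLookupTableScaleByWeightKernel above reads as: a 32x4 block assigns one warp per row, each thread covers VT = 4 columns, so one block covers a 4-row by 128-column tile. A small host-side sketch of the same arithmetic, handy for checking boundary handling when sizes are not multiples of 4 or 128 (ceilDiv stands in for cuda::ceil; the sizes are hypothetical):

    #include <cstdio>

    static int ceilDiv(int a, int b) { return (a + b - 1) / b; }

    int main() {
      const int rows = 10, cols = 300;   // example table slice
      const int VT = 4;                  // columns per thread, as in the kernel
      const int blockX = 32, blockY = 4; // one warp per row, 4 rows per block

      const int gridX = ceilDiv(rows, blockY);       // row tiles
      const int gridY = ceilDiv(cols, blockX * VT);  // 128-column tiles

      std::printf("grid = %d x %d covers %d rows x %d cols (actual %d x %d)\n",
                  gridX, gridY, gridX * blockY, gridY * blockX * VT, rows, cols);
      return 0;
    }
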
-#include "CuFFTConvolution.cuh" +#include "src/fft/CuFFTConvolution.cuh" #include "THCTensor.h" #include "cuda/DeviceTensor.cuh" -#include "CuBLASWrapper.h" -#include "DeviceTensorUtils.h" -#include "MM.h" -#include "CuFFTStrategy.h" -#include "CuFFTWrapper.cuh" -#include "FBFFTHost.h" -#include "Utils.cuh" -#include "Utils.h" +#include "src/CuBLASWrapper.h" +#include "src/DeviceTensorUtils.h" +#include "src/MM.h" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/FBFFTHost.h" +#include "src/fft/Utils.cuh" +#include "src/fft/Utils.h" #include #include @@ -799,13 +799,13 @@ void CuFFTConvolution::CuFFTConvolutionMxM() { // A, B and C nomenclature is relative to cuBLAS' column-major. // In row-major fbmm, this is reversed. if (convPass_.pass == ConvolutionPass::kUpdateOutput) { - transposeMM<5, false, true>( + transposeMM<5, false, true, false>( BComplex_, AComplex_, CComplex_, norm_.x, getStream(0)); } else if (convPass_.pass == ConvolutionPass::kUpdateGradInput) { - transposeMM<5, false, false>( + transposeMM<5, false, false, false>( BComplex_, AComplex_, CComplex_, norm_.x, getStream(0)); } else if (convPass_.pass == ConvolutionPass::kAccGradParameters) { - transposeMM<5, true, false>( + transposeMM<5, true, false, false>( BComplex_, AComplex_, CComplex_, norm_.x, getStream(0)); } else { throw std::runtime_error("Invalid pass for CuFFTConvolution"); @@ -902,7 +902,7 @@ void CuFFTConvolution::run() { if (!strategy_->fbmm()) { // Transpose A_ (? ? y x) -> (y x ? ?) (row-major formulation) - transposeAsComplex(AComplex_, AComplexT_, 2, handle0, s0); + transposeAsComplex(AComplex_, AComplexT_, 2, true, handle0, s0); } auto handle1 = getCircular(cublasHandles_, 1); @@ -929,7 +929,7 @@ void CuFFTConvolution::run() { if (!strategy_->fbmm()) { // Transpose A_ (? ? y x) -> (y x ? ?) (row-major formulation) - transposeAsComplex(BComplex_, BComplexT_, 2, handle1, s1); + transposeAsComplex(BComplex_, BComplexT_, 2, true, handle1, s1); // Here, both CComplex_ and CComplexT_ contain garbage that we will // overwrite and that we preemptively size to (y x ? ?).. @@ -959,7 +959,7 @@ void CuFFTConvolution::run() { auto handle = getCircular(cublasHandles_, 0); // Transpose followed by IFFT in same stream s0 as the MxM // Transpose input (y x ? ?) -> (? ? y x) (row-major formulation) - transposeAsComplex(CComplexT_, CComplex_, 2, handle, s); + transposeAsComplex(CComplexT_, CComplex_, 2, true, handle, s); } if (strategy_->cufft()) { fft2d<2>(C_, CComplex_, FFTParameters().inverse().normalize(false), diff --git a/src/fft/CuFFTConvolution_AccGradParameters.cu b/src/fft/CuFFTConvolution_AccGradParameters.cu index ecb6216..6b5af65 100644 --- a/src/fft/CuFFTConvolution_AccGradParameters.cu +++ b/src/fft/CuFFTConvolution_AccGradParameters.cu @@ -1,16 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "CuFFTConvolution_AccGradParameters.cuh" +#include "src/fft/CuFFTConvolution_AccGradParameters.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "CuBLASWrapper.h" -#include "ConvolutionBias.cuh" -#include "CuFFTWrapper.cuh" -#include "CuFFTConvolution.cuh" -#include "Utils.cuh" +#include "src/CuBLASWrapper.h" +#include "src/ConvolutionBias.cuh" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/Utils.cuh" #include #include diff --git a/src/fft/CuFFTConvolution_UpdateGradInput.cu b/src/fft/CuFFTConvolution_UpdateGradInput.cu index b768bcc..fd0560f 100644 --- a/src/fft/CuFFTConvolution_UpdateGradInput.cu +++ b/src/fft/CuFFTConvolution_UpdateGradInput.cu @@ -1,15 +1,15 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "CuFFTConvolution_UpdateGradInput.cuh" +#include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "CuBLASWrapper.h" -#include "CuFFTWrapper.cuh" -#include "CuFFTConvolution.cuh" -#include "Utils.cuh" +#include "src/CuBLASWrapper.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/Utils.cuh" #include #include diff --git a/src/fft/CuFFTConvolution_UpdateOutput.cu b/src/fft/CuFFTConvolution_UpdateOutput.cu index 15c82fa..ae29e44 100644 --- a/src/fft/CuFFTConvolution_UpdateOutput.cu +++ b/src/fft/CuFFTConvolution_UpdateOutput.cu @@ -1,16 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "ConvolutionBias.cuh" -#include "CuBLASWrapper.h" -#include "CuFFTWrapper.cuh" -#include "CuFFTConvolution.cuh" -#include "Utils.cuh" +#include "src/ConvolutionBias.cuh" +#include "src/CuBLASWrapper.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/Utils.cuh" #include #include diff --git a/src/fft/CuFFTStrategy.h b/src/fft/CuFFTStrategy.h index 075eb25..263ee6e 100644 --- a/src/fft/CuFFTStrategy.h +++ b/src/fft/CuFFTStrategy.h @@ -2,8 +2,8 @@ #pragma once -#include "CuFFTConvolution.cuh" -#include "CuFFTWrapper.cuh" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/CuFFTWrapper.cuh" #include #include diff --git a/src/fft/CuFFTWrapper.cu b/src/fft/CuFFTWrapper.cu index a55d247..5709483 100644 --- a/src/fft/CuFFTWrapper.cu +++ b/src/fft/CuFFTWrapper.cu @@ -1,14 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "CuFFTWrapper.cuh" +#include "src/fft/CuFFTWrapper.cuh" -#include "cuda/DeviceTensor.cuh" #include "THCTensor.h" +#include "cuda/DeviceTensor.cuh" +#include "src/DeviceTensorUtils.h" #include #include #include #include +#include #include #include #include @@ -17,7 +19,7 @@ #include #include #include -#include +#include DEFINE_bool(fft_verbose, false, "Dump meta information for the FFT wrapper"); @@ -173,7 +175,7 @@ cufftHandle makeCuFFTPlan(const DeviceTensor& real, batchSize); } if (errFFT != CUFFT_SUCCESS) { - throw std::bad_alloc(); + THError("Could not allocate cufft plan properly!"); } return plan; @@ -202,7 +204,7 @@ void fft(DeviceTensor& real, real.template dataAs(), cplx.template dataAs()); if (errFFT != CUFFT_SUCCESS) { - throw std::bad_alloc(); + THError("Error running forward FFT!"); } DCHECK_EQ(errFFT, CUFFT_SUCCESS); } else { @@ -210,7 +212,7 @@ void fft(DeviceTensor& real, cplx.template dataAs(), real.template dataAs()); if (errFFT != CUFFT_SUCCESS) { - throw std::bad_alloc(); + THError("Error running inverse FFT!"); } DCHECK_EQ(errFFT, CUFFT_SUCCESS); @@ -223,7 +225,8 @@ void fft(DeviceTensor& real, DCHECK_LT(0, size) << "Negative size not supported !"; float val = 1 / (float)size; thrust::device_ptr res(real.data()); - thrust::transform(res, + thrust::transform(thrust::cuda::par.on(stream), + res, res + real.getSize(0) * real.getStride(0), res, CudaScaleFunctor(val)); @@ -317,4 +320,38 @@ template void fft<2, 5>(DeviceTensor& real, cufftHandle* plan, cudaStream_t stream); +#define INSTANTIATE_CUFFT_PLAN(BATCH_DIMS, REAL_TENSOR_DIM) \ + if (BATCH_DIMS == batchDimensions && \ + REAL_TENSOR_DIM == THCudaTensor_nDimension(state, realTH)) { \ + DeviceTensor real = \ + torchToDeviceTensor(state, realTH); \ + DeviceTensor cplx = \ + torchToDeviceTensor(state, cplxTH); \ + return makeCuFFTPlan(real, cplx, params); \ + } + +extern "C" +cufftHandle makeCuFFTPlanFFI(THCState* state, + THCudaTensor* realTH, + THCudaTensor* cplxTH, + bool direction, + bool normalize, + int fftVersion, + int batchDimensions) +{ + FFTParameters params = FFTParameters().normalize(normalize); + if (direction) params = params.forward(); + else params = params.inverse(); + if (fftVersion == 0) params = params.withCufft(); + else params = params.withFbfft(); + + // 1 and 2D plans atm with 1 or 2 batch dimensions + INSTANTIATE_CUFFT_PLAN(1, 2); + INSTANTIATE_CUFFT_PLAN(1, 3); + INSTANTIATE_CUFFT_PLAN(2, 3); + INSTANTIATE_CUFFT_PLAN(2, 4); + + return (cufftHandle)-1; +} + } } } // namespace diff --git a/src/fft/CuFFTWrapper.cuh b/src/fft/CuFFTWrapper.cuh index a6d069a..26a6699 100644 --- a/src/fft/CuFFTWrapper.cuh +++ b/src/fft/CuFFTWrapper.cuh @@ -2,8 +2,8 @@ #pragma once #include "cuda/DeviceTensor.cuh" -#include "cuda/fbfft/FBFFT.h" -#include "Utils.cuh" +#include "cuda/fbfft/FBFFT.cuh" +#include "src/fft/Utils.cuh" #include @@ -16,11 +16,12 @@ class FFTParameters { // Normalization occurs only in inverse FFT (by 1 / (M.N)) since CuFFT does // unnormalized FFTs by default FFTParameters() : - version(cufft), direction_(true), normalize_(true) {} + version(cufft), direction_(true), normalize_(true), padLeft_(0), padUp_(0) + {} operator facebook::cuda::fbfft::FBFFTParameters() const { facebook::cuda::fbfft::FBFFTParameters res; - res = res.normalize(normalize_); + res = res.normalize(normalize_).withPadLeft(padLeft_).withPadUp(padUp_); return (direction_) ? 
res.forward() : res.inverse(); } @@ -49,11 +50,23 @@ class FFTParameters { return *this; } + FFTParameters& withPadLeft(int p) { + padLeft_ = p; + return *this; + } + + FFTParameters& withPadUp(int p) { + padUp_ = p; + return *this; + } + bool forwardFFT() const { return direction_; } bool inverseFFT() const { return !direction_; } bool normalizeFFT() const { return normalize_; } bool cuFFT() const { return version == cufft; } bool fbFFT() const { return version == fbfft; } + int padLeft() const { return padLeft_; } + int padUp() const { return padUp_; } template std::vector makeComplexTensorSizes( @@ -99,6 +112,8 @@ class FFTParameters { private: bool direction_; bool normalize_; + int padLeft_; + int padUp_; }; template diff --git a/src/fft/FBFFTDevice.cu b/src/fft/FBFFTDevice.cu index 6a89eeb..a980d4d 100644 --- a/src/fft/FBFFTDevice.cu +++ b/src/fft/FBFFTDevice.cu @@ -1,8 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#pragma once - -#include "cuda/fbfft/FBFFT.h" +#include "cuda/fbfft/FBFFT.cuh" #include "cuda/fbfft/FBFFTCommon.cuh" namespace facebook { namespace cuda { namespace fbfft { @@ -11,12 +9,15 @@ template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft1D<1>( DeviceTensor& real, DeviceTensor& complex, + const int padL, cudaStream_t s); template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2D<1>( DeviceTensor& real, DeviceTensor& complex, + const int padL, + const int padU, cudaStream_t s); template @@ -29,6 +30,7 @@ template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft1D<1>( DeviceTensor& real, DeviceTensor& complex, + const int padL, cudaStream_t s); template @@ -41,6 +43,8 @@ template facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft2D<1>( DeviceTensor& srcComplex, DeviceTensor& realDst, + const int padL, + const int padU, cudaStream_t s); }}} diff --git a/src/fft/FBFFTHost.cpp b/src/fft/FBFFTHost.cpp index b8ac98f..85c6f8d 100644 --- a/src/fft/FBFFTHost.cpp +++ b/src/fft/FBFFTHost.cpp @@ -4,9 +4,9 @@ #include "cuda/Complex.cuh" #include "cuda/ComputeCapabilities.cuh" #include "cuda/DeviceTensor.cuh" -#include "cuda/fbfft/FBFFT.h" -#include "CuFFTWrapper.cuh" -#include "DeviceTensorUtils.h" +#include "cuda/fbfft/FBFFT.cuh" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/DeviceTensorUtils.h" #include #include @@ -23,9 +23,9 @@ FBFFTParameters::ErrorCode fbfft1dHost( FBFFTParameters params, cudaStream_t s) { if (params.forwardFFT()) { - return fbfft1D(real, complexAsFloat, s); + return fbfft1D(real, complexAsFloat, params.padLeft(), s); } else { - return fbifft1D(real, complexAsFloat, s); + return fbifft1D(real, complexAsFloat, params.padLeft(), s); } } @@ -83,7 +83,8 @@ FBFFTParameters::ErrorCode fbfft2dHost( FBFFTParameters::ErrorCode res; if (params.forwardFFT()) { - res = fbfft2D(real, bufferAsFloatTr, s); + res = fbfft2D( + real, bufferAsFloatTr, params.padLeft(), params.padUp(), s); } else { assert(real.getSize(0) == bufferAsFloat->getSize(0)); assert(complex.getSize(1) == @@ -99,13 +100,15 @@ FBFFTParameters::ErrorCode fbfft2dHost( if (params.forwardFFT()) { return fbfft2D(bufferTr, complex, s); } else { - return fbifft2D(buffer, real, s); + return fbifft2D(buffer, real, params.padLeft(), params.padUp(), s); } } else { if (params.forwardFFT()) { - return fbfft2D(real, complexAsFloat, s); + return fbfft2D( + real, complexAsFloat, params.padLeft(), params.padUp(), s); } else { - return fbifft2D(complex, real, s); + return fbifft2D( + complex, real, params.padLeft(), params.padUp(), s); } } @@ -154,12 +157,13 @@ 
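
The padLeft/padUp setters added to FFTParameters above keep the fluent, chainable style the wrapper already uses; a usage sketch (pad and direction values are illustrative only):

    // Build an inverse, unnormalized FBFFT configuration with implicit padding.
    FFTParameters p = FFTParameters()    // forward + normalize by default
                          .inverse()
                          .normalize(false)
                          .withFbfft()
                          .withPadLeft(2)
                          .withPadUp(2);
    // Converting p to FBFFTParameters carries the padding along, as the
    // operator FBFFTParameters() conversion above shows.
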
FBFFTParameters::ErrorCode fbfft(THCState* state, THCudaTensor* r, THCudaTensor* c, THCudaTensor* b, - FBFFTParameters params, - cudaStream_t s) { + FBFFTParameters params) { if (THCudaTensor_nDimension(state, r) - Batch == 1) { - return fbfft1dHost(state, r, c, params, s); + return fbfft1dHost( + state, r, c, params, THCState_getCurrentStream(state)); } else if (THCudaTensor_nDimension(state, r) - Batch == 2) { - return fbfft2dHost(state, r, c, b, params, s); + return fbfft2dHost( + state, r, c, b, params, THCState_getCurrentStream(state)); } return FBFFTParameters::UnsupportedDimension; } @@ -169,15 +173,13 @@ fbfft<1>(THCState* state, THCudaTensor* real, THCudaTensor* complex, THCudaTensor* buffer, - FBFFTParameters params, - cudaStream_t s); + FBFFTParameters params); template FBFFTParameters::ErrorCode fbfft<2>(THCState* state, THCudaTensor* real, THCudaTensor* complex, THCudaTensor* buffer, - FBFFTParameters params, - cudaStream_t s); + FBFFTParameters params); } } } // namespace diff --git a/src/fft/FBFFTHost.h b/src/fft/FBFFTHost.h index f055209..aa72862 100644 --- a/src/fft/FBFFTHost.h +++ b/src/fft/FBFFTHost.h @@ -33,7 +33,6 @@ facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft( THCudaTensor* complex, THCudaTensor* buffer = nullptr, facebook::cuda::fbfft::FBFFTParameters params = - facebook::cuda::fbfft::FBFFTParameters(), - cudaStream_t s = 0); + facebook::cuda::fbfft::FBFFTParameters()); } } } // namespace diff --git a/src/fft/FFTIteratedConvolution.cu b/src/fft/FFTIteratedConvolution.cu new file mode 100644 index 0000000..0523133 --- /dev/null +++ b/src/fft/FFTIteratedConvolution.cu @@ -0,0 +1,98 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "src/DeviceTensorUtils.h" +#include "THCTensor.h" + +#include "cuda/DeviceTensor.cuh" +#include "cuda/fbfft/FFTIteratedConvolution.cuh" + +#include +#include + +using namespace facebook::cuda; + +namespace facebook { namespace deeplearning { namespace torch { + +typedef struct { + THCudaTensor* tensor; + int padL; + int padU; +} TiledDeviceTensorFFI; + +#define LOG_TARGET LOG(INFO) + +#define INSTANTIATE_ITERATED_CONVOLUTION(DIM, FFT_SIZE) \ + if (THCudaTensor_nDimension(state, weight) == DIM && \ + fftSize == FFT_SIZE) { \ + thrust::host_vector > \ + tiledInputs; \ + thrust::host_vector > \ + tiledOutputs; \ + for (int i = 0; i < numTiles; ++i) { \ + DeviceTensor ti( \ + torchToDeviceTensor(state, input[i].tensor)); \ + fbfft::detail::TiledDeviceTensor inp( \ + ti, \ + input[i].padL, \ + input[i].padU); \ + /* TODO: emplace_back */ \ + tiledInputs.push_back(inp); \ + \ + DeviceTensor to( \ + torchToDeviceTensor(state, output[i].tensor)); \ + fbfft::detail::TiledDeviceTensor out( \ + to, \ + output[i].padL, \ + output[i].padU); \ + /* TODO: emplace_back */ \ + tiledOutputs.push_back(out); \ + } \ + \ + thrust::device_vector > \ + ins = tiledInputs; \ + thrust::device_vector > \ + outs = tiledOutputs; \ + \ + DeviceTensor wei( \ + torchToDeviceTensor(state, weight)); \ + bool res = \ + fbfft::detail::FFTIteratedConvolution( \ + thrust::raw_pointer_cast(&ins[0]), \ + thrust::raw_pointer_cast(&outs[0]), \ + wei, \ + pass, \ + scale, \ + batchSize, \ + ins.size(), \ + THCState_getCurrentStream(state)); \ + if (!res) { THError("Error in iterated convolution"); } \ + } + +extern "C" void convolveIteratedFFI(THCState* state, + TiledDeviceTensorFFI* input, + THCudaTensor* weight, + TiledDeviceTensorFFI* output, + int numTiles, + int fftSize, + fbfft::detail::FFTConvolutionPassFFI pass, + float scale) { + // TODO: 
accGrad all on same stream, updateOutput / updateGradInput async + int batchSize = THCudaTensor_size(state, input[0].tensor, 0); + + //////////////////////////////////////////////////////// + // FFT of size 32 + //////////////////////////////////////////////////////// + INSTANTIATE_ITERATED_CONVOLUTION(4, 32); + + //////////////////////////////////////////////////////// + // FFT of size 16 + //////////////////////////////////////////////////////// + INSTANTIATE_ITERATED_CONVOLUTION(4, 16); + + //////////////////////////////////////////////////////// + // FFT of size 8 + //////////////////////////////////////////////////////// + INSTANTIATE_ITERATED_CONVOLUTION(4, 8); +} + +}}} diff --git a/src/fft/FFTWrapperLua.cpp b/src/fft/FFTWrapperLua.cpp index f4ec016..6bea2b4 100644 --- a/src/fft/FFTWrapperLua.cpp +++ b/src/fft/FFTWrapperLua.cpp @@ -4,13 +4,12 @@ #include "THC.h" #include "THCTensor.h" -#include "cuda/fbfft/FBFFT.h" -#include "Utils.h" -#include "../Utils.h" -#include "CuFFTWrapper.cuh" -#include "FBFFTHost.h" -#include "DeviceTensorUtils.h" -#include "util/Misc.h" +#include "cuda/fbfft/FBFFT.cuh" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/Utils.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/FBFFTHost.h" +#include "src/DeviceTensorUtils.h" #include #include @@ -23,7 +22,6 @@ using namespace facebook::cuda; using namespace facebook::cuda::fbfft; -using namespace facebook::CUDAUtil; using namespace std; namespace facebook { namespace deeplearning { namespace torch { @@ -61,8 +59,8 @@ float timedRun(THCState* state, state, timeTHTensor, frequencyTHTensor, bufferTHTensor, (FBFFTParameters)p); if (result != FBFFTParameters::Success) { - throw std::invalid_argument(folly::format("FBFFT error: {}", - (int)result).str().c_str()); + THCudaCheck(cudaGetLastError()); + THError(folly::format("FBFFT error: {}", (int)result).str().c_str()); } auto timeMS = timer.stop(); return timeMS; @@ -70,49 +68,38 @@ float timedRun(THCState* state, return 0.0f; } -#define FFT_BATCH(BATCH) \ - case BATCH: \ - { \ - switch(dims) { \ - case 2: \ - time += timedRun(state, \ - timeTHTensor, \ - frequencyTHTensor, \ - bufferTHTensor, \ - p, \ - fftPlan); \ - break; \ - case 3: \ - time += timedRun(state, \ - timeTHTensor, \ - frequencyTHTensor, \ - bufferTHTensor, \ - p, \ - fftPlan); \ - break; \ - default: \ - throw invalid_argument("Unsupported dims + batchDims"); \ - } \ - } \ - break; - - - -int fftFun(lua_State* L, bool forward) { - THCState* state = getCutorchState(L); - bool dumpTimings = false; +#define TIMED_FFT(BATCH, DIM) \ + if (batchDims == BATCH && dims == DIM) { \ + time += timedRun(state, \ + timeTHTensor, \ + frequencyTHTensor, \ + bufferTHTensor, \ + p, \ + fftPlan); \ + done = true; \ + } +int runTimedFFT(lua_State* L, bool forward) { + THCState* state = getCutorchState(L); auto batchDims = luaT_getfieldcheckint(L, 1, "batchDims"); - auto cufft = luaT_getfieldcheckint(L, 1, "cufft"); + auto cufft = luaT_getfieldcheckboolean(L, 1, "cufft"); + auto padLeft = luaT_getfieldcheckint(L, 1, "padLeft"); + auto padUp = luaT_getfieldcheckint(L, 1, "padUp"); auto timeTHTensor = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); auto frequencyTHTensor = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); auto bufferTHTensor = (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); + if (THCudaTensor_nDimension(state, bufferTHTensor) == 0) { + bufferTHTensor = nullptr; + THAssert(THCudaTensor_checkGPU(state, 2, timeTHTensor, frequencyTHTensor)); + } else { + 
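
The INSTANTIATE_ITERATED_CONVOLUTION macro in FFTIteratedConvolution.cu above follows the common thrust pattern of building descriptor structs on the host and shipping them to the device in one copy. A stripped-down sketch of that pattern with a generic POD descriptor (MyDesc and useDescs are illustrative names, not part of the patch):

    #include <cstdio>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>

    struct MyDesc {     // POD descriptor, analogous to TiledDeviceTensor
      float* data;
      int padL;
      int padU;
    };

    __global__ void useDescs(const MyDesc* descs, int n) {
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        printf("first pad: %d %d of %d descriptors\n",
               descs[0].padL, descs[0].padU, n);
      }
    }

    void launch(const thrust::host_vector<MyDesc>& hostDescs, cudaStream_t stream) {
      // Assignment performs the host -> device copy.
      thrust::device_vector<MyDesc> deviceDescs = hostDescs;
      useDescs<<<1, 32, 0, stream>>>(
          thrust::raw_pointer_cast(deviceDescs.data()),
          (int)deviceDescs.size());
      // Keep deviceDescs alive until the kernel has finished reading it.
      cudaStreamSynchronize(stream);
    }
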
THAssert(THCudaTensor_checkGPU(state, 3, timeTHTensor, frequencyTHTensor, + bufferTHTensor)); + } + auto fftPlan = (cufftHandle)lua_tonumber(L, 5); - THAssert(THCudaTensor_checkGPU(state, 3, timeTHTensor, frequencyTHTensor, - bufferTHTensor)); CHECK_EQ(THCudaTensor_nDimension(state, timeTHTensor) + 1, THCudaTensor_nDimension(state, frequencyTHTensor)); @@ -124,27 +111,23 @@ int fftFun(lua_State* L, bool forward) { if (!forward) { p = p.inverse().normalize(false); } - if (cufft == 1) { + if (cufft) { p = p.withCufft(); } else { p = p.withFbfft(); } + p.withPadLeft(padLeft); + p.withPadUp(padUp); try { - cufftHandle fftPlan = -1; - SCOPE_EXIT{ - if (fftPlan >= 0) { - CHECK_EQ(CUFFT_SUCCESS, cufftDestroy(fftPlan)); - } - }; - for (int i = 0; i < kNumTrials; ++i) { - switch (batchDims) { - FFT_BATCH(1); - default: - throw invalid_argument("Unsupported batch dims"); - }; - + auto done = false; + TIMED_FFT(1, 2); + TIMED_FFT(1, 3); + if (!done) { + THCudaCheck(cudaGetLastError()); + THError("Timed FFT: Unsupported batch dims"); + } // Reset time to kNumTrials if (i < kNumSkipTrials && kNumTrials > kNumSkipTrials) { time = 0.0f; @@ -176,7 +159,7 @@ int fftFun(lua_State* L, bool forward) { auto version = (p.cuFFT()) ? "CuFFT" : "FBFFT"; auto direction = (forward) ? "forward" : "inverse"; auto GOut = size / 1e9; - LOG_IF(INFO, dumpTimings) << folly::format( + LOG(INFO) << folly::format( " Running fft-{}d ({}) direction={} ({}x{}x{})," \ " {} batches, GNlogN/s = {:.5f}" \ " time = {:.2f}ms", @@ -193,17 +176,116 @@ int fftFun(lua_State* L, bool forward) { return 0; } -int fftFun(lua_State* L) { - return fftFun(L, true); +#define FBFFT_CASE(BATCH_DIMS, INPUT_DIMS) \ + if (batchDims == BATCH_DIMS && inputDims == INPUT_DIMS) { \ + auto result = fbfft(state, \ + timeTHTensor, \ + frequencyTHTensor, \ + bufferTHTensor, \ + (FBFFTParameters)p); \ + if (result != FBFFTParameters::Success) { \ + THCudaCheck(cudaGetLastError()); \ + THError( \ + folly::format("FBFFT error: {}", \ + (int)result).str().c_str()); \ + } \ + done = true; \ + } + +#define CUFFT_CASE(BATCH_DIMS, INPUT_DIMS) \ + if (batchDims == BATCH_DIMS && inputDims == INPUT_DIMS) { \ + auto timeTensor = \ + torchToDeviceTensor(state, timeTHTensor); \ + auto frequencyTensor = \ + torchToDeviceTensor(state, frequencyTHTensor); \ + if (fftPlan < 0) { \ + localPlan = makeCuFFTPlan( \ + timeTensor, frequencyTensor, p); \ + } \ + fft(timeTensor, frequencyTensor, p, &localPlan);\ + done = true; \ + } + +int runFFT(lua_State* L, bool forward) { + THCState* state = getCutorchState(L); + auto batchDims = luaT_getfieldcheckint(L, 1, "batchDims"); + auto cufft = luaT_getfieldcheckboolean(L, 1, "cufft"); + auto padLeft = luaT_getfieldcheckint(L, 1, "padLeft"); + auto padUp = luaT_getfieldcheckint(L, 1, "padUp"); + auto timeTHTensor = + (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); + auto frequencyTHTensor = + (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); + auto bufferTHTensor = + (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); + if (THCudaTensor_nDimension(state, bufferTHTensor) == 0) { + bufferTHTensor = nullptr; + THAssert(THCudaTensor_checkGPU(state, 2, timeTHTensor, frequencyTHTensor)); + } else { + THAssert(THCudaTensor_checkGPU(state, 3, timeTHTensor, frequencyTHTensor, + bufferTHTensor)); + } + auto fftPlan = (cufftHandle)lua_tonumber(L, 5); + + CHECK_EQ(THCudaTensor_nDimension(state, timeTHTensor) + 1, + THCudaTensor_nDimension(state, frequencyTHTensor)); + + int inputDims = THCudaTensor_nDimension(state, 
timeTHTensor); + FFTParameters p; // forward and normalize are default + if (!forward) { + p = p.inverse().normalize(false); + } + if (!cufft) { + p = p.withFbfft(); + } else { + p = p.withCufft(); + } + p.withPadLeft(padLeft); + p.withPadUp(padUp); + + try { + auto done = false; + if (!cufft) { + FBFFT_CASE(1, 2); + FBFFT_CASE(1, 3); + FBFFT_CASE(2, 3); + FBFFT_CASE(2, 4); + if (!done) { THError("Unsupported fbfft batch dims"); } + } else { + cufftHandle localPlan = fftPlan; + SCOPE_EXIT { + if (fftPlan < 0) { + cufftDestroy(localPlan); + } + }; + CUFFT_CASE(1, 2); + CUFFT_CASE(1, 3); + CUFFT_CASE(2, 3); + CUFFT_CASE(2, 4); + if (!done) { THError("Unsupported cufft batch dims"); } + } + } catch(exception &e){ + return luaL_error(L, e.what()); + } + + return 0; +} + +int fft(lua_State* L) { + auto timed = luaT_getfieldcheckboolean(L, 1, "timed"); + if (timed) { return runTimedFFT(L, true); } + return runFFT(L, true); } -int fftiFun(lua_State* L) { - return fftFun(L, false); +int ffti(lua_State* L) { + auto timed = luaT_getfieldcheckboolean(L, 1, "timed"); + if (timed) { return runTimedFFT(L, false); } + return runFFT(L, false); } const luaL_Reg functions[] = { - {"FFTWrapper_fft", fftFun}, - {"FFTWrapper_ffti", fftiFun}, + {"FFTWrapper_fft", fft}, + {"FFTWrapper_ffti", ffti}, {nullptr, nullptr}, }; diff --git a/src/fft/SpatialConvolutionCuFFT.cpp b/src/fft/SpatialConvolutionCuFFT.cpp index 4cba88e..6cd7a20 100644 --- a/src/fft/SpatialConvolutionCuFFT.cpp +++ b/src/fft/SpatialConvolutionCuFFT.cpp @@ -1,15 +1,15 @@ // Copyright 2014 Facebook #include "THCTensor.h" -#include "DeviceTensorUtils.h" -#include "CuFFTConvolution.cuh" -#include "CuFFTConvolution_UpdateOutput.cuh" -#include "CuFFTConvolution_AccGradParameters.cuh" -#include "CuFFTConvolution_UpdateGradInput.cuh" -#include "CuFFTStrategy.h" -#include "CuFFTWrapper.cuh" -#include "Utils.h" -#include "util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" +#include "src/fft/CuFFTConvolution.cuh" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/CuFFTConvolution_AccGradParameters.cuh" +#include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "src/fft/Utils.h" #include #include @@ -18,7 +18,7 @@ #include using namespace std; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; using namespace facebook::deeplearning::torch; namespace facebook { namespace deeplearning { namespace torch { @@ -247,7 +247,7 @@ template class CuFFTBuffers { torchToDeviceTensor(state, realTH), torchToDeviceTensor(state, complexTH), params); - auto h = folly::make_unique(p); + auto h = std::unique_ptr(new cufftPlan(p)); cufftPlanMap_.emplace(key, std::move(h)); return cufftPlanMap_[key].get(); } @@ -459,27 +459,27 @@ void updateOutputTH(THCState* state, auto inputCPtr = MAKE_INPUT_BUFFER(p.buffers.input); auto inputC = inputCPtr.get(); - DCHECK_EQ(p.buffers.input->storage, inputC->storage); + CHECK_EQ(p.buffers.input->storage, inputC->storage); auto outputCPtr = MAKE_OUTPUT_BUFFER(p.buffers.output); auto outputC = outputCPtr.get(); - DCHECK_EQ(p.buffers.output->storage, outputC->storage); + CHECK_EQ(p.buffers.output->storage, outputC->storage); auto weightCPtr = MAKE_WEIGHT_BUFFER(p.buffers.weight); auto weightC = weightCPtr.get(); - DCHECK_EQ(p.buffers.weight->storage, weightC->storage); + CHECK_EQ(p.buffers.weight->storage, weightC->storage); auto inputCTrPtr = 
MAKE_INPUT_BUFFER(p.buffers.inputTranspose); auto inputCTr = inputCTrPtr.get(); - DCHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); + CHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); auto outputCTrPtr = MAKE_OUTPUT_BUFFER(p.buffers.outputTranspose); auto outputCTr = outputCTrPtr.get(); - DCHECK_EQ(p.buffers.outputTranspose->storage, outputCTr->storage); + CHECK_EQ(p.buffers.outputTranspose->storage, outputCTr->storage); auto weightCTrPtr = MAKE_WEIGHT_BUFFER(p.buffers.weightTranspose); auto weightCTr = weightCTrPtr.get(); - DCHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); + CHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); // Plans auto planInput = (s.fbfft()) ? @@ -548,7 +548,7 @@ void updateOutputTH(THCState* state, } // Actual run - CuFFTConvolution conv(ConvolutionPass(ConvolutionPass::kUpdateOutput)); + CuFFTConvolution conv( (ConvolutionPass(ConvolutionPass::kUpdateOutput)) ); conv.withInputAndBuffers( state, inputR, inputC, inputCTr, inputCBuffer, planInput) .withFiltersAndBuffers( @@ -645,27 +645,27 @@ void updateGradInputTH(THCState* state, auto gradInputCPtr = MAKE_INPUT_BUFFER(p.buffers.input); auto gradInputC = gradInputCPtr.get(); - DCHECK_EQ(p.buffers.input->storage, gradInputC->storage); + CHECK_EQ(p.buffers.input->storage, gradInputC->storage); auto gradOutputCPtr = MAKE_OUTPUT_BUFFER(p.buffers.output); auto gradOutputC = gradOutputCPtr.get(); - DCHECK_EQ(p.buffers.output->storage, gradOutputC->storage); + CHECK_EQ(p.buffers.output->storage, gradOutputC->storage); auto weightCPtr = MAKE_WEIGHT_BUFFER(p.buffers.weight); auto weightC = weightCPtr.get(); - DCHECK_EQ(p.buffers.weight->storage, weightC->storage); + CHECK_EQ(p.buffers.weight->storage, weightC->storage); auto gradInputCTrPtr = MAKE_INPUT_BUFFER(p.buffers.inputTranspose); auto gradInputCTr = gradInputCTrPtr.get(); - DCHECK_EQ(p.buffers.inputTranspose->storage, gradInputCTr->storage); + CHECK_EQ(p.buffers.inputTranspose->storage, gradInputCTr->storage); auto gradOutputCTrPtr = MAKE_OUTPUT_BUFFER(p.buffers.outputTranspose); auto gradOutputCTr = gradOutputCTrPtr.get(); - DCHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); + CHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); auto weightCTrPtr = MAKE_WEIGHT_BUFFER(p.buffers.weightTranspose); auto weightCTr = weightCTrPtr.get(); - DCHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); + CHECK_EQ(p.buffers.weightTranspose->storage, weightCTr->storage); auto gradInputCBuffer = (s.fbfft()) ? 
buffers.buffer(state, @@ -711,7 +711,7 @@ void updateGradInputTH(THCState* state, auto handles = buffers.handles(); // Actual run - CuFFTConvolution conv(ConvolutionPass(ConvolutionPass::kUpdateGradInput)); + CuFFTConvolution conv( (ConvolutionPass(ConvolutionPass::kUpdateGradInput)) ); conv.withInputAndBuffers( state, giTmp, gradInputC, gradInputCTr, gradInputCBuffer, planInput) @@ -807,27 +807,27 @@ void accGradParametersTH(THCState* state, auto inputCPtr = MAKE_INPUT_BUFFER(p.buffers.input); auto inputC = inputCPtr.get(); - DCHECK_EQ(p.buffers.input->storage, inputC->storage); + CHECK_EQ(p.buffers.input->storage, inputC->storage); auto gradOutputCPtr = MAKE_OUTPUT_BUFFER(p.buffers.output); auto gradOutputC = gradOutputCPtr.get(); - DCHECK_EQ(p.buffers.output->storage, gradOutputC->storage); + CHECK_EQ(p.buffers.output->storage, gradOutputC->storage); auto gradWeightCPtr = MAKE_WEIGHT_BUFFER(p.buffers.weight); auto gradWeightC = gradWeightCPtr.get(); - DCHECK_EQ(p.buffers.weight->storage, gradWeightC->storage); + CHECK_EQ(p.buffers.weight->storage, gradWeightC->storage); auto inputCTrPtr = MAKE_INPUT_BUFFER(p.buffers.inputTranspose); auto inputCTr = inputCTrPtr.get(); - DCHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); + CHECK_EQ(p.buffers.inputTranspose->storage, inputCTr->storage); auto gradOutputCTrPtr = MAKE_OUTPUT_BUFFER(p.buffers.outputTranspose); auto gradOutputCTr = gradOutputCTrPtr.get(); - DCHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); + CHECK_EQ(p.buffers.outputTranspose->storage, gradOutputCTr->storage); auto gradWeightCTrPtr = MAKE_WEIGHT_BUFFER(p.buffers.weightTranspose); auto gradWeightCTr = gradWeightCTrPtr.get(); - DCHECK_EQ(p.buffers.weightTranspose->storage, gradWeightCTr->storage); + CHECK_EQ(p.buffers.weightTranspose->storage, gradWeightCTr->storage); auto inputCBuffer = (s.fbfft()) ? 
buffers.buffer(state, @@ -873,7 +873,7 @@ void accGradParametersTH(THCState* state, buffers.plan(state, gradOutputR, gradOutputC, FFTParameters().forward(), 1); auto handles = buffers.handles(); - CuFFTConvolution conv(ConvolutionPass(ConvolutionPass::kAccGradParameters)); + CuFFTConvolution conv((ConvolutionPass(ConvolutionPass::kAccGradParameters))); conv.withInputAndBuffers( state, inputR, inputC, inputCTr, inputCBuffer, planInput) .withFiltersAndBuffers( diff --git a/src/fft/SpatialConvolutionCuFFT.h b/src/fft/SpatialConvolutionCuFFT.h index a0a6095..2f9c49c 100644 --- a/src/fft/SpatialConvolutionCuFFT.h +++ b/src/fft/SpatialConvolutionCuFFT.h @@ -2,7 +2,7 @@ #pragma once -#include "CuFFTStrategy.h" +#include "src/fft/CuFFTStrategy.h" namespace facebook { namespace deeplearning { namespace torch { namespace detail { diff --git a/src/fft/SpatialConvolutionCuFFTHost.cpp b/src/fft/SpatialConvolutionCuFFTHost.cpp index a0d684e..bd220f6 100644 --- a/src/fft/SpatialConvolutionCuFFTHost.cpp +++ b/src/fft/SpatialConvolutionCuFFTHost.cpp @@ -1,11 +1,10 @@ // Copyright 2014 Facebook -#include "Utils.h" -#include "../Utils.h" -#include "CudaTensorUtils.h" -#include "CuFFTStrategy.h" -#include "SpatialConvolutionCuFFT.h" -#include "SpatialConvolutionCuFFTTuner.h" +#include "src/Utils.h" +#include "src/CudaTensorUtils.h" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/SpatialConvolutionCuFFT.h" +#include "src/fft/SpatialConvolutionCuFFTTuner.h" #include #include @@ -66,7 +65,7 @@ int updateOutputLua(lua_State* L) { bufs.weight, bufs.weightTranspose)); THParams thp(state, input, weight, output, bias, 0.0f, bufs); - ConvolutionPass pass(ConvolutionPass(ConvolutionPass::kUpdateOutput)); + ConvolutionPass pass( (ConvolutionPass(ConvolutionPass::kUpdateOutput)) ); ProblemSizes pbs(thp, pass); auto strategy = SpatialConvolutionCuFFTTuner::getBestPerformance(state, pbs); @@ -119,7 +118,7 @@ int updateGradInputLua(lua_State* L) { bufs.weight, bufs.weightTranspose)); THParams thp(state, gradInput, weight, gradOutput, nullptr, 0.0f, bufs); - ConvolutionPass pass(ConvolutionPass(ConvolutionPass::kUpdateGradInput)); + ConvolutionPass pass( (ConvolutionPass(ConvolutionPass::kUpdateGradInput)) ); ProblemSizes pbs(thp, pass); auto strategy = SpatialConvolutionCuFFTTuner::getBestPerformance(state, pbs); @@ -177,7 +176,7 @@ int accGradParametersLua(lua_State* L) { bufs.weight, bufs.weightTranspose)); THParams thp(state, input, gradWeight, gradOutput, gradBias, scale, bufs); - ConvolutionPass pass(ConvolutionPass(ConvolutionPass::kAccGradParameters)); + ConvolutionPass pass(ConvolutionPass::kAccGradParameters); ProblemSizes pbs(thp, pass); auto strategy = SpatialConvolutionCuFFTTuner::getBestPerformance(state, pbs); diff --git a/src/fft/SpatialConvolutionCuFFTTuner.cpp b/src/fft/SpatialConvolutionCuFFTTuner.cpp index 9a42433..fe42073 100644 --- a/src/fft/SpatialConvolutionCuFFTTuner.cpp +++ b/src/fft/SpatialConvolutionCuFFTTuner.cpp @@ -1,11 +1,11 @@ // Copyright 2014 Facebook -#include "SpatialConvolutionCuFFTTuner.h" +#include "src/fft/SpatialConvolutionCuFFTTuner.h" #include "cuda/KernelTimer.h" #include "THC.h" -#include "CuFFTStrategy.h" -#include "SpatialConvolutionCuFFT.h" +#include "src/fft/CuFFTStrategy.h" +#include "src/fft/SpatialConvolutionCuFFT.h" #include #include diff --git a/src/fft/SpatialConvolutionCuFFTTuner.h b/src/fft/SpatialConvolutionCuFFTTuner.h index cb6b494..953fff0 100644 --- a/src/fft/SpatialConvolutionCuFFTTuner.h +++ b/src/fft/SpatialConvolutionCuFFTTuner.h @@ -2,7 +2,7 @@ 
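
The extra parentheses added around the ConvolutionPass temporaries above (and the simplified ConvolutionPass pass(ConvolutionPass::kAccGradParameters) call) appear to guard against C++'s declaration-versus-expression ambiguity, commonly known as the most vexing parse. A minimal illustration of the classic case, with made-up types:

    #include <iostream>

    struct Pass { explicit Pass(int k) : kind(k) {} int kind; };
    struct Conv { explicit Conv(Pass p) : pass(p) {} Pass pass; };

    int main() {
      // Conv c(Pass());        // parses as a *function declaration*, not an object
      Conv c1((Pass(1)));       // extra parentheses force an expression, as in the patch
      Conv c2{Pass(2)};         // C++11 brace initialization also avoids the ambiguity
      std::cout << c1.pass.kind << " " << c2.pass.kind << "\n";
      return 0;
    }
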
#pragma once -#include "CuFFTStrategy.h" +#include "src/fft/CuFFTStrategy.h" #include struct THCState; diff --git a/src/fft/Utils-inl.h b/src/fft/Utils-inl.h index 8d3ef7d..d2c20c8 100644 --- a/src/fft/Utils-inl.h +++ b/src/fft/Utils-inl.h @@ -2,7 +2,7 @@ #pragma once -#include "Utils.cuh" +#include "src/fft/Utils.cuh" #include "THC.h" namespace facebook { namespace deeplearning { namespace torch { @@ -21,8 +21,8 @@ makeCuFFTTensorReal( THCState* state, THCudaTensor* in, const std::vector& commonDims, - THCudaTensor* candidateCudaStorageReal = nullptr, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + THCudaTensor* candidateCudaStorageReal, + FFTOutputSpecification inPlace) { DCHECK_EQ(FFTDim, commonDims.size()); DCHECK_EQ(4, THCudaTensor_nDimension(state, in)); DCHECK_LE(1, FFTDim); @@ -139,8 +139,8 @@ makeCuFFTTensorComplex( THCState* state, THCudaTensor* real, const std::vector& commonDims, - THCudaTensor* candidateCudaStorageComplex = nullptr, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + THCudaTensor* candidateCudaStorageComplex, + FFTOutputSpecification inPlace) { DCHECK_EQ(4, THCudaTensor_nDimension(state, real)); DCHECK_LE(1, FFTDim); DCHECK_GE(3, FFTDim); @@ -199,7 +199,7 @@ std::unique_ptr makeCuFFTTensorComplex( THCState* state, const std::vector& allDims, - THCudaTensor* candidateCudaStorageComplex = nullptr) { + THCudaTensor* candidateCudaStorageComplex) { DCHECK_EQ(4, allDims.size()); DCHECK_LE(1, FFTDim); DCHECK_GE(3, FFTDim); @@ -240,7 +240,7 @@ makeCuFFTTensors( THCState* state, THCudaTensor* in, const std::vector& commonDims, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + FFTOutputSpecification inPlace) { auto p1 = makeCuFFTTensorReal( state, in, commonDims, nullptr, inPlace); @@ -257,7 +257,7 @@ makeCuFFTTensors( THCState* state, thpp::Tensor& in, const std::vector& commonDims, - FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace) { + FFTOutputSpecification inPlace) { auto th = copyToCuda(state, in); auto res = makeCuFFTTensors(state, th.get(), commonDims, inPlace); return make_pair(std::move(res.first), std::move(res.second)); diff --git a/src/fft/Utils.h b/src/fft/Utils.h index 8ea2c58..9e18ab0 100644 --- a/src/fft/Utils.h +++ b/src/fft/Utils.h @@ -4,7 +4,7 @@ #include "thpp/Tensor.h" #include "THCTensor.h" -#include "CudaTensorUtils.h" +#include "src/CudaTensorUtils.h" #include #include diff --git a/src/util/AsyncCopier.cpp b/src/util/AsyncCopier.cpp index 9a25a31..28f9f89 100644 --- a/src/util/AsyncCopier.cpp +++ b/src/util/AsyncCopier.cpp @@ -3,13 +3,14 @@ * @author Tudor Bosman (tudorb@fb.com) */ -#include "util/AsyncCopier.h" -#include "util/Misc.h" +#include "src/util/AsyncCopier.h" +#include "src/util/Misc.h" + #include #include #include -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { uint8_t* allocPageLocked(size_t size) { void* ptr; diff --git a/src/util/AsyncCopier.h b/src/util/AsyncCopier.h index 513864a..2bc4abf 100644 --- a/src/util/AsyncCopier.h +++ b/src/util/AsyncCopier.h @@ -15,7 +15,7 @@ #include #include -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { class AsyncCopier { public: diff --git a/src/util/GlobalAsyncCopier.cpp b/src/util/GlobalAsyncCopier.cpp index d25c80c..d1c56c3 100644 --- a/src/util/GlobalAsyncCopier.cpp +++ b/src/util/GlobalAsyncCopier.cpp @@ -3,15 +3,15 @@ * @author Tudor Bosman (tudorb@fb.com) */ -#include "util/GlobalAsyncCopier.h" +#include 
"src/util/GlobalAsyncCopier.h" #include #include #include -#include "util/AsyncCopier.h" +#include "src/util/AsyncCopier.h" -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; constexpr size_t kDefaultBufferSizeMB = 16; const char* const kBufferSizeEnvVar = "FB_CUDA_ASYNC_COPIER_BUFFER_SIZE_MB"; diff --git a/src/util/Misc.cpp b/src/util/Misc.cpp index 9503eeb..f1c35ec 100644 --- a/src/util/Misc.cpp +++ b/src/util/Misc.cpp @@ -1,6 +1,7 @@ // Copyright 2004-, Facebook, Inc. All Rights Reserved. -#include "util/Misc.h" +#include "src/util/Misc.h" + #include #include #include @@ -8,21 +9,7 @@ using namespace std; -namespace facebook { namespace CUDAUtil { - -int getDevice() { - int dev; - checkCudaError(cudaGetDevice(&dev)); - return dev; -} - -// Streams. We have an implicit model that async memory copies -// with send semantics happen on a dedicated, conventional stream -// per-device. The stream runs on the destination. -namespace { -mutex mtx; -unordered_map deviceToCopyStream; -} +namespace facebook { namespace cuda { cudaStream_t getComputeStream() { // It would be nice to compute on non-default streams from time to time, @@ -30,20 +17,7 @@ cudaStream_t getComputeStream() { return 0; } -cudaStream_t getCopyStream() { - unique_lock own(mutex); - auto dev = getDevice(); - auto row = deviceToCopyStream.find(dev); - if (row == deviceToCopyStream.end()) { - cudaStream_t& stream = deviceToCopyStream[dev]; - checkCudaError(cudaStreamCreate(&stream)); - return stream; - } - return row->second; -} - -__attribute__((__noreturn__)) -void throwCudaError(cudaError_t error, const char* msg) { +[[noreturn]] void throwCudaError(cudaError_t error, const char* msg) { auto string = msg ? folly::sformat("{}: CUDA error {} ({})", msg, int(error), cudaGetErrorString(error)) : @@ -52,42 +26,4 @@ void throwCudaError(cudaError_t error, const char* msg) { throw std::runtime_error(string); } -namespace { - -struct DeviceProperties { - DeviceProperties(); - int deviceCount = 0; - std::unique_ptr deviceProperties; -}; - -DeviceProperties::DeviceProperties() { - auto err = cudaGetDeviceCount(&deviceCount); - if (err == cudaErrorNoDevice) { - deviceCount = 0; - } else { - checkCudaError(err, "cudaGetDeviceCount"); - } - - deviceProperties = folly::make_unique(deviceCount); - for (int i = 0; i < deviceCount; ++i) { - checkCudaError(cudaGetDeviceProperties(&deviceProperties[i], i), - "cudaGetDeviceProperties"); - } -} - -} // namespace - -const cudaDeviceProp& getCurrentDeviceProperties() { - int device = 0; - checkCudaError(cudaGetDevice(&device), "cudaGetDevice"); - - return getDeviceProperties(device); -} - -const cudaDeviceProp& getDeviceProperties(int device) { - static DeviceProperties dprop; - DCHECK(device >= 0 && device < dprop.deviceCount); - return dprop.deviceProperties[device]; -} - } } diff --git a/src/util/Misc.h b/src/util/Misc.h index b6b9940..a662a21 100644 --- a/src/util/Misc.h +++ b/src/util/Misc.h @@ -2,14 +2,13 @@ #pragma once -#include +#include "cuda/util/CachedDeviceProperties.h" -namespace facebook { namespace CUDAUtil { +#include -int getDevice(); +namespace facebook { namespace cuda { -extern __attribute__((__noreturn__)) -void throwCudaError(cudaError_t, const char* msg); +[[noreturn]] void throwCudaError(cudaError_t, const char* msg); inline void checkCudaError(cudaError_t error, const char* msg = 0) { @@ -18,7 +17,6 @@ checkCudaError(cudaError_t error, const char* msg = 0) { } } - class OnDevice { int m_home; public: @@ -31,10 +29,6 @@ class OnDevice { } }; -const 
cudaDeviceProp& getCurrentDeviceProperties(); -const cudaDeviceProp& getDeviceProperties(int device); - cudaStream_t getComputeStream(); -cudaStream_t getCopyStream(); } } diff --git a/src/util/Transform.cu b/src/util/Transform.cu index 416ae2a..5db91b8 100644 --- a/src/util/Transform.cu +++ b/src/util/Transform.cu @@ -3,9 +3,9 @@ #include #include -#include "util/Transform.cuh" +#include "src/util/Transform.cuh" -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { template __global__ static void diff --git a/src/util/Transform.cuh b/src/util/Transform.cuh index a412e81..baeeaab 100644 --- a/src/util/Transform.cuh +++ b/src/util/Transform.cuh @@ -6,7 +6,7 @@ #include #include -namespace facebook { namespace CUDAUtil { +namespace facebook { namespace cuda { /* * A generic interface for dense point-to-point operations. diff --git a/test/BiasTest.cpp b/test/BiasTest.cpp index df29948..89d8327 100644 --- a/test/BiasTest.cpp +++ b/test/BiasTest.cpp @@ -3,8 +3,8 @@ #include "TestUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" -#include "torch/fb/fbcunn/src/ConvolutionBias.cuh" +#include "src/DeviceTensorUtils.h" +#include "src/ConvolutionBias.cuh" #include #include @@ -14,6 +14,20 @@ using namespace std; using namespace facebook::deeplearning::torch; using namespace thpp; +unique_ptr g_state; + +// Override gtest_main to initialize a THCState +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; +} + namespace facebook { namespace deeplearning { namespace torch { namespace bias { constexpr int kRuns = 10; @@ -177,17 +191,17 @@ void testOneAccGradParameters(long batchSize, auto expectedResult = referenceBiasAccGradParameters(output, gradBias, biasScale); - auto outputCuda = copyToCuda(nullptr, output); - auto gradBiasCuda = copyToCuda(nullptr, gradBias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto gradBiasCuda = copyToCuda(g_state.get(), gradBias); for (int i = 0; i < nRep; i++) { - accGradParametersBias(nullptr, + accGradParametersBias(g_state.get(), outputCuda.get(), gradBiasCuda.get(), biasScale); } - auto result = copyFromCuda(nullptr, gradBiasCuda.get()); + auto result = copyFromCuda(g_state.get(), gradBiasCuda.get()); // Due to order of reductions, our implementation is a little off auto comparison = test::compareTensors(expectedResult, result, 5e-4f); @@ -213,17 +227,17 @@ void testOneAccGradParametersTemporal(long batchSize, auto expectedResult = referenceBiasAccGradParametersTemporal(output, gradBias, biasScale); - auto outputCuda = copyToCuda(nullptr, output); - auto gradBiasCuda = copyToCuda(nullptr, gradBias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto gradBiasCuda = copyToCuda(g_state.get(), gradBias); for (int i = 0; i < nRep; i++) { - accGradParametersTemporalBias(nullptr, + accGradParametersTemporalBias(g_state.get(), outputCuda.get(), gradBiasCuda.get(), biasScale); } - auto result = copyFromCuda(nullptr, gradBiasCuda.get()); + auto result = copyFromCuda(g_state.get(), gradBiasCuda.get()); auto comparison = test::compareTensors(expectedResult, result, 5e-4f); EXPECT_TRUE(comparison.first) << comparison.second; @@ -255,12 +269,12 @@ TEST(BiasTest, testUpdateOutput) { auto bias = makeBiasTensor(numPlanes); auto expectedResult = referenceBiasUpdateOutput(output, bias); - 
auto outputCuda = copyToCuda(nullptr, output); - auto biasCuda = copyToCuda(nullptr, bias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto biasCuda = copyToCuda(g_state.get(), bias); - updateOutputBias(nullptr, outputCuda.get(), biasCuda.get()); + updateOutputBias(g_state.get(), outputCuda.get(), biasCuda.get()); - auto result = copyFromCuda(nullptr, outputCuda.get()); + auto result = copyFromCuda(g_state.get(), outputCuda.get()); auto comparison = test::compareTensors(expectedResult, result); EXPECT_TRUE(comparison.first) << comparison.second; @@ -288,12 +302,12 @@ TEST(BiasTest, testUpdateOutputTemporal) { Tensor transposedOutput; auto expectedResult = referenceBiasUpdateOutputTemporal(output, bias); - auto outputCuda = copyToCuda(nullptr, output); - auto biasCuda = copyToCuda(nullptr, bias); + auto outputCuda = copyToCuda(g_state.get(), output); + auto biasCuda = copyToCuda(g_state.get(), bias); - updateOutputTemporalBias(nullptr, outputCuda.get(), biasCuda.get()); + updateOutputTemporalBias(g_state.get(), outputCuda.get(), biasCuda.get()); - auto result = copyFromCuda(nullptr, outputCuda.get()); + auto result = copyFromCuda(g_state.get(), outputCuda.get()); auto comparison = test::compareTensors(expectedResult, result); EXPECT_TRUE(comparison.first) << comparison.second; diff --git a/test/ConvolutionTest.cpp b/test/ConvolutionTest.cpp index eefa449..6fef76f 100644 --- a/test/ConvolutionTest.cpp +++ b/test/ConvolutionTest.cpp @@ -1,14 +1,14 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/fft/Utils.h" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_AccGradParameters.cuh" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_UpdateGradInput.cuh" -#include "torch/fb/fbcunn/test/InputCentricConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/test/ReferenceConvolutions.h" -#include "torch/fb/fbcunn/test/TestUtils.h" +#include "src/fft/Utils.h" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/CuFFTConvolution_AccGradParameters.cuh" +#include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" +#include "test/InputCentricConvolution_UpdateOutput.cuh" +#include "test/ReferenceConvolutions.h" +#include "test/TestUtils.h" #include #include @@ -22,11 +22,18 @@ using namespace facebook::deeplearning::torch; DEFINE_bool(verify, true, "Run the convolution and verify the output"); DEFINE_bool(debug, false, "Print basic information on tensors"); +unique_ptr g_state; + // Override gtest_main so as to parse the --verify flag int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); google::ParseCommandLineFlags(&argc, &argv, true); - return RUN_ALL_TESTS(); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; } namespace facebook { namespace deeplearning { namespace torch { namespace test { @@ -466,17 +473,12 @@ class TorchTest : public ConvolutionModule { // Torch APIs don't take input/filters as const even though they // effectively are - auto inputTH = input.moveAsTH(); - auto filtersTH = filters.moveAsTH(); - auto outputTH = output.moveAsTH(); + auto inputTH = input.asTH(); + auto filtersTH = filters.asTH(); + auto outputTH = output.asTH(); THFloatTensor_conv2Dmm(outputTH, 1.0, 1.0, inputTH, filtersTH, filterRowStride, filterColStride, "V", 
"X"); - - // Rebind for evaluation and cleanup - output = std::move(outputTH); - input = std::move(inputTH); - filters = std::move(filtersTH); } void updateGradInput( @@ -488,20 +490,15 @@ class TorchTest : public ConvolutionModule { Tensor& input) override { ASSERT_FALSE(inputPadding); // padding not supported - auto inputTH = input.moveAsTH(); - auto outputTH = output.moveAsTH(); + auto inputTH = input.asTH(); + auto outputTH = output.asTH(); // Torch requires transposition of filters filters.transpose(0, 1); - auto filtersTH = filters.moveAsTH(); + auto filtersTH = filters.asTH(); THFloatTensor_conv2Dmm(inputTH, 0.0, 1.0, outputTH, filtersTH, filterRowStride, filterColStride, "F", "C"); - - // Rebind for evaluation and cleanup - input = std::move(inputTH); - output = std::move(outputTH); - filters = std::move(filtersTH); } void accGradParameters( @@ -514,18 +511,13 @@ class TorchTest : public ConvolutionModule { Tensor& filters) override { ASSERT_FALSE(inputPadding); // padding not supported - auto inputTH = input.moveAsTH(); - auto outputTH = output.moveAsTH(); - auto filtersTH = filters.moveAsTH(); + auto inputTH = input.asTH(); + auto outputTH = output.asTH(); + auto filtersTH = filters.asTH(); THFloatTensor_conv2DRevgerm(filtersTH, 1.0, scale, inputTH, outputTH, filterRowStride, filterColStride); - - // Rebind for evaluation and cleanup - input = std::move(inputTH); - output = std::move(outputTH); - filters = std::move(filtersTH); } }; @@ -588,8 +580,8 @@ class InputCentricTest : public ConvolutionModule { Tensor& output) override { ASSERT_FALSE(inputPadding); // padding not supported - auto inputCuda = copyToCuda(nullptr, input); - auto filtersCuda = copyToCuda(nullptr, filters); + auto inputCuda = copyToCuda(g_state.get(), input); + auto filtersCuda = copyToCuda(g_state.get(), filters); CHECK(layout == Layout::Relayout) << "Only Relayout mode is supported for this kernel atm"; @@ -609,13 +601,13 @@ class InputCentricTest : public ConvolutionModule { } } } - auto filtersCudaTmp = copyToCuda(nullptr, filtersTmp); + auto filtersCudaTmp = copyToCuda(g_state.get(), filtersTmp); // Relayout output, for instance for 32 x 96 x 71 x 71 we get const int filterRowSize = filters.size(2); const int ceilFilterSizeFilterStride = (filterRowSize + filterRowStride - 1) / filterRowStride; - auto outputCudaTmp = makeTHCudaTensorFull(nullptr, { + auto outputCudaTmp = makeTHCudaTensorFull(g_state.get(), { output.size(0), // 32 // 71 + 2 * ceilFilterSizeFilterStride // This expansion by 2 * ceilFilterSizeFilterStride allows us to @@ -627,7 +619,7 @@ class InputCentricTest : public ConvolutionModule { ); bool result = - InputCentricRelayoutConvolution_UpdateOutput(nullptr, + InputCentricRelayoutConvolution_UpdateOutput(g_state.get(), inputCuda.get(), filtersCudaTmp.get(), filterRowStride, @@ -637,7 +629,7 @@ class InputCentricTest : public ConvolutionModule { EXPECT_TRUE(result); // Recover actual output from layout - auto outputTmp = copyFromCuda(nullptr, outputCudaTmp.get()); + auto outputTmp = copyFromCuda(g_state.get(), outputCudaTmp.get()); for (long i = 0; i < output.size(0); ++i) { for (long j = 0; j < output.size(1); ++j) { for (long k = 0; k < output.size(2); ++k) { @@ -707,33 +699,34 @@ class CuFFT : public ConvolutionModule { std::max(filters.size(3), output.size(3))); std::vector maxSizes({maxRows, maxCols}); - auto realComplexPair = makeCuFFTTensors(nullptr, input, maxSizes); + auto realComplexPair = + makeCuFFTTensors(g_state.get(), input, maxSizes); auto inputTHCudaTensor = 
std::move(realComplexPair.first); auto inputComplexTHCudaTensor = std::move(realComplexPair.second); auto inputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, inputTHCudaTensor.get(), maxSizes); + g_state.get(), inputTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, filters, maxSizes); + makeCuFFTTensors(g_state.get(), filters, maxSizes); auto filtersTHCudaTensor = std::move(realComplexPair.first); auto filtersComplexTHCudaTensor = std::move(realComplexPair.second); auto filtersComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, filtersTHCudaTensor.get(), maxSizes); + g_state.get(), filtersTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, output, maxSizes); + makeCuFFTTensors(g_state.get(), output, maxSizes); auto outputTHCudaTensor = std::move(realComplexPair.first); auto outputComplexTHCudaTensor = std::move(realComplexPair.second); auto outputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, outputTHCudaTensor.get(), maxSizes); + g_state.get(), outputTHCudaTensor.get(), maxSizes); // We don't test the bias here - auto bias = Tensor{{output.size(0)}}; + auto bias = Tensor{output.size(0)}; bias.fill(0); - auto biasCuda = copyToCuda(nullptr, bias); + auto biasCuda = copyToCuda(g_state.get(), bias); if (impl_ == Implementation::Reference) { - CuFFTConvolution_ReferenceUpdateOutput(nullptr, + CuFFTConvolution_ReferenceUpdateOutput(g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -742,7 +735,7 @@ class CuFFT : public ConvolutionModule { filtersComplexTHCudaTensor.get(), outputComplexTHCudaTensor.get()); } else { - CuFFTConvolution_UpdateOutput(nullptr, + CuFFTConvolution_UpdateOutput(g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -757,13 +750,14 @@ class CuFFT : public ConvolutionModule { if (FLAGS_verify) { checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get())); + copyFromCuda(g_state.get(), inputTHCudaTensor.get())); checkExpectedInput(filters, - copyFromCuda(nullptr, filtersTHCudaTensor.get())); + copyFromCuda(g_state.get(), + filtersTHCudaTensor.get())); // Recover actual output from padded layout, output is smaller // than outputTmp when kernelSize > 1 - auto outputTmp = copyFromCuda(nullptr, outputTHCudaTensor.get()); + auto outputTmp = copyFromCuda(g_state.get(), outputTHCudaTensor.get()); for (long i = 0; i < output.size(0); ++i) { for (long j = 0; j < output.size(1); ++j) { for (long k = 0; k < output.size(2); ++k) { @@ -802,34 +796,34 @@ class CuFFT : public ConvolutionModule { std::vector maxSizes({maxRows, maxCols}); auto realComplexPair = - makeCuFFTTensors(nullptr, input, maxSizes); + makeCuFFTTensors(g_state.get(), input, maxSizes); auto inputTHCudaTensor = std::move(realComplexPair.first); auto inputComplexTHCudaTensor = std::move(realComplexPair.second); auto inputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, inputTHCudaTensor.get(), maxSizes); + g_state.get(), inputTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, filters, maxSizes); + makeCuFFTTensors(g_state.get(), filters, maxSizes); auto filtersTHCudaTensor = std::move(realComplexPair.first); auto filtersComplexTHCudaTensor = std::move(realComplexPair.second); auto filtersComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, filtersTHCudaTensor.get(), maxSizes); + g_state.get(), filtersTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, output, maxSizes); + 
makeCuFFTTensors(g_state.get(), output, maxSizes); auto outputTHCudaTensor = std::move(realComplexPair.first); auto outputComplexTHCudaTensor = std::move(realComplexPair.second); auto outputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, outputTHCudaTensor.get(), maxSizes); + g_state.get(), outputTHCudaTensor.get(), maxSizes); // We don't test the bias here - auto bias = Tensor{{filters.size(0)}}; + auto bias = Tensor{filters.size(0)}; bias.fill(0); - auto biasCuda = copyToCuda(nullptr, bias); + auto biasCuda = copyToCuda(g_state.get(), bias); if (impl_ == Implementation::Reference) { CuFFTConvolution_ReferenceAccGradParameters( - nullptr, + g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -839,7 +833,7 @@ class CuFFT : public ConvolutionModule { filtersComplexTHCudaTensor.get(), outputComplexTHCudaTensor.get()); } else { - CuFFTConvolution_AccGradParameters(nullptr, + CuFFTConvolution_AccGradParameters(g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -855,12 +849,12 @@ class CuFFT : public ConvolutionModule { if (FLAGS_verify) { checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get())); + copyFromCuda(g_state.get(), inputTHCudaTensor.get())); checkExpectedInput(output, - copyFromCuda(nullptr, outputTHCudaTensor.get())); + copyFromCuda(g_state.get(), outputTHCudaTensor.get())); // Recover actual filters from padded layout, filters is smaller // than filtersTmp when kernelSize > 1 - auto filtersTmp = copyFromCuda(nullptr, filtersTHCudaTensor.get()); + auto filtersTmp = copyFromCuda(g_state.get(), filtersTHCudaTensor.get()); for (long i = 0; i < filters.size(0); ++i) { for (long j = 0; j < filters.size(1); ++j) { for (long k = 0; k < filters.size(2); ++k) { @@ -896,30 +890,30 @@ class CuFFT : public ConvolutionModule { std::vector maxSizes({maxRows, maxCols}); auto realComplexPair = - makeCuFFTTensors(nullptr, input, maxSizes); + makeCuFFTTensors(g_state.get(), input, maxSizes); auto inputTHCudaTensor = std::move(realComplexPair.first); auto inputComplexTHCudaTensor = std::move(realComplexPair.second); auto inputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, inputTHCudaTensor.get(), maxSizes); + g_state.get(), inputTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, filters, maxSizes); + makeCuFFTTensors(g_state.get(), filters, maxSizes); auto filtersTHCudaTensor = std::move(realComplexPair.first); auto filtersComplexTHCudaTensor = std::move(realComplexPair.second); auto filtersComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, filtersTHCudaTensor.get(), maxSizes); + g_state.get(), filtersTHCudaTensor.get(), maxSizes); realComplexPair = - makeCuFFTTensors(nullptr, output, maxSizes); + makeCuFFTTensors(g_state.get(), output, maxSizes); auto outputTHCudaTensor = std::move(realComplexPair.first); auto outputComplexTHCudaTensor = std::move(realComplexPair.second); auto outputComplexTHCudaTensorT = makeCuFFTTensorComplex( - nullptr, outputTHCudaTensor.get(), maxSizes); + g_state.get(), outputTHCudaTensor.get(), maxSizes); if (impl_ == Implementation::Reference) { CuFFTConvolution_ReferenceUpdateGradInput( - nullptr, + g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), outputTHCudaTensor.get(), @@ -928,7 +922,7 @@ class CuFFT : public ConvolutionModule { outputComplexTHCudaTensor.get()); } else { CuFFTConvolution_UpdateGradInput( - nullptr, + g_state.get(), inputTHCudaTensor.get(), filtersTHCudaTensor.get(), 
outputTHCudaTensor.get(), @@ -942,12 +936,13 @@ class CuFFT : public ConvolutionModule { if (FLAGS_verify) { checkExpectedInput(filters, - copyFromCuda(nullptr, filtersTHCudaTensor.get())); + copyFromCuda(g_state.get(), + filtersTHCudaTensor.get())); checkExpectedInput(output, - copyFromCuda(nullptr, outputTHCudaTensor.get())); + copyFromCuda(g_state.get(), outputTHCudaTensor.get())); // Recover actual filters from padded layout, filters is smaller // than filtersTmp when kernelSize > 1 - auto inputTmp = copyFromCuda(nullptr, inputTHCudaTensor.get()); + auto inputTmp = copyFromCuda(g_state.get(), inputTHCudaTensor.get()); for (long i = 0; i < input.size(0); ++i) { for (long j = 0; j < input.size(1); ++j) { for (long k = 0; k < input.size(2); ++k) { @@ -1376,7 +1371,7 @@ TEST(CudaConvolutionTest, CuFFT_updateGradInput_fixed) { CuFFT::checkExpectedInput( expectedInput, - copyFromCuda(nullptr, cufft.saveInputTHCudaTensor.get())); + copyFromCuda(g_state.get(), cufft.saveInputTHCudaTensor.get())); } } } } } // namespace diff --git a/test/CuBLASTest.cpp b/test/CuBLASTest.cpp index 3f5eefa..40a343b 100644 --- a/test/CuBLASTest.cpp +++ b/test/CuBLASTest.cpp @@ -1,9 +1,9 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/CuBLASWrapper.h" -#include "torch/fb/fbcunn/test/TestUtils.h" +#include "src/CuBLASWrapper.h" +#include "test/TestUtils.h" #include #include @@ -13,6 +13,20 @@ using namespace std; using namespace facebook::deeplearning::torch; using namespace thpp; +unique_ptr g_state; + +// Override gtest_main to initialize a THCState +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; +} + namespace facebook { namespace deeplearning { namespace torch { namespace test { template @@ -37,15 +51,15 @@ std::pair, bool asComplex = false) { CHECK_EQ(Dim, t.ndims()); CHECK_EQ(Dim, tt.ndims()); - auto tCuda = copyToCuda(nullptr, t); - auto ttCuda = copyToCuda(nullptr, tt); + auto tCuda = copyToCuda(g_state.get(), t); + auto ttCuda = copyToCuda(g_state.get(), tt); DeviceTensor tCudaTensor = - torchToDeviceTensor(nullptr, tCuda.get()); + torchToDeviceTensor(g_state.get(), tCuda.get()); DeviceTensor ttCudaTensor = - torchToDeviceTensor(nullptr, ttCuda.get()); + torchToDeviceTensor(g_state.get(), ttCuda.get()); transpose(tCudaTensor, ttCudaTensor, sep, asComplex); - tt = copyFromCuda(nullptr, ttCuda.get()); + tt = copyFromCuda(g_state.get(), ttCuda.get()); tt.resize(LongStorage(resizeTransposed)); return make_pair(std::move(tCuda), std::move(ttCuda)); } @@ -58,11 +72,13 @@ void unTransposeAndCheckOutOfPlace( int sep, initializer_list testSize, bool asComplex = false) { - auto ct = torchToDeviceTensor(nullptr, pCudaTensor.first.get()); - auto ctt = torchToDeviceTensor(nullptr, pCudaTensor.second.get()); + auto ct = + torchToDeviceTensor(g_state.get(), pCudaTensor.first.get()); + auto ctt = + torchToDeviceTensor(g_state.get(), pCudaTensor.second.get()); transpose(ct, ctt, Dim - sep, asComplex); - pTensor.second = copyFromCuda(nullptr, pCudaTensor.first.get()); + pTensor.second = copyFromCuda(g_state.get(), pCudaTensor.first.get()); pTensor.first.resize(LongStorage(testSize)); pTensor.second.resize(LongStorage(testSize)); diff --git 
a/test/CudaTensorTest.cpp b/test/CudaTensorTest.cpp index 3def370..10328cb 100644 --- a/test/CudaTensorTest.cpp +++ b/test/CudaTensorTest.cpp @@ -1,9 +1,9 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/src/CudaTensorUtils.h" +#include "src/CudaTensorUtils.h" #include "THC.h" -#include "torch/fb/fbcunn/test/CudaTensorTestKernels.cuh" -#include "folly/Optional.h" -#include "folly/ScopeGuard.h" +#include "test/CudaTensorTestKernels.cuh" +#include +#include #include #include @@ -11,6 +11,20 @@ using namespace std; +unique_ptr g_state; + +// Override gtest_main to initialize a THCState +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; +} + namespace facebook { namespace deeplearning { namespace torch { namespace { @@ -39,47 +53,51 @@ void verify3d(THCudaTensor* tensor) { } // unnamed namespace TEST(CudaTensor, testDimensionMismatch) { - EXPECT_THROW(testAssignment3d(nullptr, - makeTHCudaTensorFull(nullptr, {1, 2, 3, 4}).get()), + EXPECT_THROW(testAssignment3d( + g_state.get(), + makeTHCudaTensorFull(g_state.get(), {1, 2, 3, 4}).get()), invalid_argument); - EXPECT_THROW(testAssignment3d(nullptr, - makeTHCudaTensorFull(nullptr, {1}).get()), + EXPECT_THROW(testAssignment3d( + g_state.get(), + makeTHCudaTensorFull(g_state.get(), {1}).get()), invalid_argument); } TEST(CudaTensor, testWrite3d) { - auto tensor = makeTHCudaTensorFull(nullptr, {11, 7, 5}); + auto tensor = makeTHCudaTensorFull(g_state.get(), {11, 7, 5}); // Run our kernel - EXPECT_TRUE(testAssignment3d(nullptr, tensor.get())); + EXPECT_TRUE(testAssignment3d(g_state.get(), tensor.get())); verify3d(tensor.get()); } TEST(CudaTensor, testWrite3dNonTrivialStride) { - auto tensor = makeTHCudaTensorFull(nullptr, {11, 7, 5}, {200, 6, 1}); + auto tensor = makeTHCudaTensorFull(g_state.get(), {11, 7, 5}, {200, 6, 1}); // Run our kernel - EXPECT_TRUE(testAssignment3d(nullptr, tensor.get())); + EXPECT_TRUE(testAssignment3d(g_state.get(), tensor.get())); verify3d(tensor.get()); } TEST(CudaTensor, testWrite1d) { constexpr long kSize = 3; - auto storage = THCudaStorage_newWithSize(nullptr, kSize); - auto tensor = THCudaTensor_newWithStorage1d(nullptr, storage, 0, kSize, 1); - SCOPE_EXIT{ THCudaTensor_free(nullptr, tensor); }; + auto storage = + THCudaStorage_newWithSize(g_state.get(), kSize); + auto tensor = + THCudaTensor_newWithStorage1d(g_state.get(), storage, 0, kSize, 1); + SCOPE_EXIT{ THCudaTensor_free(g_state.get(), tensor); }; // Clear out tensor - THCudaTensor_fill(nullptr, tensor, 0.0f); + THCudaTensor_fill(g_state.get(), tensor, 0.0f); // Run our kernel - EXPECT_TRUE(testAssignment1d(nullptr, tensor)); + EXPECT_TRUE(testAssignment1d(g_state.get(), tensor)); // Verify output auto hostStorage = THFloatStorage_newWithSize(tensor->storage->size); SCOPE_EXIT{ THFloatStorage_free(hostStorage); }; - THFloatStorage_copyCuda(nullptr, hostStorage, storage); + THFloatStorage_copyCuda(g_state.get(), hostStorage, storage); for (int i = 0; i < tensor->size[0]; ++i) { EXPECT_EQ(i, hostStorage->data[i]); @@ -88,51 +106,58 @@ TEST(CudaTensor, testWrite1d) { TEST(CudaTensor, testUpcast) { // test with no padding - EXPECT_TRUE(testUpcast(nullptr, - makeTHCudaTensorFull(nullptr, {3, 2, 1}).get())); + EXPECT_TRUE(testUpcast(g_state.get(), + makeTHCudaTensorFull(g_state.get(), {3, 2, 1}).get())); // test with 
padding - EXPECT_TRUE(testUpcast(nullptr, - makeTHCudaTensorFull(nullptr, {4, 3, 2}, {150, 40, 15}).get())); + EXPECT_TRUE( + testUpcast(g_state.get(), + makeTHCudaTensorFull( + g_state.get(), {4, 3, 2}, {150, 40, 15}).get())); } TEST(CudaTensor, testDowncastIllegalPaddingThrows) { // 16 should be 12 for no padding - EXPECT_THROW(testDowncastTo2d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}, {16, 4, 1}).get()), + EXPECT_THROW(testDowncastTo2d( + g_state.get(), + makeTHCudaTensorFull( + g_state.get(), {2, 3, 4}, {16, 4, 1}).get()), invalid_argument); // 15/5 should be 12/3 for no padding - EXPECT_THROW(testDowncastTo1d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}, {15, 5, 1}).get()), + EXPECT_THROW(testDowncastTo1d( + g_state.get(), + makeTHCudaTensorFull( + g_state.get(), {2, 3, 4}, {15, 5, 1}).get()), invalid_argument); // But, the same should not cause a problem for 2d since the padding // is in the non-collapsed dimensions - EXPECT_NO_THROW(testDowncastTo2d(nullptr, + EXPECT_NO_THROW(testDowncastTo2d(g_state.get(), makeTHCudaTensorFull( - nullptr, {2, 3, 4}, {15, 5, 1}).get())); + g_state.get(), {2, 3, 4}, {15, 5, 1}).get())); } TEST(CudaTensor, testDowncast) { - EXPECT_TRUE(testDowncastTo2d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}).get())); + EXPECT_TRUE(testDowncastTo2d( + g_state.get(), + makeTHCudaTensorFull(g_state.get(), {2, 3, 4}).get())); // We can have padding in the innermost dimension - EXPECT_TRUE(testDowncastTo2d(nullptr, - makeTHCudaTensorFull(nullptr, {2, 3, 4}, + EXPECT_TRUE(testDowncastTo2d(g_state.get(), + makeTHCudaTensorFull(g_state.get(), {2, 3, 4}, {36, 12, 3}).get())); } TEST(CudaTensor, testDowncastWrites) { - auto tensor = makeTHCudaTensorFull(nullptr, {2, 3, 4}); - EXPECT_TRUE(testDowncastWrites(nullptr, tensor.get())); + auto tensor = makeTHCudaTensorFull(g_state.get(), {2, 3, 4}); + EXPECT_TRUE(testDowncastWrites(g_state.get(), tensor.get())); // Verify output auto hostStorage = THFloatStorage_newWithSize(tensor->storage->size); SCOPE_EXIT{ THFloatStorage_free(hostStorage); }; - THFloatStorage_copyCuda(nullptr, hostStorage, tensor->storage); + THFloatStorage_copyCuda(g_state.get(), hostStorage, tensor->storage); // In the downcast view, we should have overwritten all the values for (int k = 0; k < tensor->size[0]; ++k) { diff --git a/test/CudaTensorTestKernels.cu b/test/CudaTensorTestKernels.cu index 2b7e251..3ea9803 100644 --- a/test/CudaTensorTestKernels.cu +++ b/test/CudaTensorTestKernels.cu @@ -1,13 +1,12 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include "cuda/DeviceTensor.cuh" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" - -#include "torch/fb/fbcunn/src/util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; namespace facebook { namespace deeplearning { namespace torch { diff --git a/test/FFTTest.cpp b/test/FFTTest.cpp index 8934379..2bfa6be 100644 --- a/test/FFTTest.cpp +++ b/test/FFTTest.cpp @@ -1,11 +1,11 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/fft/CuFFTWrapper.cuh" -#include "torch/fb/fbcunn/test/InputCentricConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/test/ReferenceConvolutions.h" -#include "torch/fb/fbcunn/test/TestUtils.h" +#include "src/fft/CuFFTWrapper.cuh" +#include "test/InputCentricConvolution_UpdateOutput.cuh" +#include "test/ReferenceConvolutions.h" +#include "test/TestUtils.h" #include @@ -18,11 +18,18 @@ using namespace facebook::deeplearning::torch; DEFINE_bool(verify, true, "Run the convolution and verify the output"); +unique_ptr g_state; + // Override gtest_main so as to parse the --verify flag int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); google::ParseCommandLineFlags(&argc, &argv, true); - return RUN_ALL_TESTS(); + g_state.reset(new THCState); + THCudaInit(g_state.get()); + + auto ret = RUN_ALL_TESTS(); + THCudaShutdown(g_state.get()); + return ret; } namespace facebook { namespace deeplearning { namespace torch { namespace test { @@ -81,16 +88,16 @@ class FFTTestBase : public ::testing::Test { } auto realComplexPair = - makeCuFFTTensors(nullptr, input, FFTSize, cfg.inPlace); + makeCuFFTTensors(g_state.get(), input, FFTSize, cfg.inPlace); inputTHCudaTensor = std::move(realComplexPair.first); fftTHCudaTensor = std::move(realComplexPair.second); inputCudaTensor = torchToDeviceTensor( - nullptr, inputTHCudaTensor.get()); + g_state.get(), inputTHCudaTensor.get()); outputCudaTensor = torchToDeviceTensor( - nullptr, fftTHCudaTensor.get()); + g_state.get(), fftTHCudaTensor.get()); if (cfg.inPlace == FFTOutputSpecification::InPlace) { CHECK_EQ(inputCudaTensor.data(), outputCudaTensor.data()); @@ -220,7 +227,7 @@ TEST_F(FFT2DTest, test2x2ConstantInPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2ConstantOutOfPlace) { @@ -253,7 +260,7 @@ TEST_F(FFT2DTest, test2x2ConstantOutOfPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2VariableInPlace) { @@ -290,7 +297,7 @@ TEST_F(FFT2DTest, test2x2VariableInPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2VariableOutOfPlace) { @@ -328,7 +335,7 @@ TEST_F(FFT2DTest, test2x2VariableOutOfPlace) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test1x2ConstantInPlacePadded) { @@ -365,7 +372,7 @@ TEST_F(FFT2DTest, test1x2ConstantInPlacePadded) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test1x2ConstantOutOfPlacePadded) { @@ -402,7 +409,7 @@ TEST_F(FFT2DTest, test1x2ConstantOutOfPlacePadded) { fft2d<2>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT2DTest, test2x2ConstantInPlacePadded) { @@ 
-456,7 +463,7 @@ TEST_F(FFT2DTest, test2x2ConstantInPlacePadded) { // One element does not check at 1e-6f error checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get()), + copyFromCuda(g_state.get(), fftTHCudaTensor.get()), 5e-5f); } @@ -515,7 +522,7 @@ TEST_F(FFT2DTest, test2x2ConstantOutOfPlacePadded) { // One element does not check at 1e-6f error checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get()), + copyFromCuda(g_state.get(), fftTHCudaTensor.get()), 5e-5f); } @@ -552,7 +559,7 @@ TEST_F(FFT2DTest, testInverseOutOfPlace) { // First element does not check at 5e-5f error checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-4f); } @@ -588,7 +595,7 @@ TEST_F(FFT2DTest, testInverseInPlace) { // First element does not check at 1e-6f error checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-5f); } @@ -625,7 +632,7 @@ TEST_F(FFT2DTest, testInverseOutOfPlacePadded) { // First element does not check at 5e-5f error checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-4f); } @@ -677,7 +684,7 @@ TEST_F(FFT1DTest, test1x4VariableOutOfPlacePadded) { fft1d<3>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT1DTest, test1x4VariableInPlacePadded) { @@ -728,7 +735,7 @@ TEST_F(FFT1DTest, test1x4VariableInPlacePadded) { fft1d<3>(inputCudaTensor, outputCudaTensor); checkExpectedOutput(expected, - copyFromCuda(nullptr, fftTHCudaTensor.get())); + copyFromCuda(g_state.get(), fftTHCudaTensor.get())); } TEST_F(FFT1DTest, testInverseInPlace) { @@ -762,7 +769,7 @@ TEST_F(FFT1DTest, testInverseInPlace) { fft1d<3>(inputCudaTensor, outputCudaTensor, FFTParameters().inverse()); checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get())); + copyFromCuda(g_state.get(), inputTHCudaTensor.get())); } TEST_F(FFT1DTest, testInverseOutOfPlacePadded) { @@ -797,7 +804,7 @@ TEST_F(FFT1DTest, testInverseOutOfPlacePadded) { fft1d<3>(inputCudaTensor, outputCudaTensor, FFTParameters().inverse()); checkExpectedInput(input, - copyFromCuda(nullptr, inputTHCudaTensor.get()), + copyFromCuda(g_state.get(), inputTHCudaTensor.get()), 5e-5f); } diff --git a/test/InputCentricConvolution_UpdateOutput.cu b/test/InputCentricConvolution_UpdateOutput.cu index 6e3d173..5ea8439 100644 --- a/test/InputCentricConvolution_UpdateOutput.cu +++ b/test/InputCentricConvolution_UpdateOutput.cu @@ -1,16 +1,16 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
-#include "torch/fb/fbcunn/test/InputCentricConvolution_UpdateOutput.cuh" +#include "test/InputCentricConvolution_UpdateOutput.cuh" #include "cuda/CudaUtils.cuh" #include "cuda/DeviceTensor.cuh" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" -#include "torch/fb/fbcunn/src/util/Misc.h" +#include "cuda/util/CachedDeviceProperties.h" +#include "src/DeviceTensorUtils.h" #include #include using namespace facebook::cuda; -using namespace facebook::CUDAUtil; +using namespace facebook::cuda; namespace facebook { namespace deeplearning { namespace torch { namespace test { diff --git a/test/ReferenceConvolutions.cpp b/test/ReferenceConvolutions.cpp index 034c4e1..3e8bcb7 100644 --- a/test/ReferenceConvolutions.cpp +++ b/test/ReferenceConvolutions.cpp @@ -1,6 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. -#include "torch/fb/fbcunn/test/ReferenceConvolutions.h" +#include "test/ReferenceConvolutions.h" #include @@ -21,10 +21,10 @@ namespace facebook { namespace deeplearning { namespace torch { namespace test { // output * filter operates with a mask when going to the input. // // ------------------------------- -// | implied zeros |\ -// | _________________________ | \ -// | | | | \ convoled with -// | | real input | | \____ +// | implied zeros | +// | _________________________ | +// | | | | convoled with +// | | real input | | ____ // | | | | | | // | | | | * | | equals ==> // | | area | | ---- @@ -55,10 +55,10 @@ namespace facebook { namespace deeplearning { namespace torch { namespace test { // // total output area // ------------------------------- -// | affected by implied zeros |\ -// | _________________________ | \ convolved with -// | | | | \ -// | | output area not | | \____ +// | affected by implied zeros | +// | _________________________ | convolved with +// | | | | +// | | output area not | | ____ // | | affected by | | | | // | | implied zero | | * | | equals ==> // | | area; this data | | ---- diff --git a/test/ReferenceConvolutions.h b/test/ReferenceConvolutions.h index f38073c..eb046ec 100644 --- a/test/ReferenceConvolutions.h +++ b/test/ReferenceConvolutions.h @@ -1,7 +1,7 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
#pragma once -#include "torch/fb/fbcunn/src/Tensor.h" +#include "src/Tensor.h" #include #include diff --git a/test/TestUtils.cpp b/test/TestUtils.cpp index e2b3fc6..2657039 100644 --- a/test/TestUtils.cpp +++ b/test/TestUtils.cpp @@ -131,7 +131,7 @@ bool isWithin(float a, float b, float relativeError) { // Special case for a or b very close to zero, only absolute check can work - if (std::abs(a) < relativeError || std::abs(a) < relativeError || + if (std::abs(a) < relativeError || std::abs(b) < relativeError || !std::isnormal(a)|| !std::isnormal(b)) { if (std::abs(a - b) > adjRelativeError) { return false; diff --git a/test/TestUtils.h b/test/TestUtils.h index 1b51195..9224d5b 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -2,11 +2,11 @@ #pragma once #include "cuda/DeviceTensor.cuh" -#include "torch/fb/fbcunn/src/CudaTensorUtils.h" -#include "torch/fb/fbcunn/src/DeviceTensorUtils.h" +#include "src/CudaTensorUtils.h" +#include "src/DeviceTensorUtils.h" #include "THCTensor.h" -#include "torch/fb/fbcunn/src/fft/CuFFTConvolution_UpdateOutput.cuh" -#include "torch/fb/fbcunn/src/fft/Utils.h" +#include "src/fft/CuFFTConvolution_UpdateOutput.cuh" +#include "src/fft/Utils.h" #include #include diff --git a/test/test.lua b/test/test.lua index 50b6f99..656bdbd 100644 --- a/test/test.lua +++ b/test/test.lua @@ -117,6 +117,50 @@ function fbcunntest.TemporalMaxPoolingBatch() 1e-4, 'error on backward batch') end +function fbcunntest.testDoG() + + -- load image: + require 'image' + local input = image.scale(image.lena(), 16, 16, 'bilinear'):cuda() + local numChannels = input:size(1) + + -- construct module: + local nOctaves = 3 + local nScalesPerOctave = 4 + local module = nn.DifferenceOfGaussian( + numChannels, + nOctaves, + nScalesPerOctave + ):cuda() + + -- test forward pass: + local output = module:forward(input) + assert(type(output) == 'table') + assert(#output == nOctaves) + for n = 1,nOctaves do + assert(output[n]:size(1) == nScalesPerOctave * numChannels) + end + + -- repeat the forward tests in batch mode: + local batchSize = 8 + local batchInput = input.new( + batchSize, + input:size(1), + input:size(2), + input:size(3) + ) + for n = 1,batchSize do + batchInput[n]:copy(input):add(torch.randn(input:size()):cuda(), 0.05) + end + output = module:forward(batchInput) + assert(type(output) == 'table') + assert(#output == nOctaves) + for n = 1,nOctaves do + assert(output[n]:size(1) == batchSize) + assert(output[n]:size(2) == nScalesPerOctave * numChannels) + end +end + function fbcunntest.Optim() require 'cunn' local fboptim = require 'fboptim' diff --git a/test/test_BatchNormalization.lua b/test/test_BatchNormalization.lua new file mode 100644 index 0000000..adae5ec --- /dev/null +++ b/test/test_BatchNormalization.lua @@ -0,0 +1,227 @@ +require 'fb.luaunit' +require 'cunn' +require 'fbcunn' +require 'nn' +require 'fbnn' + +local precision = 1e-4 +local threshold = 5e-5 +local relaxedPrecision = 5 * 0.01668 +local numRuns = 10 +local benchmark = false +local debug = false +local silence = true +local seed = os.time() +print('Seed: ', seed) +math.randomseed(seed) +torch.manualSeed(seed) + +local function BNTest( + refmod, gpumod, input, gradOutput, debug, benchmark, indim) + + if debug then + input:fill(1) + gradOutput:fill(1) + input:copy(torch.linspace(1, input:nElement(), input:nElement())) + gradOutput:copy(torch.linspace(1, input:nElement(), input:nElement())) + end + + -- batch norm without affine transform + local function assertDiff(ref, actual, name) + local rel, abs = 
nn.utils.relErr(ref, actual) + if abs > threshold then + assert(rel <= precision, + name .. ' max diff ' .. ' absolute ' .. abs) + else + assert(rel <= relaxedPrecision, + name .. ' max diff ' .. ' absolute ' .. abs) + end + end + + local function uniformInit(t1, t2) + t1:uniform() + t2:copy(t1) + end + + for _, affine in ipairs({false, true}) do + for _, train in ipairs({false, true}) do + if not silence then + print('affine, train', affine, train) + end + local modRef = refmod(indim, 1e-5, 0.1, affine):cuda() + local modGPU = gpumod(indim, 1e-5, 0.1, affine) + modGPU.train, modRef.train = train, train + + -- Preconditions + if affine then + -- Uniform both for testing purposes + uniformInit(modRef.bias, modGPU.bias) + uniformInit(modRef.weight, modGPU.weight) + assertDiff(modRef.bias, modGPU.bias, 'bias') + assertDiff(modRef.weight, modGPU.weight, 'weight') + end + uniformInit(modRef.running_std, modGPU.running_std) + uniformInit(modRef.running_mean, modGPU.running_mean) + assertDiff(modRef.running_std, modGPU.running_std, 'running_std') + assertDiff(modRef.running_mean, modGPU.running_mean, 'running_mean') + + -- UpdateOutput + modGPU:updateOutput(input) + modRef:updateOutput(input) + + if debug then + print('Input', input:float()) + print('GradOutput', gradOutput:float()) + print('weight', modGPU.weight:float()) + print('bias', modGPU.bias:float()) + print('Expected running_mean', modRef.running_mean:float()) + print('Actual running_mean', modGPU.running_mean:float()) + print('Expected running_std', modRef.running_std:float()) + print('Actual running_std', modGPU.running_std:float()) + print('Expected output', modRef.output:float()) + print('Actual output', modGPU.output:float()) + if train then + print('Expected centered', modRef.centered:float()) + print('Actual centered', modGPU.centered:float()) + print('Expected std', modRef.std:float()) + print('Actual std', modGPU.std:float()) + print('Expected normalized', modRef.normalized:float()) + print('Actual normalized', modGPU.normalized:float()) + end + end + + -- Postconditions + assertDiff(modRef.running_mean, modGPU.running_mean, 'running_mean') + assertDiff(modRef.running_std, modGPU.running_std, 'running_std') + if train then + assertDiff(modRef.centered, modGPU.centered, 'centered') + assertDiff(modRef.std, modGPU.std, 'std') + assertDiff(modRef.normalized, modGPU.normalized, 'normalized') + end + assertDiff(modRef.output, modGPU.output, 'output') + + + + if train then + -- Preconditions + assertDiff(modRef.centered, modGPU.centered, 'centered') + assertDiff(modRef.std, modGPU.std, 'std') + if affine then + assertDiff(modRef.weight, modGPU.weight, 'std') + end + + -- UpdateGradInput + modGPU:updateGradInput(input, gradOutput) + modRef:updateGradInput(input, gradOutput) + + if debug then + print('Expected gradInput', modRef.gradInput:float()) + print('Actual gradInput', modGPU.gradInput:float()) + end + + -- Postconditions + assertDiff(modRef.gradInput, modGPU.gradInput, 'gradInput') + + if affine then + -- Preconditions + -- gradBias and gradWeight are unintialized, users usually + -- call zeroGradParameters first, emulate this + uniformInit(modRef.gradBias, modGPU.gradBias) + uniformInit(modRef.gradWeight, modGPU.gradWeight) + assertDiff(modRef.gradBias, modGPU.gradBias, 'gradBias') + assertDiff(modRef.gradWeight, modGPU.gradWeight, 'gradWeight') + assertDiff(modRef.normalized, modGPU.normalized, 'normalized') + + local scale = torch.random(1000) / 1000.0 + if debug then + local val = 0 + gradOutput:apply( + function() + val = 
val + 1 + return val + end + ) + scale = 1.0 + modRef.normalized:copy(modGPU.normalized) + print('Normalized', modRef.normalized:float()) + print('GradOutput', gradOutput:float()) + end + + -- AccGradParameters + modGPU:accGradParameters(input, gradOutput, scale) + modRef:accGradParameters(input, gradOutput, scale) + + if debug then + print('Expected gradWeight', modRef.gradWeight:float()) + print('Actual gradWeight', modGPU.gradWeight:float()) + print('Expected gradBias', modRef.gradBias:float()) + print('Actual gradBias', modGPU.gradBias:float()) + end + + -- Postconditions + assertDiff(modRef.gradBias, modGPU.gradBias, 'gradBias') + assertDiff(modRef.gradWeight, modGPU.gradWeight, 'gradWeight') + end + end + end + end +end + +function testSpatialBatchNormalization() + for i = 1, numRuns do + local nframes, indim, ini, inj = torch.random(1, 17), + torch.random(1, 19), + torch.random(1, 35), + torch.random(1, 35) + if benchmark then + nframes, indim, ini, inj = 128, 64, 112, 112 + end + if debug then + nframes, indim, ini, inj = 1, 1, 5, 7 + end + + local input = torch.zeros(nframes, indim, ini, inj):uniform():cuda() + local gradOutput = torch.zeros(nframes, indim, ini, inj):uniform():cuda() + + BNTest(nn.SpatialBatchNormalization, + fbnn.SpatialBatchNormalization, + input, + gradOutput, + debug, + benchmark, + indim) + end +end + +function testBatchNormalization() + for i = 1, numRuns do + local nframes, indim = torch.random(1, 17), torch.random(1, 19) + if benchmark then + nframes, indim = 128, 4096 + end + if debug then + nframes, indim = 5, 7 + end + + local input = torch.zeros(nframes, indim):uniform():cuda() + local gradOutput = torch.zeros(nframes, indim):uniform():cuda() + + BNTest(nn.BatchNormalization, + fbnn.BatchNormalization, + input, + gradOutput, + debug, + benchmark, + indim) + end +end + +--[[ + precision = 1e-6 + numRuns = 10 + benchmark = false + debug = false + silence = true +--]] + +LuaUnit:main() diff --git a/test/test_ClassHierarchicalNLLCriterion.lua b/test/test_ClassHierarchicalNLLCriterion.lua index f89c0a6..58fbb54 100644 --- a/test/test_ClassHierarchicalNLLCriterion.lua +++ b/test/test_ClassHierarchicalNLLCriterion.lua @@ -219,10 +219,10 @@ for _, x in pairs{{criterion.clusterMatrix, criterion.clusterMatrixDx}, ) end if basic then - assert(math.abs( + local err = math.abs( criterion.classMatrixDx[i][j] - - modelDefault.modules[2].gradWeight[i][j]) < - 1e-16) + modelDefault.modules[2].gradWeight[i][j]) + assert(err < 1e-14, "failed error check : " .. err .. ' < ' .. 1e-14) end end end diff --git a/test/benchmark_cublas.lua b/test/test_CuBLAS.lua similarity index 56% rename from test/benchmark_cublas.lua rename to test/test_CuBLAS.lua index ce4acd7..138156a 100644 --- a/test/benchmark_cublas.lua +++ b/test/test_CuBLAS.lua @@ -1,13 +1,12 @@ -- Copyright 2004-present Facebook. All Rights Reserved. -require('fb.luaunit') - +require 'fb.luaunit' +require 'fbtorch' require 'cunn' - require 'fbcunn' torch.setdefaulttensortype('torch.FloatTensor') -local test = {} +local fb_test = {} -- Let C = m-by-n and A = m-by-k -- Format is m, n, k, seqIter, batch, numHandles, numStreams @@ -43,6 +42,16 @@ local problemSize = { {1, 1024, 512, {1}, {16 * 32}, 1, 1}, } +-- This test exercises the performance of multi-handle + multi-stream on many +-- small gemms. 
+local _testMultiHandlePerf = { + {513, 513, 513, {53}, {}, 0, 0}, + {513, 513, 513, {53}, {}, 1, 1}, + {513, 513, 513, {53}, {}, 1, 4}, + {513, 513, 513, {53}, {}, 4, 1}, + {513, 513, 513, {53}, {}, 4, 4}, +} + local function concat(t1,t2) local res = {} for i=1,#t1 do @@ -54,61 +63,10 @@ local function concat(t1,t2) return res end --- Soumith's inline print -local ndepth = 4 -local function print_inline(...) - local function rawprint(o) - io.write(tostring(o or '') .. ' ') - io.flush() - end - - local function printrecursive(obj, depth) - local depth = depth or 0 - local tab = 0 - local line = function(s) for i=1,tab do io.write(' ') end rawprint(s) end - if next(obj) then - line('{') - for k,v in pairs(obj) do - if type(v) == 'table' then - if depth >= (ndepth-1) or next(v) == nil then - line(tostring(k) .. ' : {}') - else - line(tostring(k) .. ' : ') printrecursive(v, depth + 1) - end - else - line(tostring(k) .. ' : ' .. v) - end - rawprint(',') - end - tab = tab-2 - line('}') - else - line('{}') - end - end - for i = 1,select('#',...) do - local obj = select(i,...) - if type(obj) ~= 'table' then - if type(obj) == 'userdata' or type(obj) == 'cdata' then - rawprint(obj) - else - io.write(obj .. '\t') - if i == select('#',...) then - rawprint() - end - end - elseif getmetatable(obj) and getmetatable(obj).__tostring then - rawprint(obj) - else - printrecursive(obj) - end - end -end - local function testLoop(problemSize) -- Just allocate some dummy placeholder to get to the proper -- function in the lua module - local net = nn.CuBLASWrapper() + local net = nn.CuBLASWrapper(true) local m = problemSize[1] local n = problemSize[2] @@ -125,13 +83,25 @@ local function testLoop(problemSize) local B = torch.Tensor(sB):cuda() local C = torch.Tensor(sC):cuda() - print_inline(problemSize) - print('') - net:matmult(A, B, C, seqIter, batch, handles, streams) + cutorch.reserveBlasHandles(handles) + cutorch.reserveStreams(streams) + cutorch.synchronize() + net:matmult(A, B, C, seqIter, batch) + mytester:assert(true) + cutorch.synchronize() collectgarbage() end -for i = 1, table.getn(problemSize) do - testLoop(problemSize[i]) +function fb_test.testGEMMs() + for i = 1, table.getn(_testMultiHandlePerf) do + testLoop(_testMultiHandlePerf[i]) + end + for i = 1, table.getn(problemSize) do + testLoop(problemSize[i]) + end end + +mytester = torch.Tester() +mytester:add(fb_test) +mytester:run() diff --git a/test/test_CuFFT.lua b/test/test_CuFFT.lua new file mode 100644 index 0000000..ccb4895 --- /dev/null +++ b/test/test_CuFFT.lua @@ -0,0 +1,310 @@ +-- Copyright 2004-present Facebook. All Rights Reserved. 
+require('fb.luaunit') +local torch = require('fbtorch') + +require 'cunn' +require 'fbcunn' +require 'cutorch' +require 'math' + +torch.setnumthreads(6) +torch.setdefaulttensortype('torch.FloatTensor') + +local mytester = torch.Tester() + +local precision = 1e-4 + +local test = {} +local printResults = false +local printMemory = false +local timeResults = false + +local kNumGPUs = 1 +local maxSize = 128000000 +local maxBatch = 4 +local maxInputPlanes = 13 +local maxOutputPlanes = 13 +local maxKernelSize = 7 +local maxInputSize = 60 + +local function timeFunction(printString, fun, module, arg1, arg2, arg3) + if not timeResults then + return fun(module, arg1, arg2, arg3) + end + + local numTrials = 5 + local time = 0 + for i = 1, numTrials do + local timer = torch.Timer() + cutorch.synchronize() + fun(module, arg1, arg2, arg3) + cutorch.synchronize() + if i > 1 then + time = time + timer:time().real + end + end + time = time / (numTrials - 1) + print(printString .. time * 1000 .. " ms") + + -- Avoid messing up the accGradParameters case, this is benchmarking + -- only so we're ok + module.gradBias:zero() + module.gradWeight:zero() + return fun(module, arg1, arg2, arg3) +end + +local function testLoop(problemSize) + local batchSize = problemSize[1] or 4 * torch.random(maxBatch) + local nInputPlanes = problemSize[2] or torch.random(maxInputSize) + local nOutputPlanes = problemSize[3] or torch.random(maxOutputPlanes) + local kH = problemSize[4] or torch.random(maxKernelSize) + -- If not specified, make it square to avoid blatant rectangular + -- inefficiences with FBFFT atm + local kW = problemSize[5] or torch.random(maxKernelSize) + local iH = problemSize[6] or + math.max(kH, torch.random(maxInputSize) + 4 - kH + 1) + -- If not specified, make it square to avoid blatant rectangular + -- inefficiences with FBFFT atm + local iW = problemSize[7] or + math.max(kW, torch.random(maxInputSize) + 4 - kW + 1) + + local padH = 0 + local padW = 0 + + -- Only small tests, having many small random tests that also + -- exercise synchronizations is far more valuable than bigger ones + if iW * iH * batchSize * nInputPlanes > maxSize then + return + end + if iW * iH * nOutputPlanes * nInputPlanes > maxSize then + return + end + if iW * iH * batchSize * nOutputPlanes > maxSize then + return + end + + local scale = torch.random(100) / 100.0 + print('Running ', batchSize, nInputPlanes, nOutputPlanes, + kH, kW, iH, iW, scale, " pad by ", padH, "x", padW) + + local net = + cudnn.SpatialConvolution(nInputPlanes, nOutputPlanes, + kW, kH, 1, 1, padW, padH):cuda() + local input = torch.CudaTensor(batchSize, nInputPlanes, iH, iW):normal() + local gradOutput = torch.CudaTensor(batchSize, + nOutputPlanes, + iH + 2 * padH - kH + 1, + iW + 2 * padW - kW + 1):normal() + net.gradWeight:zero() + net.gradBias:zero() + + local output = timeFunction("CUDNN updateOutput: ", + net.updateOutput, net, input, scale):float() + local gradInput = + timeFunction("CUDNN updateGradInput: ", + net.updateGradInput, net, input, gradOutput):float() + timeFunction("CUDNN accGradParameters: ", + net.accGradParameters, net, input, gradOutput, scale) + local gradWeight = net.gradWeight:float() + local gradBias = net.gradBias:float() + + local netCuFFT = {} + local outputCuFFT = {} + local gradInputCuFFT = {} + local gradWeightCuFFT = {} + local gradBiasCuFFT = {} + + for k = 1, kNumGPUs do -- Across kNumGPUs GPUs + if k > 1 then + cutorch.setDevice(k) + end + + netCuFFT[k] = + nn.SpatialConvolutionCuFFT(nInputPlanes, nOutputPlanes, + kW, kH, 1, 1, 
padW, padH) + netCuFFT[k].cudnnDebug = true + netCuFFT[k].gradWeight:zero() + netCuFFT[k].gradBias:zero() + netCuFFT[k].weight:copy(net.weight) + netCuFFT[k].bias:copy(net.bias) + netCuFFT[k]:cuda() + + outputCuFFT[k] = timeFunction("CuFFT updateOutput: ", + netCuFFT[k].updateOutput, + netCuFFT[k], + input, + scale):float() + gradInputCuFFT[k] = timeFunction("CuFFT updateGradInput: ", + netCuFFT[k].updateGradInput, + netCuFFT[k], + input, + gradOutput):float() + timeFunction("CuFFT accGradParameters: ", + netCuFFT[k].accGradParameters, + netCuFFT[k], + input, + gradOutput, + scale) + +--[[ + gradInputCuFFT[k] = timeFunction("CuFFT backward: ", + netCuFFT[k].backward, + netCuFFT[k], + input, + gradOutput, + scale):float() +--]] + + gradWeightCuFFT[k] = netCuFFT[k].gradWeight:float() + gradBiasCuFFT[k] = netCuFFT[k].gradBias:float() + + if printResults then + print("Padding WxH = ", padW, "x", padH) + local norm = math.sqrt(output:dot(output) + 1e-8) + print("updateOutputCuFFT", output:dist(outputCuFFT[k]) / norm) + local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) + print("updateGradInputCuFFT", + gradInput:dist(gradInputCuFFT[k]) / norm) + local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) + print("accGradParametersCuFFT (weight)", + gradWeight:dist(gradWeightCuFFT[k]) / norm) + local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) + print("accGradParametersCuFFT (bias)", + gradBias:dist(gradBiasCuFFT[k]) / norm) + end + + local norm = math.sqrt(output:dot(output) + 1e-8) + mytester:assertle(output:dist(outputCuFFT[k]) / norm, + precision, 'error on output') + local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) + mytester:assertle(gradInput:dist(gradInputCuFFT[k]) / norm, + precision, 'error on gradInput') + local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) + mytester:assertle(gradWeight:dist(gradWeightCuFFT[k]) / norm, + precision, 'error on gradWeight') + local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) + mytester:assertle(gradBias:dist(gradBiasCuFFT[k]) / norm, + precision, 'error on gradBias') + end + + return netCuFFT +end + +-- batch, inputPlanes, outputPlanes, kH, kW, iH, iW +local problemSizes = { + {1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 2}, + {1, 1, 1, 1, 1, 1, 3}, + {2, 1, 3, 1, 1, 1, 1}, + {2, 3, 1, 1, 1, 1, 1}, + {2, 3, 4, 5, 5, 5, 5}, + {1, 1, 1, 3, 3, 3, 3}, + {1, 1, 1, 2, 2, 2, 2}, + {1, 1, 1, 1, 2, 1, 2}, + {1, 1, 1, 1, 1, 2, 3}, + {2, 3, 4, 5, 5, 5, 5}, + {128, 64, 64, 1, 1, 1, 1}, + {128, 64, 100, 1, 1, 1, 1}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {128, 64, 64, 3, 3, 3, 3}, + {1, 1, 1, 7, 5, 13, 14}, + -- Cannot put in unit tests due to 5GB memory limit + -- {128, 128, 128, 3, 3, 128, 128}, -- falls back to cudnn +} + +local _problemSizesICLR2015 = { + {16, 16, 16, 3, 3, 13, 13}, + {16, 16, 16, 3, 3, 16, 16}, + {16, 16, 16, 3, 3, 27, 27}, + {16, 16, 16, 3, 3, 32, 32}, + {16, 16, 16, 3, 3, 57, 57}, + {16, 16, 16, 3, 3, 64, 64}, + {32, 32, 32, 3, 3, 13, 13}, + {32, 32, 32, 3, 3, 16, 16}, + {32, 32, 32, 3, 3, 27, 27}, + {32, 32, 32, 3, 3, 32, 32}, + {32, 32, 32, 3, 3, 57, 57}, + {32, 32, 32, 3, 3, 64, 64}, + {64, 64, 64, 3, 3, 13, 13}, + {64, 64, 64, 3, 3, 16, 16}, + {64, 64, 64, 3, 3, 27, 27}, + {64, 64, 64, 3, 3, 32, 32}, + {64, 64, 64, 3, 3, 57, 57}, + {64, 64, 64, 3, 3, 64, 64}, + {128, 128, 128, 3, 3, 13, 13}, + {128, 128, 128, 3, 3, 16, 16}, + {128, 128, 128, 3, 3, 27, 27}, + {128, 128, 128, 3, 3, 32, 32}, + {128, 128, 128, 3, 3, 57, 57}, + {128, 128, 128, 
3, 3, 64, 64}, +} + +local _problemSizesAlexNet = { + -- 1 GPU + {128, 96, 256, 5, 5, 31, 31}, + {128, 256, 384, 3, 3, 15, 15}, + {128, 384, 384, 3, 3, 15, 15}, + {128, 384, 256, 3, 3, 15, 15}, + -- 2 GPU model parallel + {128, 48, 128, 5, 5, 31, 31}, + {128, 256, 192, 3, 3, 15, 15}, + {128, 192, 192, 3, 3, 15, 15}, + {128, 192, 128, 3, 3, 15, 15}, + -- 4 GPU model parallel + {128, 24, 64, 5, 5, 31, 31}, + {128, 256, 96, 3, 3, 15, 15}, + {128, 96, 96, 3, 3, 15, 15}, + {128, 96, 64, 3, 3, 15, 15}, +} + +local function reportAndFree(net) + if printResults or printMemory then + local free, total = cutorch.getMemoryUsage() + print("Pre Collect Memory: " , free , " free " , total , " total") + end + assert(torch.type(net) == 'table', torch.type(net)) + -- Kill the local references to, as well as the global buffers + for i, v in ipairs(net) do + v:cleanupBuffers() + end + collectgarbage() + collectgarbage() + if printResults or printMemory then + local free, total = cutorch.getMemoryUsage() + print("Post Collect Memory: " , free , " free " , total , " total") + end +end + +local num_random_configurations = 100 +local problemsToRun = problemSizes + +--[[ +-- Convenient override of the default that are used for unit tests +local problemsToRun = _problemSizesAlexNet +local problemsToRun = _problemSizesICLR2015 +printMemory = true +timeResults = true +printResults = true + +num_random_configurations = 0 +printMemory = true +timeResults = true +--]] + +function test.test() + for i = 1, #problemsToRun do + local net = testLoop(problemsToRun[i]) + reportAndFree(net) + end + -- random configuration + for i = 1, num_random_configurations do + local net = testLoop({}) + reportAndFree(net) + end +end + +mytester:add(test) +mytester:run() diff --git a/test/test_DataParallel.lua b/test/test_DataParallel.lua index 72030e4..9662b5f 100644 --- a/test/test_DataParallel.lua +++ b/test/test_DataParallel.lua @@ -1,13 +1,9 @@ local fboptim = require('fboptim') -- Copyright 2004-present Facebook. All Rights Reserved. -local dprintL = (require 'fb.util.dbg').new('parallel') -local dprint = function(...) - return dprintL(1, ...) -end +require 'fb.luaunit' require 'optim' require 'fbcunn' -print 'Requiring cunn. This will take a while. Talk amongst yourselves.' require 'cunn' -- Hyper-params. We're targeting a toy problem that computes @@ -74,98 +70,92 @@ local function tensorsAreProbablySimilar(l, r, epsilon) return math.abs(l:norm() - r:norm()) < epsilon end --- Set up models on each GPU. 
-local dp = nn.DataParallel(1) -local simpleModels = {} -for i = 1,numGPUs do - if i == 1 then - simpleModels[i] = simpleModel() - else - simpleModels[i] = simpleModels[1]:clone() - end - dp:add(simpleModels[i]) -end - --- CPU models to cross-validate -local cpuModels = {} -local function syncCPUModels() - for i = 1,numGPUs do - cpuModels[i] = simpleModels[i]:clone() - cpuModels[i] = cpuModels[i]:double() - end -end -syncCPUModels() - --- Check an input/output pair against the CPU models -local function checkWideResult(inputs, outputs) - local function checkOneResult(input, modIdx, expectedOutput) - input = input:double() -- de-cudify - assert(tensorsAreProbablySimilar(cpuModels[modIdx]:forward(input), - expectedOutput)) - end - for j = 1, numGPUs do - checkOneResult(getNarrowedInput(inputs, j), j, outputs[{ {j} }]) - end -end - -local function checkCPUModelsAreEquivalent() - syncCPUModels() - local input = genInput() - local out = cpuModels[1]:forward(input) - for j = 2, numGPUs do - assert(tensorsAreProbablySimilar(out, cpuModels[j]:forward(input))) - end -end -checkCPUModelsAreEquivalent() - -dp:cuda() - --- Make sure forward produces same results as an individual copy -print('forward test {') -for i=1, 10 do - local inputs, targets = genWideExample() - dprint{ inputs, targets } - local outputs = dp:forward(inputs) - syncCPUModels() - checkWideResult(inputs, outputs) -end -print('} forward test done') - -print('optim test {') -local optimState = { - learningRate = 1e-1, - weightDecay = 1e-4, - momentum = 0.9, - learningRateDecay = 1e-7 -} - -local timer = torch.Timer() -local opt = nn.Optim(dp, optimState) -local criterion = nn.MSECriterion():cuda() - -local num_iteration = 10 -timer:reset() -for i=1, num_iteration do - local inputs, targets = genWideExample() - local outputs = dp:forward(inputs) - syncCPUModels() - checkWideResult(inputs, outputs) - opt:optimize(fboptim.sgd, inputs, targets, criterion) - local out = dp:forward(inputs) - local err = criterion:forward(out, targets) - print(i, err) -end -print(string.format("Total time spent = %f", timer:time().real / num_iteration)) -checkCPUModelsAreEquivalent() -print('} optim test done ') - --- Check only the speed for forward/backward. -timer:reset(); -for i=1, num_iteration do - local inputs, targets = genWideExample() - dp:forward(inputs) - opt:optimize(fboptim.sgd, inputs, targets, criterion) -end -print(string.format( - "Speedtest: Total time spent = %f", - timer:time().real / num_iteration)); +function testDataParallel() + -- Set up models on each GPU. 
+ local dp = nn.DataParallel(1) + local simpleModels = {} + for i = 1,numGPUs do + if i == 1 then + simpleModels[i] = simpleModel() + else + simpleModels[i] = simpleModels[1]:clone() + end + dp:add(simpleModels[i]) + end + + -- CPU models to cross-validate + local cpuModels = {} + local function syncCPUModels() + for i = 1,numGPUs do + cpuModels[i] = simpleModels[i]:clone() + cpuModels[i] = cpuModels[i]:double() + end + end + syncCPUModels() + + -- Check an input/output pair against the CPU models + local function checkWideResult(inputs, outputs) + local function checkOneResult(input, modIdx, expectedOutput) + input = input:double() -- de-cudify + assert(tensorsAreProbablySimilar(cpuModels[modIdx]:forward(input), + expectedOutput)) + end + for j = 1, numGPUs do + checkOneResult(getNarrowedInput(inputs, j), j, outputs[{ {j} }]) + end + end + + local function checkCPUModelsAreEquivalent() + syncCPUModels() + local input = genInput() + local out = cpuModels[1]:forward(input) + for j = 2, numGPUs do + assert(tensorsAreProbablySimilar(out, cpuModels[j]:forward(input))) + end + end + checkCPUModelsAreEquivalent() + + dp:cuda() + + -- Make sure forward produces same results as an individual copy + for i=1, 10 do + local inputs, targets = genWideExample() + local outputs = dp:forward(inputs) + syncCPUModels() + checkWideResult(inputs, outputs) + end + + local optimState = { + learningRate = 1e-1, + weightDecay = 1e-4, + momentum = 0.9, + learningRateDecay = 1e-7 + } + + local timer = torch.Timer() + local opt = nn.Optim(dp, optimState) + local criterion = nn.MSECriterion():cuda() + + local num_iteration = 10 + timer:reset() + for i=1, num_iteration do + local inputs, targets = genWideExample() + local outputs = dp:forward(inputs) + syncCPUModels() + checkWideResult(inputs, outputs) + opt:optimize(fboptim.sgd, inputs, targets, criterion) + local out = dp:forward(inputs) + local err = criterion:forward(out, targets) + end + checkCPUModelsAreEquivalent() + + -- Check only the speed for forward/backward. + timer:reset(); + for i=1, num_iteration do + local inputs, targets = genWideExample() + dp:forward(inputs) + opt:optimize(fboptim.sgd, inputs, targets, criterion) + end +end + +LuaUnit:main() diff --git a/test/test_DataParallelComprehensive.lua b/test/test_DataParallelComprehensive.lua deleted file mode 100755 index 6df38ce..0000000 --- a/test/test_DataParallelComprehensive.lua +++ /dev/null @@ -1,132 +0,0 @@ --- Copyright 2004-present Facebook. All Rights Reserved. - -require 'optim' -require 'cunn' -require 'fbcunn' -- For nn.DataParallel -require 'fbnn' -- For nn.Optim - -local base_gpu = 1 -- Primary GPU to use -local num_gpus = 2 -- We will use {base_gpu, base_gpu+1, etc} with modulus -torch.setdefaulttensortype('torch.DoubleTensor') -torch.setnumthreads(8) -cutorch.setDevice(base_gpu) - --- Create an instance of the test framework -local precision = 5e-4 -local mytester = torch.Tester() -local test = {} - -function copyTable(x) -- Shallow copy - local ret = {} - for k,v in pairs(x) do ret[k] = v end - return ret -end - --- Build a dummy binary classifier. We will split the BATCHES across GPUs. 
-function buildNet(width, height, pool, feat, filt, num_convs) - local net = nn.Sequential() - assert(math.fmod(filt,2) == 1) - for i = 1, num_convs do - local fin = 3 - if (i > 1) then fin = feat end - net:add(nn.SpatialConvolutionMM(fin, feat, filt, filt, 1, 1, (filt-1)/2)) - net:add(nn.Threshold()) - end - net:add(nn.SpatialMaxPooling(pool, pool)) - net:add(nn.Reshape(width * height * feat / (pool * pool))) - net:add(nn.Linear(width * height * feat / (pool * pool), 2)) - -- net:add(nn.SoftMax()) -- This is fake anyway, so just do regression :-) - return net -end - -function test.DataParallel() - collectgarbage() - local width = 16 - local height = 16 - local pool = 4 - local feat = 8 - local filt = 5 - local num_convs = 2 - local num_sgd_steps = 2 - local sync_gpu_cpu_params_every = 1 - local batch_size = 2 * num_gpus - - -- Build a CPU model - local cpu_net = buildNet(width, height, pool, feat, filt, num_convs) - - -- Build a multi-GPU model - local gpu_net = nn.DataParallel(1):cuda() - for i = 1, num_gpus do - local cur_gpu = math.fmod(base_gpu + (i-1)-1, cutorch.getDeviceCount())+1 - cutorch.setDevice(cur_gpu) - gpu_net:add(cpu_net:clone():cuda(), cur_gpu) - end - cutorch.setDevice(base_gpu) - - local cpu_input = torch.rand(batch_size, 3, height, width) - local gpu_input = cpu_input:cuda() - local cpu_target = torch.rand(batch_size, 2) - local gpu_target = cpu_target:cuda() - - -- Set up an MSE optimizer on the GPU and CPU - local optim_state_cpu = { - learningRate = 1, -- Artificially big learning rate - weightDecay = 0, - } - local optim_state_gpu = copyTable(optim_state_cpu) - local opt_cpu = nn.Optim(cpu_net, optim_state_cpu) - local opt_gpu = nn.Optim(gpu_net, optim_state_gpu) - - local criterion_cpu = nn.MSECriterion() - local criterion_gpu = criterion_cpu:clone():cuda() - - for i = 1, num_sgd_steps do - collectgarbage() - - -- Perform an SGD step on the GPU and CPU - opt_cpu:optimize(optim.sgd, cpu_input, cpu_target, criterion_cpu) - opt_gpu:optimize(optim.sgd, gpu_input, gpu_target, criterion_gpu) - assert(cutorch.getDevice() == base_gpu, - 'DataParallel didnt restore GPU state to base_gpu') - - -- Now make sure that everything is the same - local cpu_output = cpu_net.output - local gpu_output = gpu_net.output - local cpu_gradInput = cpu_net.gradInput - local gpu_gradInput = gpu_net.gradInput - local cpu_params, cpu_gradParams = cpu_net:parameters() - local gpu_params, gpu_gradParams = gpu_net:get(1):parameters() - - mytester:assertlt((cpu_output - gpu_output:double()):abs():max(), - precision, 'fprop error ') - mytester:assertlt((criterion_cpu.gradInput - - criterion_gpu.gradInput:double()):abs():max(), precision, - 'CRITERION BPROP error ') - mytester:asserteq(#cpu_params, #gpu_params) - for j = 1, #cpu_params do - mytester:assertlt((cpu_params[j] - gpu_params[j]:double()):abs():max(), - precision, 'parameters error ') - end - mytester:asserteq(#cpu_gradParams, #gpu_gradParams) - for j = 1, #cpu_gradParams do - mytester:assertlt((cpu_gradParams[j] - - gpu_gradParams[j]:double()):abs():max(), precision, - 'BPROP error (gradParams)') - end - mytester:assertlt((cpu_gradInput - gpu_gradInput:double()):abs():max(), - precision, 'BPROP error (gradInput)') - - -- Sync the CPU and GPU weights every few "epochs" to prevent floating point - -- drift between SGD iterations (ie, they will eventually be divergent after - -- enough iterations) - if math.fmod(i, sync_gpu_cpu_params_every) == 0 then - for j = 1, #cpu_gradParams do - cpu_params[j]:copy(gpu_params[j]) - end - end - end -end 
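-- A minimal sketch of the sync-and-check pattern exercised by the test above:
-- after each SGD step the CPU reference model and the first replica of the
-- nn.DataParallel GPU model are compared parameter by parameter, and the GPU
-- weights are then copied back so floating-point drift cannot accumulate
-- across iterations. Names follow the test above; the default tolerance
-- mirrors its `precision` value and is otherwise an assumption.
local function syncAndCheckParams(cpu_net, gpu_net, precision)
   precision = precision or 5e-4
   local cpu_params = cpu_net:parameters()
   local gpu_params = gpu_net:get(1):parameters() -- parameters live in the first replica
   assert(#cpu_params == #gpu_params)
   for j = 1, #cpu_params do
      -- bound the drift accumulated since the last sync
      local drift = (cpu_params[j] - gpu_params[j]:double()):abs():max()
      assert(drift < precision, 'CPU/GPU parameter drift: ' .. drift)
      -- re-sync so both models start the next step from identical weights
      cpu_params[j]:copy(gpu_params[j])
   end
end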
- --- Now run the test above -mytester:add(test) -mytester:run() diff --git a/test/test_FBFFTTiling.lua b/test/test_FBFFTTiling.lua new file mode 100644 index 0000000..bb9dfbc --- /dev/null +++ b/test/test_FBFFTTiling.lua @@ -0,0 +1,208 @@ +require 'cunn' +require 'fbcunn' +require 'math' + +require 'fb.luaunit' +require('fbtorch') +g_mytester = torch.Tester() +local fb_test = {} + +local silence = true +local timeResults = false +local printDebug = false +local printMemory = false +local testCuDNN = true +local runUpdateOutput = true +local runUpdateGradInput = true +local runAccGradParameters = true + +local function reportAndFree(net) + if printMemory then + local free, total = cutorch.getMemoryUsage() + if not silence then + print('Pre Collect Memory: ' , free , ' free ' , total , ' total') + end + end + -- release entries from the global buffer table + if net then + net:cleanupBuffers() + net = nil + end + collectgarbage() + collectgarbage() + if printMemory then + local free, total = cutorch.getMemoryUsage() + if not silence then + print('Post Collect Memory: ' , free , ' free ' , total , ' total') + end + end +end + +local function testTiledFFT(problem, FFTConvolutionClass) + local batches = problem[1] or torch.random(16) + local inputPlanes = problem[2] or torch.random(16) + local outputPlanes = problem[3] or torch.random(16) + -- Values that make sense, start from kernel size + local kH = problem[6] or 4 + math.random(11) + local kW = problem[7] or 4 + math.random(11) + local iH = problem[4] or 1 + 2 * kH + math.random(13) + local iW = problem[5] or 1 + 2 * kW + math.random(13) + local tileH = kH + math.random(5) + tileH = problem[8] or math.min(tileH, iH - 1) + local tileW = kW + math.random(5) + tileW = problem[9] or math.min(tileW, iW - 1) + local padH = problem[10] or math.min(kH - 1, tileH - kH, math.random(7)) + local padW = problem[11] or math.min(kW - 1, tileW - kW, math.random(7)) + local reuseRandom = math.min(torch.random(5) % 5 + 1) + local reuses = { + nn.SpatialConvolutionFFT.memoryReuseNone, + nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseWeight, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseAll, + } + local reuse = problem[12] or reuses[reuseRandom] + + if not silence then + print('Running ', batches, inputPlanes, outputPlanes, + ' kH = ', kH, ' x ', 'kW = ', kW, + ' x ', 'iH = ', iH, ' x ', 'iW = ', iW, + ' x ', 'padH = ', padH, ' x ', padW, ' tile by ', tileH, 'x', tileW, + ' reuse = ', reuse) + end + + -- Testing tiling, 1 batch, input plane, output plane are enough + local ps = {batches, inputPlanes, iH, iW} + local input = torch.Tensor(torch.LongStorage(ps)):cuda():normal() + local ps = {batches, + outputPlanes, + iH - kH + 2 * padH + 1, + iW - kW + 2 * padW + 1} + local gradOutput = torch.Tensor(torch.LongStorage(ps)):cuda():normal() + local scale = torch.uniform() + local net = FFTConvolutionClass(inputPlanes, + outputPlanes, + kW, + kH, + 1, + 1, + padW, + padH, + tileW, + tileH, + reuse):cuda() + net.cudnnDebug = testCuDNN -- this line activates internal testing vs CuDNN + + if silence then + net.reportErrors = false + end + + if runUpdateOutput then + net.printDebugLevel = -1 + if net.printDebugLevel >= 3 then + -- Nasty debugging to be expected + local val = 1 + input:apply(function() val = val + 1 return val end) + local val = 1 + net.weight:apply(function() val = val + 1 return val end) + end + + net:updateOutput(input) + end + + + if runUpdateGradInput then + net.printDebugLevel 
= -1 + if net.printDebugLevel >= 3 then + -- Nasty debugging to be expected + local val = 1 + gradOutput:apply(function() val = val + 1 return val end) + local val = 1 + net.weight:apply(function() val = val + 1 return val end) + end + + net:updateGradInput(input, gradOutput) + end + + + if runAccGradParameters then + net.printDebugLevel = -1 + if net.printDebugLevel >= 3 then + -- Nasty debugging to be expected + scale = 1.0 + local val = 1 + input:apply(function() val = val + 1 return val end) + local val = 1 + gradOutput:apply(function() val = val + 1 return val end) + end + net:accGradParameters(input, gradOutput, scale) + end + + g_mytester:assert(net.cudnnChecks) + + return net +end + + +local problemsToRun = { + -- iH, iW, kH, kW, tileH, tileW, padH, padW, reuse + {2, 2, 2, 12, 12, 3, 3, 8, 8, 0, 0, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {2, 2, 2, 128, 128, 3, 3, 16, 16, 0, 0, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 112, 112, 3, 3, 32, 32, 0, 0, + nn.SpatialConvolutionFFT.memoryReuseAll}, +} + +local numTests = 25 + +-- Convenient override of the default that are used for unit tests +-- numTests = 1 +-- silence = false +-- timeResults = true +-- printDebug = false +-- printMemory = false +-- runUpdateOutput = true +-- runUpdateGradInput = true +-- runAccGradParameters = true + +local testSync = true +local testAsync = true +local testIterated = true +function fb_test.testTiledFFT() + for i = 1, #problemsToRun do + if testSync then + local net = + testTiledFFT(problemsToRun[i], nn.SpatialConvolutionFFTTiledSync) + reportAndFree(net) + end + if testAsync then + local net = + testTiledFFT(problemsToRun[i], nn.SpatialConvolutionFFTTiledAsync) + reportAndFree(net) + end + if testIterated then + local net = testTiledFFT( + problemsToRun[i], nn.SpatialConvolutionFFTTiledIterated) + reportAndFree(net) + end + end + for step = 1, numTests do + if testSync then + local net = testTiledFFT({}, nn.SpatialConvolutionFFTTiledSync) + reportAndFree(net) + end + if testAsync then + local net = testTiledFFT({}, nn.SpatialConvolutionFFTTiledAsync) + reportAndFree(net) + end + if testIterated then + local net = testTiledFFT({}, nn.SpatialConvolutionFFTTiledIterated) + reportAndFree(net) + end + end +end + +g_mytester = torch.Tester() +g_mytester:add(fb_test) +g_mytester:run() diff --git a/test/test_FFT.lua b/test/test_FFT.lua index 04259d4..a2e64f9 100644 --- a/test/test_FFT.lua +++ b/test/test_FFT.lua @@ -1,6 +1,6 @@ -- Copyright 2004-present Facebook. All Rights Reserved. 
--- require('fb.luaunit') -local torch = require('fbtorch') +require('fb.luaunit') +require('fbtorch') require 'cunn' require 'fbcunn' @@ -15,19 +15,72 @@ local mytester = torch.Tester() local precision = 1e-4 local test = {} +local silence = true local printResults = false +local printMemory = false +local timeResults = false +local skipTest = false -local kNumGPUs = 1 -local maxSize = 128000000 +local maxSize = 1e30 local maxBatch = 4 local maxInputPlanes = 13 local maxOutputPlanes = 13 local maxKernelSize = 7 -local maxInputSize = 60 +local maxInputSize = 32 - maxKernelSize -local function testLoop(problemSize) + +local function reportAndFree(net) + if (printResults or printMemory) and not silence then + local free, total = cutorch.getMemoryUsage() + print('Pre Collect Memory: ' , free , ' free ' , total , ' total', + total - free, 'consumption') + end + -- release entries from the global buffer table + if net then + net:cleanupBuffers() + net = nil + end + collectgarbage() + collectgarbage() + if (printResults or printMemory) and not silence then + local free, total = cutorch.getMemoryUsage() + print('Post Collect Memory: ' , free , ' free ' , total , ' total', + total - free, 'consumption') + end +end + +local function timeFunction( + printString, fun, module, arg1, arg2, arg3, arg4, arg5) + if not timeResults then + return fun(module, arg1, arg2, arg3, arg4, arg5) + end + + local numTrials = 5 + local time = 0 + for i = 1, numTrials do + local timer = torch.Timer() + cutorch.synchronize() + fun(module, arg1, arg2, arg3, arg4, arg5) + cutorch.synchronize() + if i > 1 then + time = time + timer:time().real + end + end + time = time / (numTrials - 1) + if not silence then + print(printString .. time * 1000 .. ' ms') + end + + -- Avoid messing up the accGradParameters case, this is benchmarking + -- only so we're ok + module.gradBias:zero() + module.gradWeight:zero() + return fun(module, arg1, arg2, arg3, arg4, arg5) +end + +local function testLoop(problemSize, fftImplementation) local batchSize = problemSize[1] or 4 * torch.random(maxBatch) - local nInputPlanes = problemSize[2] or torch.random(maxInputSize) + local nInputPlanes = problemSize[2] or torch.random(maxInputPlanes) local nOutputPlanes = problemSize[3] or torch.random(maxOutputPlanes) local kH = problemSize[4] or torch.random(maxKernelSize) -- If not specified, make it square to avoid blatant rectangular @@ -39,6 +92,28 @@ local function testLoop(problemSize) -- inefficiences with FBFFT atm local iW = problemSize[7] or math.max(kW, torch.random(maxInputSize) + 4 - kW + 1) + local padH = problemSize[8] or math.min(torch.random(5) % 5, kH - 1) + local padW = problemSize[9] or math.min(torch.random(5) % 5, kW - 1) + local tileH = problemSize[10] + local tileW = problemSize[11] + local reuseRandom = math.min(torch.random(5) % 5 + 1) + local reuses = { + nn.SpatialConvolutionFFT.memoryReuseNone, + nn.SpatialConvolutionFFT.memoryReuseInput, + nn.SpatialConvolutionFFT.memoryReuseWeight, + nn.SpatialConvolutionFFT.memoryReuseOutput, + nn.SpatialConvolutionFFT.memoryReuseAll, + } + local reuse = problemSize[12] or reuses[reuseRandom] + + if fftImplementation == 'cufft' then + iW = iW + 2 * padW + iH = iH + 2 * padH + padW = 0 + padH = 0 + tileW = nil + tileH = nil + end -- Only small tests, having many small random tests that also -- exercise synchronizations is far more valuable than bigger ones @@ -53,106 +128,145 @@ local function testLoop(problemSize) end local scale = torch.random(100) / 100.0 - print('Running ', - batchSize, 
nInputPlanes, nOutputPlanes, kH, kW, iH, iW, scale) + if not silence then + print('Running ', batchSize, nInputPlanes, nOutputPlanes, + kH, kW, iH, iW, scale, ' pad by ', padH, 'x', padW, + ' tile by ', tileH, 'x', tileW, ' reuse ', reuse) + end + + local input = torch.CudaTensor(batchSize, nInputPlanes, iH, iW):normal() + local gradOutput = torch.CudaTensor(batchSize, + nOutputPlanes, + iH + 2 * padH - kH + 1, + iW + 2 * padW - kW + 1):normal() + + local netCuDNN, output, gradInput, gradWeight, gradBias + -- Convenient way to skip tests to debug performance + if not skipTest then + netCuDNN = + cudnn.SpatialConvolution(nInputPlanes, nOutputPlanes, + kW, kH, 1, 1, padW, padH):cuda() + netCuDNN.gradWeight:zero() + netCuDNN.gradBias:zero() + + output = + timeFunction('CUDNN updateOutput: ', netCuDNN.updateOutput, + netCuDNN, input, scale):float() + gradInput = + timeFunction('CUDNN updateGradInput: ', netCuDNN.updateGradInput, + netCuDNN, input, gradOutput):float() + timeFunction('CUDNN accGradParameters: ', netCuDNN.accGradParameters, + netCuDNN, input, gradOutput, scale) + gradWeight = netCuDNN.gradWeight:float() + gradBias = netCuDNN.gradBias:float() + end + + local net + if tileH and tileW then + net = + nn.SpatialConvolutionFFTTiled(nInputPlanes, + nOutputPlanes, + kW, + kH, + 1, + 1, + padW, + padH, + tileW, + tileH, + reuse) + else + if fftImplementation == 'fbfft' then + net = nn.SpatialConvolutionFBFFT( + nInputPlanes, nOutputPlanes, kW, kH, 1, 1, padW, padH, reuse) + elseif fftImplementation == 'cufft' then + net = nn.SpatialConvolutionCuFFT( + nInputPlanes, nOutputPlanes, kW, kH, 1, 1, padW, padH, reuse) + elseif fftImplementation == 'fbfftgemm' then + net = nn.SpatialConvolutionFBFFTGemm( + nInputPlanes, nOutputPlanes, kW, kH, 1, 1, padW, padH, reuse) + else + assert(false, 'Unknown fftImplementation ' .. 
fftImplementation) + end + end - local net = nn.SpatialConvolution(nInputPlanes, nOutputPlanes, kW, kH) - local input = torch.Tensor(batchSize, nInputPlanes, iH, iW):normal() - local gradOutput = - torch.Tensor(batchSize, nOutputPlanes, iH-kH+1, iW-kW+1):normal() + local name = fftImplementation + net:cuda() net.gradWeight:zero() net.gradBias:zero() - local output = net:updateOutput(input, scale):clone() - - local gradInput = net:updateGradInput(input, gradOutput):clone() - net:accGradParameters(input, gradOutput, scale) - local gradWeight = net.gradWeight:clone() - local gradBias = net.gradBias:clone() - - for j = 1,kNumGPUs do -- test cuda resources reuse with kNumGPUs iterations - local netCuFFT = {} - local outputCuFFT = {} - local gradInputCuFFT = {} - local gradWeightCuFFT = {} - local gradBiasCuFFT = {} - - for k = 1, kNumGPUs do -- Across kNumGPUs GPUs - if k > 1 then - cutorch.setDevice(k) - end - - netCuFFT[k] = - nn.SpatialConvolutionCuFFT(nInputPlanes, nOutputPlanes, kW, kH) - netCuFFT[k].debug = true - netCuFFT[k].gradWeight:zero() - netCuFFT[k].gradBias:zero() - netCuFFT[k].weight:copy(net.weight) - netCuFFT[k].bias:copy(net.bias) - netCuFFT[k]:cuda() - - outputCuFFT[k] = - netCuFFT[k]:updateOutput(input:clone():cuda(), scale):float() - gradInputCuFFT[k] = - netCuFFT[k]:updateGradInput(input:clone():cuda(), - gradOutput:clone():cuda()):float() - netCuFFT[k]:accGradParameters(input:clone():cuda(), - gradOutput:clone():cuda(), scale) - gradWeightCuFFT[k] = netCuFFT[k].gradWeight:clone():float() - gradBiasCuFFT[k] = netCuFFT[k].gradBias:clone():float() - - if printResults then + if netCuDNN then + net.weight:copy(netCuDNN.weight) + net.bias:copy(netCuDNN.bias) + end + -- net.cudnnDebug = false + -- net.printDebugLevel = -1 + + local outputFFT = timeFunction(name .. 'updateOutput: ', + net.updateOutput, + net, + input):float() + + local gradInputFFT = timeFunction(name .. 'updateGradInput: ', + net.updateGradInput, + net, + input, + gradOutput):float() + timeFunction(name .. 'accGradParameters: ', + net.accGradParameters, + net, + input, + gradOutput, + scale) + + if not skipTest then + local gradWeightFFT = net.gradWeight:float() + local gradBiasFFT = net.gradBias:float() + + if printResults and not silence then local norm = math.sqrt(output:dot(output) + 1e-8) - print("updateOutputCuFFT", output:dist(outputCuFFT[k]) / norm) + print('updateOutput' .. name, output:dist(outputFFT) / norm) local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) - print("updateGradInputCuFFT", - gradInput:dist(gradInputCuFFT[k]) / norm) + print('updateGradInput' .. name, + gradInput:dist(gradInputFFT) / norm) local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) - print("accGradParametersCuFFT (weight)", - gradWeight:dist(gradWeightCuFFT[k]) / norm) + print('accGradParameters' .. name .. 
' (weight)', + gradWeight:dist(gradWeightFFT) / norm) local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) - print("accGradParametersCuFFT (bias)", - gradBias:dist(gradBiasCuFFT[k]) / norm) - end - - - local norm = math.sqrt(output:dot(output) + 1e-8) - mytester:assertle(output:dist(outputCuFFT[k]) / norm, - precision, 'error on output') - local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) - mytester:assertle(gradInput:dist(gradInputCuFFT[k]) / norm, - precision, 'error on gradInput') - local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) - mytester:assertle(gradWeight:dist(gradWeightCuFFT[k]) / norm, - precision, 'error on gradWeight') - local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) - mytester:assertle(gradBias:dist(gradBiasCuFFT[k]) / norm, - precision, 'error on gradBias') - end - end - - if printResults then - local free_bytes, total_bytes = cutorch.getMemoryUsage() - print ("free after collection, total", free_bytes, " ", total_bytes) - end - - collectgarbage() - - if printResults then - local free_bytes, total_bytes = cutorch.getMemoryUsage() - print ("free after collection, total", free_bytes, " ", total_bytes) - end + print('accGradParameters' .. name .. ' (bias)', + gradBias:dist(gradBiasFFT) / norm) + end + + local norm = math.sqrt(output:dot(output) + 1e-8) + mytester:assertle(output:dist(outputFFT) / norm, + precision, 'error on output') + local norm = math.sqrt(gradInput:dot(gradInput) + 1e-8) + mytester:assertle(gradInput:dist(gradInputFFT) / norm, + precision, 'error on gradInput') + local norm = math.sqrt(gradWeight:dot(gradWeight) + 1e-8) + mytester:assertle(gradWeight:dist(gradWeightFFT) / norm, + precision, 'error on gradWeight') + local norm = math.sqrt(gradBias:dot(gradBias) + 1e-8) + mytester:assertle(gradBias:dist(gradBiasFFT) / norm, + precision, 'error on gradBias') + end + + return net end -- batch, inputPlanes, outputPlanes, kH, kW, iH, iW local problemSizes = { {1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 2}, + {1, 1, 1, 1, 1, 1, 3}, + {1, 1, 1, 3, 3, 4, 4}, + {1, 1, 1, 3, 3, 8, 8}, + {2, 1, 3, 1, 1, 1, 1}, + {2, 3, 1, 1, 1, 1, 1}, {2, 3, 4, 5, 5, 5, 5}, {1, 1, 1, 3, 3, 3, 3}, {1, 1, 1, 2, 2, 2, 2}, {1, 1, 1, 1, 2, 1, 2}, {1, 1, 1, 1, 1, 2, 3}, - {1, 1, 1, 1, 1, 1, 2}, - {1, 1, 1, 1, 1, 1, 1}, {2, 3, 4, 5, 5, 5, 5}, {128, 64, 64, 1, 1, 1, 1}, {128, 64, 100, 1, 1, 1, 1}, @@ -161,6 +275,21 @@ local problemSizes = { {128, 64, 64, 3, 3, 3, 3}, {128, 64, 64, 3, 3, 3, 3}, {128, 64, 64, 3, 3, 3, 3}, + {1, 1, 1, 7, 5, 13, 14}, + -- Cannot put in unit tests due to 5GB memory limit + -- {128, 128, 128, 3, 3, 128, 128}, -- falls back to cudnn + {1, 1, 1, 5, 5, 27, 27, 0, 0}, + {1, 1, 1, 5, 5, 27, 27, 1, 0}, + {1, 1, 1, 5, 5, 27, 27, 0, 1}, + {1, 1, 1, 5, 5, 27, 27, 1, 2}, + {1, 1, 1, 5, 5, 27, 27, 2, 1}, + {1, 1, 1, 5, 5, 27, 27, 2, 2}, + {1, 1, 1, 3, 4, 19, 23, 0, 0}, + {1, 1, 1, 3, 4, 19, 23, 1, 0}, + {1, 1, 1, 3, 4, 19, 23, 0, 1}, + {1, 1, 1, 3, 4, 19, 23, 1, 2}, + {1, 1, 1, 3, 4, 19, 23, 2, 1}, + {1, 1, 1, 3, 4, 19, 23, 2, 2}, } local _problemSizesICLR2015 = { @@ -191,33 +320,293 @@ local _problemSizesICLR2015 = { } local _problemSizesAlexNet = { - -- 1 GPU - {128, 96, 256, 5, 5, 31, 31}, - {128, 256, 384, 3, 3, 15, 15}, - {128, 384, 384, 3, 3, 15, 15}, - {128, 384, 256, 3, 3, 15, 15}, - -- 2 GPU model parallel - {128, 48, 128, 5, 5, 31, 31}, - {128, 256, 192, 3, 3, 15, 15}, - {128, 192, 192, 3, 3, 15, 15}, - {128, 192, 128, 3, 3, 15, 15}, - -- 4 GPU model parallel - {128, 24, 64, 5, 5, 31, 31}, - {128, 256, 96, 3, 3, 15, 15}, - {128, 96, 96, 3, 3, 
15, 15}, - {128, 96, 64, 3, 3, 15, 15}, + -- 1 GPU + {128, 96, 256, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 96, 256, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 384, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 384, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + -- 2 GPU model parallel + {128, 48, 128, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 48, 128, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 192, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 192, 192, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 192, 128, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 192, 128, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + -- 4 GPU model parallel + {128, 24, 64, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 24, 64, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 96, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 96, 96, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 96, 64, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 96, 64, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, } -local num_random_configurations = 5 +local _problemSizesVGG = { + {64, 64, 64, 3, 3, 32, 32, 0, 0, 8, 8, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 8, 8, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + -- Test fallback to FBFFT convolutions + {64, 64, 64, 3, 3, 32, 32, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 32, 32, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 64, 64, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 128, 128, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 64, 3, 3, 128, 128, 0, 0, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 64, 3, 3, 128, 128, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + 
{64, 64, 64, 3, 3, 128, 128, 0, 0, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 16, 16, + nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, 32, 32, + nn.SpatialConvolutionFFT.memoryReuseAll}, +} + + +-- These should correspond with Soumith's benchmarks +-- https://raw.githubusercontent.com/soumith/convnet-benchmarks/master/torch7/imagenet_winners/output_raw.log +local _benchmarkAlexNet = { + -- 1 GPU + {128, 64, 192, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 192, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + -- 1 GPU + {128, 64, 192, 5, 5, 27, 27, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 192, 384, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 384, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 256, 3, 3, 13, 13, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local _benchmarkOverFeat = { + -- 1 GPU + {128, 96, 256, 5, 5, 24, 24, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 256, 512, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 512, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {128, 1024, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + -- 1 GPU + {128, 96, 256, 5, 5, 24, 24, 2, 2, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 256, 512, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 512, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {128, 1024, 1024, 3, 3, 12, 12, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local _benchmarkVGG = { + -- 1 GPU + {64, 3, 64, 3, 3, 224, 224, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 128, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + + {64, 256, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseAll}, + + {64, 256, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 512, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + {64, 512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + {64, 512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseAll}, + + -- 1 GPU + {64, 3, 64, 3, 3, 224, 224, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 64, 128, 3, 3, 112, 112, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + + {64, 256, 256, 3, 3, 56, 56, 1, 1, + 32, 32, nn.SpatialConvolutionFFT.memoryReuseNone}, + + {64, 256, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 512, 512, 3, 3, 28, 28, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + + {64, 
512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 512, 512, 3, 3, 14, 14, 1, 1, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local _stressTest = { + {1, 128, 128, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 128, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 512, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 256, 512, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 128, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 256, 512, 3, 3, 8, 8, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 16, 16, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 128, 128, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 256, 512, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 128, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 512, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 128, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 256, 512, 3, 3, 16, 16, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 16, 16, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +-- Investigation says the cost of FFT weights is too high since +-- they are only used once in this case. Good thing is that batch +-- size of 1 should be for inference only and precomputing the FFT +-- of the weights is a viable approach + {1, 128, 128, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {1, 256, 512, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +---------------------------------------------------------------- + {64, 3, 128, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 3, 512, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 128, 128, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, + {64, 256, 512, 3, 3, 32, 32, 0, 0, + nil, nil, nn.SpatialConvolutionFFT.memoryReuseNone}, +} + +local problemsToRun = _stressTest +local num_random_configurations = 25 + +--[[ +-- Convenient override of the default that are used for unit tests +problemsToRun = _problemSizesAlexNet +problemsToRun = _problemSizesICLR2015 +problemsToRun = _problemSizesVGG +printMemory = true +timeResults = true +num_random_configurations = 0 +--]] + +local testCuFFT = false +local testFBFFT = true +local testFBFFTGemm = true function test.test() - for i = 1, #problemSizes do - testLoop(problemSizes[i]) - end - -- random configuration - for i = 1, num_random_configurations do - testLoop({}) - end + for i = 1, #problemsToRun do + if testFBFFT then + local net = testLoop(problemsToRun[i], 'fbfft') + reportAndFree(net) + end + if testFBFFTGemm then + local net = testLoop(problemsToRun[i], 'fbfftgemm') + reportAndFree(net) + end + if testCuFFT then + local net = testLoop(problemsToRun[i], 'cufft') + reportAndFree(net) + end + end + + for size in pairs({'big', 'small'}) do + if size == 'big' then + maxInputSize = 32 - maxKernelSize + else + maxInputSize = 128 - maxKernelSize + end + -- random configuration + for i = 1, num_random_configurations do + if testFBFFT then + local net = testLoop({}, 'fbfft') + reportAndFree(net) + end + if 
testFBFFTGemm then + local net = testLoop({}, 'fbfftgemm') + reportAndFree(net) + end + if testCuFFT then + local net = testLoop({}, 'cufft') + reportAndFree(net) + end + end + end end mytester:add(test) diff --git a/test/test_FFTModule.lua b/test/test_FFTModule.lua index d052473..e5de3cb 100644 --- a/test/test_FFTModule.lua +++ b/test/test_FFTModule.lua @@ -10,11 +10,11 @@ local mytester = torch.Tester() local FFTTester = {} local printResults = false -local precision = 2e-7 +local precision = 2 * 2e-7 -- 2 ULPs relative to the largest input -- We exploit hermitian symmetry to write out only 1/2 the data. -- CuFFT exploits hermitian symmetry along the innermost dimension --- FBFFT is parameteriazble only determined by the output tensor dimesions. +-- FBFFT is parametriazble only determined by the output tensor dimensions. -- Ideally we would use outermost dimension hermitian symmetry for better -- coalescing but if we check correctness vs CuFFT then we match it. local runTests = true @@ -31,55 +31,47 @@ local _iclr2015TestCases = { {1, {4, 32}}, {1, {4, 64}}, {1, {4, 128}}, - {1, {4, 256}}, {1, {32, 8}}, {1, {32, 16}}, {1, {32, 32}}, {1, {32, 64}}, {1, {32, 128}}, - {1, {32, 256}}, {1, {128, 8}}, {1, {128, 16}}, {1, {128, 32}}, {1, {128, 64}}, {1, {128, 128}}, - {1, {128, 256}}, {1, {1024, 8}}, {1, {1024, 16}}, {1, {1024, 32}}, {1, {1024, 64}}, {1, {1024, 128}}, - {1, {1024, 256}}, {1, {4096, 8}}, {1, {4096, 16}}, {1, {4096, 32}}, {1, {4096, 64}}, {1, {4096, 128}}, - {1, {4096, 256}}, {1, {128 * 128, 8}}, {1, {128 * 128, 16}}, {1, {128 * 128, 32}}, {1, {128 * 128, 64}}, {1, {128 * 128, 128}}, - {1, {128 * 128, 256}}, {1, {256 * 256, 8}}, {1, {256 * 256, 16}}, {1, {256 * 256, 32}}, {1, {256 * 256, 64}}, {1, {256 * 256, 128}}, - {1, {256 * 256, 256}}, {2, {4, 8, 8}}, {2, {4, 16, 16}}, {2, {4, 32, 32}}, {2, {4, 64, 64}}, - {2, {4, 128, 128}}, {2, {32, 8, 8}}, {2, {32, 16, 16}}, @@ -105,19 +97,23 @@ local _iclr2015TestCases = { {2, {4096, 64, 64}}, {2, {4096, 128, 128}}, + {2, {1, 8, 8}}, + {2, {1, 16, 16}}, + {2, {1, 32, 32}}, + {2, {1, 64, 64}}, + {2, {128 * 128, 8, 8}}, {2, {128 * 128, 16, 16}}, {2, {128 * 128, 32, 32}}, {2, {128 * 128, 64, 64}}, {2, {128 * 128, 128, 128}}, ---[[ {2, {256 * 256, 8, 8}}, {2, {256 * 256, 16, 16}}, {2, {256 * 256, 32, 32}}, {2, {256 * 256, 64, 64}}, - {2, {256 * 256, 128, 128}}, ---]] +-- Too much memory +-- {2, {256 * 256, 128, 128}}, } local _stressTestCases = { @@ -128,7 +124,6 @@ local _stressTestCases = { {1, {32 * 32, 32}}, {1, {32 * 32, 64}}, {1, {32 * 32, 128}}, - {1, {32 * 32, 256}}, {2, {32 * 32, 2, 2}}, {2, {32 * 32, 4, 4}}, @@ -145,7 +140,6 @@ local _stressTestCases = { {1, {64 * 64, 32}}, {1, {64 * 64, 64}}, {1, {64 * 64, 128}}, - {1, {64 * 64, 256}}, {2, {64 * 64, 2, 2}}, {2, {64 * 64, 4 ,4}}, @@ -162,7 +156,6 @@ local _stressTestCases = { {1, {128 * 128, 32}}, {1, {128 * 128, 64}}, {1, {128 * 128, 128}}, - {1, {128 * 128, 256}}, {2, {128 * 128, 2, 2}}, {2, {128 * 128, 4 ,4}}, @@ -175,6 +168,14 @@ local _stressTestCases = { } local testCases = { + {1, {1, 2}}, + {1, {1, 4}}, + {1, {1, 8}}, + {1, {1, 16}}, + {1, {1, 32}}, + {1, {1, 64}}, + {1, {1, 128}}, + {1, {127, 2}}, {1, {127, 4}}, {1, {127, 8}}, @@ -182,7 +183,6 @@ local testCases = { {1, {127, 32}}, {1, {127, 64}}, {1, {127, 128}}, - {1, {127, 256}}, {1, {437, 2}}, {1, {437, 4}}, @@ -191,7 +191,14 @@ local testCases = { {1, {437, 32}}, {1, {437, 64}}, {1, {437, 128}}, - {1, {437, 256}}, + + {2, {1, 2, 2}}, + {2, {1, 4 ,4}}, + {2, {1, 8, 8}}, + {2, {1, 16, 16}}, + {2, {1, 32, 32}}, + {2, {1, 
64, 64}}, + {2, {1, 128, 128}}, {2, {9, 2, 2}}, {2, {9, 4 ,4}}, @@ -269,30 +276,28 @@ local function benchmarkCuFFT(problemSize, timeCuda) local freqSize = {} for i = 1, #timeSize do if i == #timeSize then - freqSize = concat(freqSize, {math.floor(timeSize[i] / 2) + 1}) + table.insert(freqSize, math.floor(timeSize[i] / 2) + 1) else - freqSize = concat(freqSize, {timeSize[i]}) + table.insert(freqSize, timeSize[i]) end end - freqSize = concat(freqSize, {2}) - + table.insert(freqSize, 2) local timeInvCuda = timeCuda:clone() local frequencyCuda = - torch.CudaTensor(torch.LongStorage(freqSize)):fill(-47.0) + torch.CudaTensor(torch.LongStorage(freqSize)):fill(0 / 0) local batchDims = #timeSize - fftDim - local net = nn.FFTWrapper(1) + local net = nn.FFTWrapper("cufft", 0, 0, "timed") -- no padding net:fft(timeCuda, frequencyCuda, batchDims) net:ffti(timeInvCuda, frequencyCuda, batchDims) local timeInv = timeInvCuda:double() local frequency = frequencyCuda:double() if printResults then - print('forward re:', frequency:select(fftDim + 2, 1)) - print('forward im:', frequency:select(fftDim + 2, 2)) - - print('inverse re:', timeInv) + print('cufft forward re:', frequency:select(fftDim + 2, 1)) + print('cufft forward im:', frequency:select(fftDim + 2, 2)) + print('cufft inverse re:', timeInv) end timeInvCuda = {} @@ -302,6 +307,11 @@ local function benchmarkCuFFT(problemSize, timeCuda) return frequency, timeInv end +local function transposedLayout(fftDim, fftSize) + if fftDim == 2 and (fftSize < 8 or fftSize > 32) then return true end + return false +end + local function benchmarkFBFFT(problemSize, timeCuda, frequency2) local fftDim = problemSize[1] local timeSize = problemSize[2] @@ -310,32 +320,31 @@ local function benchmarkFBFFT(problemSize, timeCuda, frequency2) for i = 1, #timeSize do if i == hermitianDim then - freqSize = concat(freqSize, {math.floor(timeSize[i] / 2) + 1}) + table.insert(freqSize, math.floor(timeSize[i] / 2) + 1) else - freqSize = concat(freqSize, {timeSize[i]}) + table.insert(freqSize, timeSize[i]) end end - freqSize = concat(freqSize, {2}) + table.insert(freqSize, 2) local timeInvCuda = timeCuda:clone() local frequencyCuda = - torch.CudaTensor(torch.LongStorage(freqSize)):fill(-47.0) + torch.CudaTensor(torch.LongStorage(freqSize)):fill(0 / 0) local batchDims = #timeSize - fftDim - local net = nn.FFTWrapper(0) + local net = nn.FFTWrapper("fbfft", 0, 0, "timed") -- no padding net:fft(timeCuda, frequencyCuda, batchDims) net:ffti(timeInvCuda, frequencyCuda, batchDims) local timeInv = timeInvCuda:double() local frequency = frequencyCuda:double() - if fftDim == 2 then + if transposedLayout(fftDim, timeSize[hermitianDim]) then frequency = frequency:transpose(2, 3) end if printResults then - print('forward re:', frequency:select(fftDim + 2, 1)) - print('forward im:', frequency:select(fftDim + 2, 2)) - - print('inverse re:', timeInv) + print('fbfft forward re:', frequency:select(fftDim + 2, 1)) + print('fbfft forward im:', frequency:select(fftDim + 2, 2)) + print('fb inverse re:', timeInv) end timeInvCuda = {} @@ -376,8 +385,25 @@ local function initCuda(ps, localInit) res = res + 1 end return res - end):cuda() + end):cuda() elseif localInit == 5 then + local val = 0 + local res = 0 + timeCudaTensor = torch.Tensor( + torch.LongStorage(ps)):apply(function() + val = val + 1 + if val == ps[#ps] + 1 then + val = 1 + res = res + 1 + end + return res + end) + if #timeCudaTensor:size() == 3 then + timeCudaTensor = timeCudaTensor:transpose(2,3):contiguous():cuda() + else + timeCudaTensor = 
timeCudaTensor:cuda() + end + elseif localInit == 6 then local val = 0 timeCudaTensor = torch.Tensor( torch.LongStorage(ps)):apply(function() @@ -404,33 +430,190 @@ local function run(localInit, problemSizes) print(timeCudaTensor:float()) end - local function assertdiff(reffft, fbfft, fftDim, fftSize) - if ps[1] > 512 then - print('Skip horrendously long test, need to transpose', - ' the data efficiently to test') - return - end - local m = (reffft:double() - fbfft:double()):abs():max() - local n = reffft:double():norm() + 1e-10 - local nfbfft = fbfft:double():norm() + 1e-10 - if m / n > precision then - if printResults then - print('Check max diff, norm, norm fbfft, max normalized = ', - m, n, nfbfft, m / n) - print('FAILS CHECK !!') - print(m, n, m / n) - end - end - assert(m / n < precision) - return + local function checkEqual(a, b, complexCheck) + if printResults then + print('Top left block equality\n', a, b) + end + local a = a:double():abs() + 1e-10 + local b = b:double():abs() + 1e-10 + local delta = (a:double() - b:double()):abs() + local max = a:max() + 1e-20 + local deltaNorm = delta:div(max) + + if printResults and deltaNorm:max() > precision then + print('Check max delta, norm ref fft, max normalized, prec = ', + delta:max(), b:norm(), deltaNorm:max(), precision) + print('RE:\n', + a:select(#a:size(), 1, 1), + b:select(#b:size(), 1, 1)) + print('IM:\n', + a:select(#a:size(), 2, 1), + b:select(#b:size(), 2, 1)) + end + if deltaNorm:max() > precision then + print('Error Delta RE', delta:select(#delta:size(), 1, 1)) + print('Error Delta IM', delta:select(#delta:size(), 2, 1)) + end + assert(deltaNorm:max() <= precision, + deltaNorm:max() .. ' > ' .. precision) + end + + local function checkOrthogonalSymmetry(r, fftSize) + if printResults then + print('Row orthogonal symmetry\n', r) + end + + local max = r:clone():abs():max() + 1e-20 + for k = 1, fftSize / 2 - 1 do + local d1 = r[fftSize / 2 + 1 - k][1] - r[fftSize / 2 + 1 + k][1] + assert( + math.abs(d1) / max < precision, + d1 .. ' ' .. math.abs(d1) / max .. ' ' .. precision + ) + local d2 = r[fftSize / 2 + 1 - k][2] + r[fftSize / 2 + 1 + k][2] + assert( + math.abs(d2) / max < precision, + d2 .. ' ' .. math.abs(d2) / max .. ' ' .. precision + ) + end end + local + function checkCentralSymmetry(cuFFT, fbFFT, fftSize, imaginaryPart) + if printResults then + print('Remaining block central symmetry', cuFFT, fbFFT) + end + assert(cuFFT:size(1) == fbFFT:size(1)) + assert(cuFFT:size(2) == fbFFT:size(2)) + assert(cuFFT:size(3) == fbFFT:size(3)) + assert(cuFFT:size(2) == cuFFT:size(3)) + + local max = cuFFT:clone():abs():max() + 1e-20 + for i = 1, cuFFT:size(1) do + for j = 1, cuFFT:size(2) do + for k = 1, cuFFT:size(2) do + local fbFFTVal = fbFFT + [i][1 + cuFFT:size(2) - j][1 + cuFFT:size(2) - k] + + local d1 = cuFFT[i][j][k] - fbFFTVal + if imaginaryPart then + d1 = cuFFT[i][j][k] + fbFFTVal + end + + if math.abs(d1) / max > precision then + print('Error Delta\n', d1, ' @ ', i, j, k) + print(cuFFT[i][j][k], + ' vs ', + fbFFTVal) + end + assert( + math.abs(d1) / max < precision, + d1 .. ' ' .. math.abs(d1) / max .. ' ' .. 
precision + ) + end + end + end + end + + local function assertdiffHermitian( + reffft, fbfft, fftDim, fftSize, complexCheck) + if ps[1] > 512 then + print('Skip long test based on lua side loops') + return + end + + if fftDim == 1 or (fftDim == 2 and not complexCheck) then + -- Just check tensor relative equality modulo precision + checkEqual(reffft:clone(), fbfft:clone()) + else + assert(complexCheck) + assert(fftDim == 2) + -- Hermitian check is comprised of 4 checks, one is fbfft vs + -- cufft, the others are symmetry checks + checkEqual( + fbfft:narrow( + 2, 1, fftSize / 2 + 1 + ):narrow(3, 1, fftSize / 2 + 1):clone(), + reffft:narrow( + 2, 1, fftSize / 2 + 1 + ):narrow(3, 1, fftSize / 2 + 1):clone() + ) + + -- Orthogonal symmetry for first and middle rows along vertical + -- plane FFTSize / 2 = 1 + for i = 1, reffft:size(1) do + for k = 1, fftSize / 2 + 1 do + checkOrthogonalSymmetry(fbfft[i][1]:clone(), fftSize) + checkOrthogonalSymmetry( + fbfft[i][fftSize / 2 + 1]:clone(), fftSize) + end + end + + if fftSize > 2 then + -- Central symmetry for: + -- [1, FFTSize / 2) x [FFTSize / 2 + 1, FFTSize) and + -- [FFTSize / 2 + 1, FFTSize) x [FFTSize / 2 + 1, FFTSize) + local f = fbfft:narrow( + 2, 2, (fftSize / 2 - 1) + ):narrow( + 3, fftSize / 2 + 1 + 1, (fftSize / 2 - 1) + ):clone() + local c = reffft:narrow( + 3, 2, (fftSize / 2 - 1) + ):narrow( + 2, fftSize / 2 + 1 + 1, (fftSize / 2 - 1) + ):clone() + checkCentralSymmetry( + c:select(4, 1), f:select(4, 1), fftSize) + checkCentralSymmetry( + c:select(4, 2), f:select(4, 2), fftSize, true) + end + end + return + end + + local function assertdiffTransposed(reffft, fbfft, fftDim, fftSize) + if ps[1] > 512 then + print('Skip horrendously long test, need to transpose', + ' the data efficiently to test') + return + end + local m = (reffft:double() - fbfft:double()):abs():max() + local n = reffft:double():norm() + 1e-10 + local nfbfft = fbfft:double():norm() + 1e-10 + if m / n > precision then + print('Check max diff, norm, norm fbfft, max normalized = ', + m, n, nfbfft, m / n) + print('FAILS CHECK !!') + print(m, n, m / n) + if fftDim == 2 and #reffft:size() == 4 then + print('DIFFTENSOR REAL!\n') + print(reffft:add(-fbfft):float():select(fftDim + 2, 1)) + print('DIFFTENSOR IM!\n') + print(reffft:add(-fbfft):float():select(fftDim + 2, 2)) + else + print(reffft, fbfft) + print('DIFFTENSOR REAL!\n') + print(reffft:add(-fbfft):float()) + end + end + assert(m / n < precision) + return + end + + local cufft, cuifft = benchmarkCuFFT(problemSizes[i], timeCudaTensor) local fbfft, fbifft = benchmarkFBFFT(problemSizes[i], timeCudaTensor, matchCuFFTAlloc) - local cufft, cuifft = benchmarkCuFFT(problemSizes[i], timeCudaTensor) + + local fftSize = ps[2] if runTests then - assertdiff(cufft, fbfft, fftDim, ps[2]) - assertdiff(cuifft, fbifft, fftDim, ps[2]) + if not transposedLayout(fftDim, fftSize) then + assertdiffHermitian(cufft, fbfft, fftDim, fftSize, true) + assertdiffHermitian(cuifft, fbifft, fftDim, fftSize, false) + else + assertdiffTransposed(cufft, fbfft, fftDim, fftSize) + assertdiffTransposed(cuifft, fbifft, fftDim, fftSize) + end end timeCudaTensor = {} @@ -439,20 +622,30 @@ local function run(localInit, problemSizes) end end +printResults = false +local localInits = {7} -- only run on random inputs to cut down testing time +local runCases = testCases + +--[[ +-- Convenient override of the default that are used for unit tests +localInits = {1} +runCases = _iclr2015TestCases +--]] + function FFTTester.test() -- Type of initialization: -- 1: 
fill(1.0f) -- 2: 1.0f if 0 mod 2 else 2.0f -- 3: val % 4 + 1 -- 4: val == row --- 5: starts at 1.0f and += 1.0f at each entry +-- 5: val == col +-- 6: starts at 1.0f and += 1.0f at each entry -- else: random - local localInits = {1, 2, 3, 4, 5, 6} - for i = 1, #localInits do - run(localInits[i], testCases) - collectgarbage() - cutorch.synchronize() - end + for i = 1, #localInits do + run(localInits[i], runCases) + collectgarbage() + cutorch.synchronize() + end end mytester:add(FFTTester) diff --git a/test/test_SequentialCriterion.lua b/test/test_SequentialCriterion.lua index fae3a01..e831dc0 100644 --- a/test/test_SequentialCriterion.lua +++ b/test/test_SequentialCriterion.lua @@ -77,7 +77,8 @@ function testSequentialCriterion() local n_classes = torch.random(200) local module = nn.Linear(input_size, n_classes) local crit = nn.ClassNLLCriterion() - testSequentialCriterion_run(input_size, n_classes, module, crit) + testSequentialCriterion_run(input_size, n_classes, module, + crit, 'torch.LongTensor') -- try with HSM local input1_size = torch.random(200) diff --git a/test/test_SparseNLLCriterion.lua b/test/test_SparseNLLCriterion.lua index b9cfffa..576c201 100644 --- a/test/test_SparseNLLCriterion.lua +++ b/test/test_SparseNLLCriterion.lua @@ -79,9 +79,9 @@ end function testSparseNLLCriterion() for k = 1, test_repeats do - local n_classes = torch.random(1000) + local n_classes = torch.random(100) local K = torch.random(n_classes) - local batch_size = torch.random(100) + local batch_size = torch.random(32) local err1, err2 = test_sparseNLL(K, n_classes, batch_size, false) assertTrue(err1 < 1e-3) assertTrue(err2 < 1e-3) diff --git a/test/test_SpatialConvolutionTuned.lua b/test/test_SpatialConvolutionTuned.lua new file mode 100644 index 0000000..959501e --- /dev/null +++ b/test/test_SpatialConvolutionTuned.lua @@ -0,0 +1,209 @@ +require 'cunn' +require 'fbcunn' +require 'math' + +require 'fb.luaunit' +require('fbtorch') +g_mytester = torch.Tester() +local fb_test = {} + +local silence = true +local printMemory = false +local inferenceOnly = false + +local function reportAndFree(net) + if printMemory and not silence then + local free, total = cutorch.getMemoryUsage() + print("Pre Collect Memory: " , free , " free " , total , " total") + end + -- release entries from the global buffer table + if net and net.cleanupBuffers then + net:cleanupBuffers() + net = nil + end + collectgarbage() + collectgarbage() + if printMemory and not silence then + local free, total = cutorch.getMemoryUsage() + print("Post Collect Memory: " , free , " free " , total , " total") + end +end + +local function testSpatialConvolutionTuned(problem, FFTConvolutionClass) + local batches = problem[1] + local inputPlanes = problem[2] + local outputPlanes = problem[3] + local iH = problem[4] + local iW = problem[5] + local kH = problem[6] + local kW = problem[7] + local padH = problem[8] + local padW = problem[9] + + if not silence then + print('Running ', batches, inputPlanes, outputPlanes, + " kH = ", kH, " x ", "kW = ", kW, + " x ", "iH = ", iH, " x ", "iW = ", iW, + " x ", "padH = ", padH, " x ", padW) + end + + -- All the necessary checks are already performed while searching + -- for the best convolution + local netForward = fbnn.SpatialConvolution( + inputPlanes, + outputPlanes, + kW, + kH, + 1, + 1, + padW, + padH, + nil, -- no memory limit + inferenceOnly -- not just inference + ) + if not silence then + netForward.reportLevel = 2 + end + + local ps = {batches, inputPlanes, iH, iW} + local input = 
torch.Tensor(torch.LongStorage(ps)):cuda() + local ps = {batches, + outputPlanes, + iH - kH + 2 * padH + 1, + iW - kW + 2 * padW + 1} + local gradOutput = torch.Tensor(torch.LongStorage(ps)):cuda() + local scale = torch.random(100) / 100.0 + netForward:updateOutput(input) + if not inferenceOnly then + netForward:updateGradInput(input, gradOutput) + netForward:accGradParameters(input, gradOutput, scale) + end + + return netForward +end + + +local problemsToRun = { + -- batch, input, output, iH, iW, kH, kW, padH, padW + {1, 1, 1, 4, 4, 3, 3, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 0, 0}, + {1, 1, 1, 1, 2, 1, 2, 0, 0}, + {1, 1, 1, 1, 3, 1, 3, 0, 0}, + {1, 1, 1, 6, 6, 4, 4, 0, 0}, + {1, 1, 1, 11, 11, 8, 8, 0, 0}, + {2, 1, 3, 1, 1, 1, 1, 0, 0}, + {2, 3, 1, 1, 1, 1, 1, 0, 0}, + {2, 3, 4, 5, 5, 5, 5, 0, 0}, + {1, 1, 1, 3, 3, 3, 3, 0, 0}, + {1, 1, 1, 2, 2, 2, 2, 0, 0}, + {1, 1, 1, 1, 2, 1, 2, 0, 0}, + {1, 1, 1, 2, 3, 2, 3, 0, 0}, + {2, 3, 4, 5, 5, 5, 5, 0, 0}, + {128, 64, 64, 1, 1, 1, 1, 0, 0}, + {128, 64, 100, 1, 1, 1, 1, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {128, 64, 64, 3, 3, 3, 3, 0, 0}, + {1, 1, 1, 20, 17, 13, 14, 0, 0}, + -- Cannot put in unit tests due to 5GB memory limit + -- {128, 128, 128, 128, 128, 3, 3, 0, 0}, -- falls back to cudnn + {1, 1, 1, 27, 27, 5, 5, 0, 0}, + {1, 1, 1, 27, 27, 5, 5, 1, 0}, + {1, 1, 1, 27, 27, 5, 5, 0, 1}, + {1, 1, 1, 27, 27, 5, 5, 1, 2}, + {1, 1, 1, 27, 27, 5, 5, 2, 1}, + {1, 1, 1, 27, 27, 5, 5, 2, 2}, + {1, 1, 1, 19, 23, 3, 4, 0, 0}, + {1, 1, 1, 19, 23, 3, 4, 1, 0}, + {1, 1, 1, 19, 23, 3, 4, 0, 1}, + {1, 1, 1, 19, 23, 3, 4, 1, 2}, + {1, 1, 1, 19, 23, 3, 4, 2, 1}, + {1, 1, 1, 19, 23, 3, 4, 2, 2}, + + {1, 1, 1, 128, 128, 3, 3, 0, 0}, +} + +local _expensiveProblemsToRun = { + {1, 512, 768, 16, 16, 14, 14, 0, 0}, + {2, 512, 768, 16, 16, 14, 14, 0, 0}, + {8, 512, 768, 16, 16, 14, 14, 0, 0}, + {1, 512, 768, 24, 24, 14, 14, 0, 0}, + {2, 512, 768, 24, 24, 14, 14, 0, 0}, + {8, 512, 768, 24, 24, 14, 14, 0, 0}, + {1, 512, 768, 72, 72, 14, 14, 0, 0}, + {2, 512, 768, 72, 72, 14, 14, 0, 0}, + {8, 512, 768, 72, 72, 14, 14, 0, 0}, +} + +local _benchmark3x3 = { + {64, 3, 64, 224, 224, 3, 3, 1, 1}, + {32, 32, 32, 30, 30, 3, 3, 0, 0}, + {64, 64, 64, 30, 30, 3, 3, 0, 0}, + {128, 128, 128, 30, 30, 3, 3, 0, 0}, + {32, 32, 32, 27, 27, 3, 3, 1, 1}, + {64, 64, 64, 27, 27, 3, 3, 1, 1}, + {128, 128, 128, 27, 27, 3, 3, 1, 1}, + {32, 32, 32, 14, 14, 3, 3, 0, 0}, + {64, 64, 64, 14, 14, 3, 3, 0, 0}, + {128, 128, 128, 14, 14, 3, 3, 0, 0}, + {32, 32, 32, 12, 12, 3, 3, 1, 1}, + {64, 64, 64, 12, 12, 3, 3, 1, 1}, + {128, 128, 128, 12, 12, 3, 3, 1, 1}, + {64, 128, 128, 14, 14, 3, 3, 1, 1}, + {64, 256, 256, 14, 14, 3, 3, 1, 1}, + {64, 512, 512, 14, 14, 3, 3, 1, 1}, +} + +-- These should correspond with Soumith's benchmarks +-- https://raw.githubusercontent.com/soumith/convnet-benchmarks/master/torch7/imagenet_winners/output_raw.log +local _benchmarkAlexNet = { + -- 1 GPU + {128, 64, 192, 27, 27, 5, 5, 2, 2}, + {128, 192, 384, 13, 13, 3, 3, 1, 1}, + {128, 384, 256, 13, 13, 3, 3, 1, 1}, + {128, 256, 256, 13, 13, 3, 3, 1, 1}, +} + +local _benchmarkOverFeat = { + -- 1 GPU + {128, 96, 256, 24, 24, 5, 5, 2, 2}, + {128, 256, 512, 12, 12, 3, 3, 1, 1}, + {128, 512, 1024, 12, 12, 3, 3, 1, 1}, + {128, 1024, 1024, 12, 12, 3, 3, 1, 1}, +} + +local _benchmarkVGG = { + -- 1 GPU + {64, 3, 64, 224, 224, 3, 3, 1, 1}, + {64, 64, 128, 112, 112, 3, 3, 1, 1}, + {64, 128, 256, 56, 56, 3, 3, 1, 1}, + {64, 256, 256, 56, 
56, 3, 3, 1, 1}, + {64, 256, 512, 28, 28, 3, 3, 1, 1}, + {64, 512, 512, 28, 28, 3, 3, 1, 1}, + {64, 512, 512, 14, 14, 3, 3, 1, 1}, + {64, 512, 512, 14, 14, 3, 3, 1, 1}, +} + +--[[ + Uncomment this for expensive problems + problemsToRun = _expensiveProblemsToRun + problemsToRun = _benchmarkAlexNet + problemsToRun = _benchmarkOverFeat + problemsToRun = _benchmarkVGG + problemsToRun = _benchmark3x3 + inferenceOnly = true +--]] + +function fb_test.testSpatialConvolutionTuned() + for i = 1, #problemsToRun do + local net = + testSpatialConvolutionTuned(problemsToRun[i]) + reportAndFree(net) + end +end + +g_mytester = torch.Tester() +g_mytester:add(fb_test) +g_mytester:run() diff --git a/test/test_TemporalKMaxPooling.lua b/test/test_TemporalKMaxPooling.lua index 33c9921..395ba3d 100644 --- a/test/test_TemporalKMaxPooling.lua +++ b/test/test_TemporalKMaxPooling.lua @@ -166,19 +166,5 @@ function TemporalKMaxPoolingTest.sequential() assert (gradInput_matches:sum() == gradInput_matches:numel()) end -function TemporalKMaxPoolingTest.dynamic() - local kmax = nn.TemporalKMaxPooling(2, 0.5) - local seq = nn.Sequential() - seq:add(nn.TemporalKMaxPooling(2, 0.5)) - - for n=12,13 do - local input = torch.randn(n, 1):cuda() - local kmax_output = kmax:updateOutput(input) - local seq_output = seq:updateOutput(input) - assert (kmax_output:size(1) == 6) - assert (torch.all(kmax_output:eq(seq_output))) - end -end - tester:add(TemporalKMaxPoolingTest) tester:run() diff --git a/test/test_WeightedLookupTable.lua b/test/test_WeightedLookupTable.lua index 06ee917..0bd1cb3 100644 --- a/test/test_WeightedLookupTable.lua +++ b/test/test_WeightedLookupTable.lua @@ -26,8 +26,8 @@ function test_WeightedLookupTable_forward() local input_length = 9 local tol = 1e-8 - local wlut = nn.WeightedLookupTable(table_size, embedding_dim) - local ulut = nn.LookupTable(table_size, embedding_dim) + local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() + local ulut = nn.LookupTable(table_size, embedding_dim):cuda() ulut.weight:copy(wlut.weight) assert(all(torch.eq(wlut.weight, ulut.weight))) @@ -35,22 +35,22 @@ function test_WeightedLookupTable_forward() local weights = torch.rand(input_length, 1) local winput = torch.cat(uinput, weights, 2) - local woutput = wlut:forward(winput) - local uoutput = ulut:forward(uinput) - + local woutput = wlut:forward(winput:cuda()) + local uoutput = ulut:forward(uinput:cuda()) + weights = weights:cuda() local expected_woutput = torch.cmul(uoutput, weights:expandAs(uoutput)) - assert(all(almost_equal(woutput, expected_woutput, tol))) + assert(all(almost_equal(woutput:float(), expected_woutput:float(), tol))) end function test_WeightedLookupTable_accGradParameters() local embedding_dim = 4 local table_size = 30 local input_length = 9 - local tol = 1e-8 + local tol = 1e-5 - local wlut = nn.WeightedLookupTable(table_size, embedding_dim) - local ulut = nn.LookupTable(table_size, embedding_dim) + local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() + local ulut = nn.LookupTable(table_size, embedding_dim):cuda() ulut.weight:copy(wlut.weight) assert(all(torch.eq(wlut.weight, ulut.weight))) @@ -58,16 +58,18 @@ function test_WeightedLookupTable_accGradParameters() local weights = torch.range(1, input_length):reshape(input_length, 1) local winput = torch.cat(uinput, weights, 2) + winput = winput:cuda() + uinput = uinput:cuda() local woutput = wlut:forward(winput) local uoutput = ulut:forward(uinput) local wgradOutput = torch.randn(woutput:size()) local ugradOutput = 
torch.cmul(wgradOutput, weights:expandAs(wgradOutput)) - wlut:accGradParameters(winput, wgradOutput, 1) - ulut:accGradParameters(uinput, ugradOutput, 1) + wlut:accGradParameters(winput, wgradOutput:cuda(), 1) + ulut:accGradParameters(uinput, ugradOutput:cuda(), 1) - assert(all(almost_equal(wlut.gradWeight, ulut.gradWeight, tol))) + assert(all(almost_equal(wlut.gradWeight:float(), ulut.gradWeight:float(), tol))) end
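
The WeightedLookupTable changes above move the equivalence test onto the GPU: a weighted lookup over rows of the form [index, weight] should match an ordinary nn.LookupTable forward scaled row-wise by the weights, and its gradWeight should match the plain table driven by a weight-scaled gradOutput. Below is a minimal sketch of the forward-path invariant following that pattern; the index construction and the 1e-5 tolerance are assumptions rather than values taken from the test.

require 'cunn'
require 'fbcunn' -- provides nn.WeightedLookupTable

local table_size, embedding_dim, n = 30, 4, 9
local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda()
local lut = nn.LookupTable(table_size, embedding_dim):cuda()
lut.weight:copy(wlut.weight)

-- column 1 holds 1-based indices, column 2 holds per-row scaling weights
local idx = (torch.rand(n) * table_size):floor():add(1)
local weights = torch.rand(n, 1)
local winput = torch.cat(idx:view(n, 1), weights, 2):cuda()

local wout = wlut:forward(winput)                    -- (n, embedding_dim)
local uout = lut:forward(idx:cuda())                 -- (n, embedding_dim)
local expected = torch.cmul(uout, weights:cuda():expandAs(uout))
assert((wout:float() - expected:float()):abs():max() < 1e-5)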